From cf2abc8e08dcb9427feadc809a5a148d77b40418 Mon Sep 17 00:00:00 2001 From: fredster33 <64927044+fredster33@users.noreply.github.com> Date: Wed, 10 Mar 2021 15:46:04 -0800 Subject: [PATCH 01/84] Grammar fix (#7622) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ac4ed62524b13..eec5bfd5797d0 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ License Contribute to TVM ----------------- TVM adopts apache committer model, we aim to create an open source project that is maintained and owned by the community. -Checkout the [Contributor Guide](https://tvm.apache.org/docs/contribute/) +Check out the [Contributor Guide](https://tvm.apache.org/docs/contribute/). Acknowledgement --------------- From dd61b049b57c70a7d1a824420855436850d6cb2d Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Wed, 10 Mar 2021 18:51:54 -0800 Subject: [PATCH 02/84] [TIR] Add PreOrderVisit and VisitPrimFuncs (#7627) * [TIR] Add PreOrderVisit and VisitPrimFuncs * Update stmt_functor.h * address comments * fix lint --- include/tvm/runtime/object.h | 20 ++++----- include/tvm/tir/analysis.h | 16 +++++++ include/tvm/tir/stmt_functor.h | 9 ++++ include/tvm/topi/detail/constant_utils.h | 11 +++-- src/tir/ir/stmt_functor.cc | 54 +++++++++++++++++++++--- tests/cpp/ir_functor_test.cc | 53 +++++++++++++++++++++++ 6 files changed, 142 insertions(+), 21 deletions(-) diff --git a/include/tvm/runtime/object.h b/include/tvm/runtime/object.h index 70ab7688c450a..47788394126ed 100644 --- a/include/tvm/runtime/object.h +++ b/include/tvm/runtime/object.h @@ -186,7 +186,7 @@ class TVM_DLL Object { template inline bool IsInstance() const; /*! - * \return Weather the cell has only one reference + * \return Whether the cell has only one reference * \note We use stl style naming to be consistent with known API in shared_ptr. */ inline bool unique() const; @@ -337,7 +337,7 @@ inline RelayRefType GetRef(const ObjectType* ptr); /*! * \brief Downcast a base reference type to a more specific type. * - * \param ref The inptut reference + * \param ref The input reference * \return The corresponding SubRef. * \tparam SubRef The target specific reference type. * \tparam BaseRef the current reference type. @@ -416,7 +416,7 @@ class ObjectPtr { return *get(); } /*! - * \brief copy assignmemt + * \brief copy assignment * \param other The value to be assigned. * \return reference to self. */ @@ -427,7 +427,7 @@ class ObjectPtr { return *this; } /*! - * \brief move assignmemt + * \brief move assignment * \param other The value to be assigned. * \return reference to self. */ @@ -632,7 +632,7 @@ struct ObjectPtrEqual { }; /*! - * \brief helper macro to declare a base object type that can be inheritated. + * \brief helper macro to declare a base object type that can be inherited. * \param TypeName The name of the current type. * \param ParentType The name of the ParentType */ @@ -648,10 +648,10 @@ struct ObjectPtrEqual { return _GetOrAllocRuntimeTypeIndex(); \ } \ static uint32_t _GetOrAllocRuntimeTypeIndex() { \ - static uint32_t tidx = Object::GetOrAllocRuntimeTypeIndex( \ + static uint32_t tindex = Object::GetOrAllocRuntimeTypeIndex( \ TypeName::_type_key, TypeName::_type_index, ParentType::_GetOrAllocRuntimeTypeIndex(), \ TypeName::_type_child_slots, TypeName::_type_child_slots_can_overflow); \ - return tidx; \ + return tindex; \ } /*! @@ -664,7 +664,7 @@ struct ObjectPtrEqual { static const constexpr int _type_child_slots = 0; \ TVM_DECLARE_BASE_OBJECT_INFO(TypeName, ParentType) -/*! 
\brief helper macro to supress unused warning */ +/*! \brief helper macro to suppress unused warning */ #if defined(__GNUC__) #define TVM_ATTRIBUTE_UNUSED __attribute__((unused)) #else @@ -686,7 +686,7 @@ struct ObjectPtrEqual { TVM_STR_CONCAT(TVM_OBJECT_REG_VAR_DEF, __COUNTER__) = TypeName::_GetOrAllocRuntimeTypeIndex() /* - * \brief Define the default copy/move constructor and assign opeator + * \brief Define the default copy/move constructor and assign operator * \param TypeName The class typename. */ #define TVM_DEFINE_DEFAULT_COPY_MOVE_AND_ASSIGN(TypeName) \ @@ -827,7 +827,7 @@ inline bool Object::IsInstance() const { if (!TargetType::_type_child_slots_can_overflow) return false; // Invariance: parent index is always smaller than the child. if (self->type_index_ < TargetType::RuntimeTypeIndex()) return false; - // The rare slower-path, check type hierachy. + // The rare slower-path, check type hierarchy. return self->DerivedFrom(TargetType::RuntimeTypeIndex()); } } else { diff --git a/include/tvm/tir/analysis.h b/include/tvm/tir/analysis.h index e5b2c2b6957c5..1ad78596586a3 100644 --- a/include/tvm/tir/analysis.h +++ b/include/tvm/tir/analysis.h @@ -56,6 +56,22 @@ struct ExprDeepEqual { TVM_DLL bool operator()(const PrimExpr& lhs, const PrimExpr& rhs) const; }; +/*! + * \brief Visit the PrimFuncs in the IRModule + * \tparam FLambda The type of the PrimFunc visitor + * \param mod The IRModule to be visited + * \param fvisit The visitor to the PrimFuncs in the IRModule + */ +template +inline void VisitPrimFuncs(const IRModule& mod, FLambda fvisit) { + for (const auto& kv : mod->functions) { + const BaseFunc& base_func = kv.second; + if (const auto* prim_func = base_func.as()) { + fvisit(prim_func); + } + } +} + /*! * \brief Find undefined vars in the statement. * \param stmt The function to be checked. diff --git a/include/tvm/tir/stmt_functor.h b/include/tvm/tir/stmt_functor.h index d6303ae266e14..c1c618f0c22f2 100644 --- a/include/tvm/tir/stmt_functor.h +++ b/include/tvm/tir/stmt_functor.h @@ -386,6 +386,15 @@ inline T Substitute(T input, const std::unordered_map& return Substitute(std::move(input), vmap); } +/*! + * \brief Recursively visit the IR in pre DFS order node, apply fvisit. + * If fvisit returns false, it won't visit the children of the node. + * \param stmt_or_expr The ir to be visited. + * \param fvisit The visitor function to be applied. If fvisit returns false, it won't visit the + * children of the node + */ +TVM_DLL void PreOrderVisit(const ObjectRef& stmt_or_expr, + const std::function& fvisit); } // namespace tir } // namespace tvm diff --git a/include/tvm/topi/detail/constant_utils.h b/include/tvm/topi/detail/constant_utils.h index 92ff3a4e38046..95e68f5f6d618 100644 --- a/include/tvm/topi/detail/constant_utils.h +++ b/include/tvm/topi/detail/constant_utils.h @@ -119,12 +119,11 @@ inline std::vector GetConstInt64Values(Array exprs, } /*! 
- * \brief Check weather the two expressions are equal or not, if not simplify the expressions and - * check again \note This is stronger equality check than tvm::tir::Equal - * - * \param lhs First expreesion - * \param rhs Second expreesion - * + * \brief Check whether the two expressions are equal or not, if not simplify the expressions and + * check again + * \note This is stronger equality check than tvm::tir::Equal + * \param lhs First expression + * \param rhs Second expression * \return result True if both expressions are equal, else false */ inline bool EqualCheck(PrimExpr lhs, PrimExpr rhs) { diff --git a/src/tir/ir/stmt_functor.cc b/src/tir/ir/stmt_functor.cc index 639d38db0a813..07574e4fb2f13 100644 --- a/src/tir/ir/stmt_functor.cc +++ b/src/tir/ir/stmt_functor.cc @@ -19,12 +19,14 @@ /*! * \file stmt_functor.cc */ +#include #include +#include #include #include -#include "functor_common.h" +#include "./functor_common.h" namespace tvm { namespace tir { @@ -631,9 +633,9 @@ Stmt IRTransform(Stmt ir_node, const runtime::PackedFunc& f_preorder, return transform(std::move(ir_node)); } -class IRSubstitue : public StmtExprMutator { +class IRSubstitute : public StmtExprMutator { public: - explicit IRSubstitue(std::function(const Var&)> vmap) : vmap_(vmap) {} + explicit IRSubstitute(std::function(const Var&)> vmap) : vmap_(vmap) {} PrimExpr VisitExpr_(const VarNode* op) final { Var var = GetRef(op); @@ -679,11 +681,53 @@ class IRSubstitue : public StmtExprMutator { }; Stmt Substitute(Stmt stmt, std::function(const Var&)> vmap) { - return IRSubstitue(vmap)(std::move(stmt)); + return IRSubstitute(vmap)(std::move(stmt)); } PrimExpr Substitute(PrimExpr expr, std::function(const Var&)> vmap) { - return IRSubstitue(vmap)(std::move(expr)); + return IRSubstitute(vmap)(std::move(expr)); +} + +void PreOrderVisit(const ObjectRef& stmt_or_expr, + const std::function& fvisit) { + class PreOrderVisitor : public StmtExprVisitor { + public: + explicit PreOrderVisitor(const std::function& f) : f_(f) {} + + private: + void VisitExpr(const PrimExpr& expr) final { + const PrimExprNode* p_expr = expr.get(); + if (visited_.count(p_expr) == 0) { + visited_.insert(p_expr); + if (f_(expr)) { + ExprVisitor::VisitExpr(expr); + } + } + } + + void VisitStmt(const Stmt& stmt) final { + const StmtNode* p_stmt = stmt.get(); + if (visited_.count(p_stmt) == 0) { + visited_.insert(p_stmt); + if (f_(stmt)) { + StmtVisitor::VisitStmt(stmt); + } + } + } + + const std::function& f_; + std::unordered_set visited_; + }; + + PreOrderVisitor visitor(fvisit); + if (const auto* stmt = stmt_or_expr.as()) { + visitor(GetRef(stmt)); + } else if (const auto* expr = stmt_or_expr.as()) { + visitor(GetRef(expr)); + } else { + LOG(FATAL) << "InternalError: PreOrderVisit does not accept object with type: " + << stmt_or_expr->GetTypeKey(); + } } TVM_REGISTER_GLOBAL("tir.IRTransform").set_body_typed(IRTransform); diff --git a/tests/cpp/ir_functor_test.cc b/tests/cpp/ir_functor_test.cc index 237dc46b99ca3..1f7d18f747eac 100644 --- a/tests/cpp/ir_functor_test.cc +++ b/tests/cpp/ir_functor_test.cc @@ -19,10 +19,14 @@ #include #include +#include #include +#include +#include #include #include #include +#include #include #include @@ -52,6 +56,55 @@ TEST(IRF, CountVar) { ICHECK_EQ(n_var, 2); } +TEST(IRF, VisitPrimFuncs) { + using namespace tvm; + using namespace tvm::tir; + PrimFunc prim_func(/*params=*/{}, /*body=*/Evaluate(Integer(0))); + relay::Function relay_func(/*params=*/{}, /*body=*/relay::Expr(nullptr), + /*ret_type=*/relay::Type{nullptr}, 
/*ty_params=*/{}); + IRModule mod({ + {GlobalVar("main"), prim_func}, + {GlobalVar("main2"), relay_func}, + }); + int n_visited = 0; + VisitPrimFuncs(mod, [&](const PrimFuncNode* func) { ++n_visited; }); + ASSERT_EQ(n_visited, 1); +} + +TEST(IRF, PreOrderVisit) { + using namespace tvm; + using namespace tvm::tir; + Stmt init = IfThenElse(const_true(), Evaluate(Integer(0)), Evaluate(Integer(0))); + Stmt body = Evaluate(Integer(1)); + Block block(/*iter_vars=*/{}, /*reads=*/{}, + /*writes=*/{}, /*name_hint=*/"block", /*body=*/body, + /*init=*/init); + bool init_visited = false; + bool stopped_at_if = true; + bool body_visited = false; + PreOrderVisit(block, [&](const ObjectRef& n) -> bool { + if (n->IsInstance()) { + init_visited = true; + return false; + } + if (const auto* eval = n.as()) { + if (const auto* int_imm = eval->value.as()) { + if (int_imm->value == 0) { + stopped_at_if = false; + } else if (int_imm->value == 1) { + body_visited = true; + } else { + LOG(FATAL) << "Unreachable"; + } + } + } + return true; + }); + ASSERT_EQ(init_visited, true); + ASSERT_EQ(stopped_at_if, true); + ASSERT_EQ(body_visited, true); +} + TEST(IRF, ExprTransform) { using namespace tvm; using namespace tvm::tir; From c5198632cf583b2d6c357b862d210cac7d6df37b Mon Sep 17 00:00:00 2001 From: liyuchao Date: Thu, 11 Mar 2021 17:02:39 +0800 Subject: [PATCH 03/84] [AutoScheduler] Fix incorrectly array context device and hide info at the beginning (#7632) * [AutoScheduler] Fix incorrectly array context device and hide info at the beginning * Lint fix --- python/tvm/auto_scheduler/measure.py | 12 +++++- .../tvm/auto_scheduler/relay_integration.py | 5 +++ .../unittest/test_auto_scheduler_measure.py | 37 ++++++++++++++++++- 3 files changed, 50 insertions(+), 4 deletions(-) diff --git a/python/tvm/auto_scheduler/measure.py b/python/tvm/auto_scheduler/measure.py index 959a9c5da82ac..d02dcff3bba09 100644 --- a/python/tvm/auto_scheduler/measure.py +++ b/python/tvm/auto_scheduler/measure.py @@ -868,7 +868,11 @@ def _timed_eval_func( if arg in tensor_input_map: tensor_name = tensor_input_map[arg] if tensor_name in task_input_names: - args.append(get_task_input_buffer(inp.task.workload_key, tensor_name)) + args.append( + ndarray.array( + get_task_input_buffer(inp.task.workload_key, tensor_name), ctx + ) + ) task_inputs_count += 1 else: raise ValueError( @@ -1079,7 +1083,11 @@ def _timed_rpc_run( if arg in tensor_input_map: tensor_name = tensor_input_map[arg] if tensor_name in task_input_names: - args.append(get_task_input_buffer(inp.task.workload_key, tensor_name)) + args.append( + ndarray.array( + get_task_input_buffer(inp.task.workload_key, tensor_name), ctx + ) + ) task_inputs_count += 1 else: raise ValueError( diff --git a/python/tvm/auto_scheduler/relay_integration.py b/python/tvm/auto_scheduler/relay_integration.py index 68f53125c7aef..6cce30f2f5599 100644 --- a/python/tvm/auto_scheduler/relay_integration.py +++ b/python/tvm/auto_scheduler/relay_integration.py @@ -117,12 +117,17 @@ def extract_tasks( env = TracingEnvironment( TracingMode.EXTRACT_TASK if include_simple_tasks else TracingMode.EXTRACT_COMPLEX_TASK_ONLY ) + + dispatch_ctx = DispatchContext.current + old_verbose = dispatch_ctx.verbose + dispatch_ctx.verbose = 0 with env: # Wrap build call in a new thread to avoid the conflict # between python's multiprocessing and tvm's thread pool build_thread = threading.Thread(target=call_all_topi_funcs, args=(mod, params, target)) build_thread.start() build_thread.join() + dispatch_ctx.verbose = old_verbose # create 
search tasks tasks = [] diff --git a/tests/python/unittest/test_auto_scheduler_measure.py b/tests/python/unittest/test_auto_scheduler_measure.py index 116981028cc99..7605b70be6f49 100644 --- a/tests/python/unittest/test_auto_scheduler_measure.py +++ b/tests/python/unittest/test_auto_scheduler_measure.py @@ -357,7 +357,7 @@ def test_measure_target_host(): @tvm.testing.requires_llvm -def test_measure_special_inputs_map_by_name(): +def test_measure_special_inputs_map_by_name_local_runner(): @auto_scheduler.register_workload def foo(): X = te.placeholder(shape=[10], dtype="int32") @@ -384,6 +384,38 @@ def foo(): assert mress[0].error_no == 0 +@tvm.testing.requires_llvm +def test_measure_special_inputs_map_by_name_rpc_runner(): + @auto_scheduler.register_workload + def foo(): + X = te.placeholder(shape=[10], dtype="int32") + Index = te.placeholder(shape=[1], dtype="int32", name="Index") + Y = te.compute((1,), lambda i: X[Index[i]]) + return [X, Index, Y] + + # This workload cannot use random input for the `Index` input + task = auto_scheduler.SearchTask( + func=foo, + target="llvm", + task_inputs={ + "Index": tvm.nd.array(np.array([5], dtype="int32")), + }, + ) + + for enable_cpu_cache_flush in [True, False]: + minp = auto_scheduler.MeasureInput(task, task.compute_dag.init_state) + local_builder = auto_scheduler.LocalBuilder() + measure_ctx = auto_scheduler.LocalRPCMeasureContext( + timeout=60, enable_cpu_cache_flush=enable_cpu_cache_flush + ) + rpc_runner = measure_ctx.runner + + bress = local_builder.build([minp]) + assert bress[0].error_no == 0 + mress = rpc_runner.run([minp], bress) + assert mress[0].error_no == 0 + + if __name__ == "__main__": test_record_split_reorder_fuse_annotation() test_record_compute_at_root_inline_cache_read_write() @@ -395,4 +427,5 @@ def foo(): test_dag_measure_local_builder_runner() test_measure_local_builder_rpc_runner() test_measure_target_host() - test_measure_special_inputs_map_by_name() + test_measure_special_inputs_map_by_name_local_runner() + test_measure_special_inputs_map_by_name_rpc_runner() From df6fb6938e1a816739565c03d8e079a811515031 Mon Sep 17 00:00:00 2001 From: Alexander Pivovarov Date: Thu, 11 Mar 2021 06:36:29 -0800 Subject: [PATCH 04/84] [MIPS] Fix CALL16 reloc at 0x290 not against global symbol (#7634) --- src/target/llvm/codegen_cpu.cc | 7 +++++-- src/tir/transforms/make_packed_api.cc | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/target/llvm/codegen_cpu.cc b/src/target/llvm/codegen_cpu.cc index b37cd73ece04f..b49f850b2d906 100644 --- a/src/target/llvm/codegen_cpu.cc +++ b/src/target/llvm/codegen_cpu.cc @@ -437,11 +437,14 @@ void CodeGenCPU::CreateComputeScope(const AttrStmtNode* op) { arg_types.push_back(value->getType()); } llvm::FunctionType* ftype = llvm::FunctionType::get(t_int_, arg_types, false); + // $xxx_compute_ functions are not global. They should be marked as static (via InternalLinkage) + // to call them correctly on MIPS platform (CALL16 issue) + // Linkage ld Error: CALL16 reloc at 0x290 not against global symbol llvm::Function* fcompute = llvm::Function::Create( - ftype, llvm::Function::PrivateLinkage, + ftype, llvm::Function::InternalLinkage, op->value.as()->value.operator llvm::StringRef(), module_.get()); BasicBlock* compute_call_end = CheckCallSuccess(builder_->CreateCall(fcompute, arg_values)); - // setup compute fuinction. + // setup compute function. 
std::unordered_map new_vmap; size_t idx = 0; for (auto it = fcompute->arg_begin(); it != fcompute->arg_end(); ++it, ++idx) { diff --git a/src/tir/transforms/make_packed_api.cc b/src/tir/transforms/make_packed_api.cc index 0946af6f640af..3842f3e9a8ee5 100644 --- a/src/tir/transforms/make_packed_api.cc +++ b/src/tir/transforms/make_packed_api.cc @@ -229,7 +229,7 @@ PrimFunc MakePackedAPI(PrimFunc&& func, int num_unpacked_args) { // // For example, for auto broadcasting, checks are required to guarantee that // either 0 or the original stride will be correctly used. Checks here have - // to use the args that may have no let bining yet. Therefore, hoisting let + // to use the args that may have no let binding yet. Therefore, hoisting let // binding for args before buffer declaration is needed. for (const auto& kv : var_def) { binder.Bind(kv.second, kv.first, kv.first->name_hint, true); From 56feab9f4d97f310018d6a1df6ed4d5dd75e9178 Mon Sep 17 00:00:00 2001 From: Qiang Zhang Date: Thu, 11 Mar 2021 22:41:50 +0800 Subject: [PATCH 05/84] [Test] Add Test Case to Cover Bug Fix by PR#7432 (#7601) --- tests/python/relay/test_pass_auto_quantize.py | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tests/python/relay/test_pass_auto_quantize.py b/tests/python/relay/test_pass_auto_quantize.py index 8a7c4cbfbbd6a..31f5ac6e71b1d 100644 --- a/tests/python/relay/test_pass_auto_quantize.py +++ b/tests/python/relay/test_pass_auto_quantize.py @@ -307,6 +307,39 @@ def @main( verify_partition_fails(mod, params) +def test_left_shift_negative(): + data = relay.var("data", shape=(1, 16, 64, 64)) + weight = relay.const(np.full((16, 16, 3, 3), 256.0)) + conv2d = relay.nn.conv2d(data, weight, kernel_size=(3, 3), padding=(1, 1), channels=16) + relu = relay.nn.relu(conv2d) + + mod = tvm.IRModule.from_expr(relu) + + with tvm.transform.PassContext(opt_level=3): + with relay.quantize.qconfig( + calibrate_mode="global_scale", global_scale=8.0, skip_conv_layers=None + ): + qnn_mod = relay.quantize.quantize(mod) + + class OpFinder(relay.ExprVisitor): + def __init__(self, op_name): + super(OpFinder, self).__init__() + self._op_name = op_name + self.ops = list() + + def visit_call(self, call): + super().visit_call(call) + if call.op.name == self._op_name: + self.ops.append(call) + + opf = OpFinder("left_shift") + opf.visit(qnn_mod["main"]) + assert len(opf.ops) > 0, 'Broken case, can\'t find any "left_shift" operators.' + for left_shift_op in opf.ops: + shift_amount = left_shift_op.args[1].data.asnumpy() + assert shift_amount >= 0, "Shift amount must be non-negative." 
+ + if __name__ == "__main__": test_mul_rewrite() test_batch_flatten_rewrite() @@ -320,3 +353,4 @@ def @main( test_unquantizable_prefix_partition() test_unquantizable_core_partition() test_unquantizable_suffix_partition() + test_left_shift_negative() From f8596b5228346eae18f25f22e3096fef758b1166 Mon Sep 17 00:00:00 2001 From: masahi Date: Fri, 12 Mar 2021 00:56:28 +0900 Subject: [PATCH 06/84] [ONNX] Use take instead of min in NMS conditions (#7633) --- python/tvm/relay/frontend/onnx.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index 860753d6cd0b6..f31b8c927f8f1 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -2453,7 +2453,7 @@ def _first_cond( nms_size_out, ): # Loop over classes, end when i == C - return _op.min(_op.less(i, C)) + return _op.take(_op.less(i, C), _expr.const(0)) def _first_body( i, @@ -2561,7 +2561,7 @@ def _first_body( def _inner_cond(i, j, C, onnx_out, nms_size, out): # inner loop over number of classes - return _op.min(_op.less(j, C)) + return _op.take(_op.less(j, C), _expr.const(0)) def _inner_body(i, j, C, onnx_out, nms_size, out): # slice to get current batch and class for valid box indicator @@ -2591,7 +2591,7 @@ def _inner_body(i, j, C, onnx_out, nms_size, out): def _outer_cond(i, B, C, onnx_out, nms_size_out, out): # Outer loop is over batch size - return _op.min(_op.less(i, B)) + return _op.take(_op.less(i, B), _expr.const(0)) def _outer_body(i, B, C, onnx_out, nms_size_out, out): # Outer loop just calls inner loop From b2a3c481ebbb7cfbd5335fb11cd516ae5f348406 Mon Sep 17 00:00:00 2001 From: masahi Date: Fri, 12 Mar 2021 04:42:47 +0900 Subject: [PATCH 07/84] [Ansor] Add HW param for Vulkan tuning (#7626) * add HW param for VK * query warp size properly * guard against warp_size < 4 case Co-authored-by: Masahiro Masuda --- src/auto_scheduler/search_task.cc | 23 +++++++++++++++++++++++ src/runtime/vulkan/vulkan.cc | 25 ++++++++++++++++--------- 2 files changed, 39 insertions(+), 9 deletions(-) diff --git a/src/auto_scheduler/search_task.cc b/src/auto_scheduler/search_task.cc index 22c2893141cf2..f25e581dbf24a 100755 --- a/src/auto_scheduler/search_task.cc +++ b/src/auto_scheduler/search_task.cc @@ -106,6 +106,29 @@ HardwareParams HardwareParamsNode::GetDefaultHardwareParams(const Target& target auto target_device = target->GetAttr("device", ""); LOG(FATAL) << "No default hardware parameters for opencl target device: " << target_device; } + } else if (device_type == kDLVulkan) { + auto ctx = TVMContext{static_cast(device_type), 0}; + auto device_name = "device_api.vulkan"; + auto func = tvm::runtime::Registry::Get(device_name); + ICHECK(func != nullptr) << "Cannot find Vulkan device_api in registry"; + auto device_api = static_cast(((*func)()).operator void*()); + + tvm::runtime::TVMRetValue ret; + device_api->GetAttr(ctx, tvm::runtime::DeviceAttrKind::kMaxSharedMemoryPerBlock, &ret); + int max_shared_memory_per_block = ret; + + int max_local_memory_per_block = INT32_MAX; + + device_api->GetAttr(ctx, tvm::runtime::DeviceAttrKind::kMaxThreadsPerBlock, &ret); + int max_threads_per_block = ret; + + device_api->GetAttr(ctx, tvm::runtime::DeviceAttrKind::kWarpSize, &ret); + int warp_size = ret; + + int max_vthread_extent = std::max(1, warp_size / 4); + + return HardwareParams(-1, 16, 64, max_shared_memory_per_block, max_local_memory_per_block, + max_threads_per_block, max_vthread_extent, warp_size); } else { LOG(FATAL) << "No default 
hardware parameters for target: " << target; } diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc index 794f3c570f962..ff1b82f930d73 100644 --- a/src/runtime/vulkan/vulkan.cc +++ b/src/runtime/vulkan/vulkan.cc @@ -367,28 +367,37 @@ void VulkanDeviceAPI::GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* } ICHECK_LT(index, context_.size()) << "Invalid device id " << index; const auto& vctx = context(index); + VkPhysicalDeviceProperties phy_prop; + vkGetPhysicalDeviceProperties(vctx.phy_device, &phy_prop); + switch (kind) { case kMaxThreadsPerBlock: { - VkPhysicalDeviceProperties phy_prop; - vkGetPhysicalDeviceProperties(vctx.phy_device, &phy_prop); int64_t value = phy_prop.limits.maxComputeWorkGroupInvocations; *rv = value; break; } case kMaxSharedMemoryPerBlock: { - VkPhysicalDeviceProperties phy_prop; - vkGetPhysicalDeviceProperties(vctx.phy_device, &phy_prop); int64_t value = phy_prop.limits.maxComputeSharedMemorySize; *rv = value; break; } case kWarpSize: { - *rv = 1; + VkPhysicalDeviceSubgroupProperties subgroup_prop; + subgroup_prop.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES; + subgroup_prop.pNext = NULL; + + VkPhysicalDeviceProperties2 phy_prop2; + phy_prop2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2; + phy_prop2.pNext = &subgroup_prop; + + vkGetPhysicalDeviceProperties2(vctx.phy_device, &phy_prop2); + int64_t subgroup_size = subgroup_prop.subgroupSize; + ICHECK(subgroup_size >= 1); + + *rv = subgroup_size; break; } case kComputeVersion: { - VkPhysicalDeviceProperties phy_prop; - vkGetPhysicalDeviceProperties(vctx.phy_device, &phy_prop); int64_t value = phy_prop.apiVersion; std::ostringstream os; os << VK_VERSION_MAJOR(value) << "." << VK_VERSION_MINOR(value) << "." @@ -405,8 +414,6 @@ void VulkanDeviceAPI::GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* case kExist: break; case kMaxThreadDimensions: { - VkPhysicalDeviceProperties phy_prop; - vkGetPhysicalDeviceProperties(vctx.phy_device, &phy_prop); int64_t dims[3]; dims[0] = phy_prop.limits.maxComputeWorkGroupSize[0]; dims[1] = phy_prop.limits.maxComputeWorkGroupSize[1]; From 017ff94d15df85ea8476f8ad3ce234470072ae84 Mon Sep 17 00:00:00 2001 From: ANSHUMAN TRIPATHY Date: Fri, 12 Mar 2021 02:31:59 +0530 Subject: [PATCH 08/84] [TOPI] Sparse Add Op added (#7435) * [TOPI] Sparse Add Op added * lint resolved * TF frontend support added * Test case added * [1] Review comment handled * [2] Review comment handled * [3] Review comment handled * [4] Review comment handled * [5] Review comment handled --- python/tvm/relay/frontend/tensorflow.py | 35 ++++++++++ python/tvm/relay/op/nn/_nn.py | 5 ++ python/tvm/relay/op/nn/nn.py | 47 +++++++++++++ python/tvm/relay/op/strategy/generic.py | 23 +++++++ python/tvm/topi/nn/sparse.py | 69 +++++++++++++++++++ src/relay/op/nn/sparse.cc | 41 +++++++++++ .../frontend/tensorflow/test_forward.py | 48 +++++++++++++ tests/python/topi/python/test_topi_sparse.py | 28 ++++++++ 8 files changed, 296 insertions(+) diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index c79c495b03604..f56d187b6a632 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -1286,6 +1286,40 @@ def _impl(inputs, attr, params, mod): return _impl +def _sparse_tensor_dense_add(): + # Sparse utility from scipy + from scipy.sparse import csr_matrix + + def _impl(inputs, attr, params, mod): + assert ( + len(inputs) == 4 + ), "There should be 4 input tensors [sparse_indices, 
sparse_values, sparse_shape, dense]." + + indices_tensor = _infer_value(inputs[0], params, mod).asnumpy() + values_tensor = _infer_value(inputs[1], params, mod).asnumpy() + dense_shape_tensor = _infer_value(inputs[2], params, mod).asnumpy() + + data = inputs[3] + + rows = [x[0] for x in indices_tensor] + cols = [x[1] for x in indices_tensor] + + # Create scipy sparse Tensor(CSR) + weight_sp = csr_matrix( + (values_tensor, (rows, cols)), shape=tuple(dense_shape_tensor.tolist()) + ) + + weight_data = _expr.const(weight_sp.data, weight_sp.data.dtype) + weight_indptrs = _expr.const(weight_sp.indptr, weight_sp.indptr.dtype) + weight_indices = _expr.const(weight_sp.indices, weight_sp.indices.dtype) + + ret = _op.nn.sparse_add(data, [weight_data, weight_indices, weight_indptrs]) + + return ret + + return _impl + + def _identity(): def _impl(inputs, attr, params, mod): return inputs[0] @@ -2787,6 +2821,7 @@ def _impl(inputs, attr, params, mod): "SparseSegmentSqrtNWithNumSegments": _sparse_segment_sum_sqrtn_with_num_segments(), "SparseSegmentMean": _sparse_segment_mean(), "SparseSegmentMeanWithNumSegments": _sparse_segment_mean_with_num_segments(), + "SparseTensorDenseAdd": _sparse_tensor_dense_add(), "Split": _split(False), "SplitV": _split(True), "Sqrt": AttrCvt("sqrt"), diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index 6ae86c0786e52..af64873ee9049 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -142,6 +142,11 @@ def alter_op_layout_sparse_dense(attrs, inputs, tinfos, out_type): return topi.nn.sparse_dense_alter_layout(attrs, inputs, tinfos, out_type) +# sparse_add +reg.register_strategy("nn.sparse_add", strategy.sparse_add_strategy) +reg.register_pattern("nn.sparse_add", reg.OpPattern.OPAQUE) + + @reg.register_compute("nn.internal.sparse_dense_padded") def compute_sparse_dense_padded(attrs, inputs, out_type): """Compute definition of sparse_dense_padded""" diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index 5135ac74de250..a1147fec4d7eb 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -2148,6 +2148,53 @@ def sparse_transpose(x): return expr.TupleWrapper(_make.sparse_transpose(x[0], x[1], x[2]), 3) +# pylint: disable=no-else-return,inconsistent-return-statements +def sparse_add(dense_mat, sparse_mat): + r""" + Computes the matrix addition of `dense_mat` and `sparse_mat`, where `dense_mat` is + a dense matrix and `sparse_mat` is a sparse (CSR) namedtuple with + fields `data`, `indices`, and `indptr`. + + .. math:: + + \mbox{sparse_add}(dense_mat, sparse_mat)[m, n] = \mbox{add}(\mbox{as_dense}(S), (D))[m, n] + + where `as_dense` returns dense equivalent of the given S(sparse matrix) + while performing addition with given D(dense matrix). + + Parameters + ---------- + dense_mat : tvm.relay.Expr + The input dense matrix for the matrix addition + + sparse_mat : Union[namedtuple, Tuple[ndarray, ndarray, ndarray]]. + The input sparse matrix(CSR) for the matrix addition. + + Returns + ------- + result: tvm.relay.Expr + The computed result. + + Examples + ------- + .. code-block:: python + dense_data = [[ 3., 4., 4. ] + [ 4., 2., 5. ]] + sparse_data = [4., 8.] + sparse_indices =[0, 2] + sparse_indptr =[0, 1, 2] + + output = relay.sparse_add(dense_data, sparse_data, sparse_indices, sparse_indptr) + + output = [[ 7., 4., 4. ] + [ 4., 2., 13. 
]] + """ + if hasattr(sparse_mat, "indices"): + return _make.sparse_add(dense_mat, sparse_mat.data, sparse_mat.indices, sparse_mat.indptr) + else: + return _make.sparse_add(dense_mat, sparse_mat[0], sparse_mat[1], sparse_mat[2]) + + def contrib_conv2d_winograd_without_weight_transform( data, weight, diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index be86ea9d9184c..04f25640574a5 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -799,6 +799,29 @@ def sparse_dense_padded_strategy(attrs, inputs, out_type, target): raise NotImplementedError("sparse_dense_padded is only implemented for cuda") +# sparse_add +def wrap_compute_sparse_add(topi_compute): + """wrap sparse add topi compute""" + + def _compute_sparse_add(attrs, inputs, out_type): + return [topi_compute(inputs[0], inputs[1], inputs[2], inputs[3])] + + return _compute_sparse_add + + +@override_native_generic_func("sparse_add_strategy") +def sparse_add_strategy(attrs, inputs, out_type, target): + """sparse add generic strategy""" + logger.warning("sparse add is not optimized for this platform.") + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_sparse_add(topi.nn.sparse_add), + wrap_topi_schedule(topi.generic.schedule_extern), + name="sparse_add.generic", + ) + return strategy + + # sparse_transpose @generic_func def schedule_sparse_transpose(attrs, outs, target): diff --git a/python/tvm/topi/nn/sparse.py b/python/tvm/topi/nn/sparse.py index 1bf18df09da3c..756110624aa19 100644 --- a/python/tvm/topi/nn/sparse.py +++ b/python/tvm/topi/nn/sparse.py @@ -468,3 +468,72 @@ def _traverse(t): sparse_input_map[sparse_indptr] = sparse_prefix + "W_indptr" return sparse_input_map + + +def sparse_add(dense_data, sparse_data, sparse_indices, sparse_indptr): + """ + Computes sparse-dense addition + + Parameters + ---------- + dense_data : tvm.te.Tensor + 2-D with shape [M, N] + + sparse_data : tvm.te.Tensor + 1-D with shape [nnz] (CSR) + + sparse_indices : tvm.te.Tensor + 1-D with shape [nnz] (CSR) + + sparse_indptr : tvm.te.Tensor + 1-D with shape [M + 1] (CSR) + + Returns + ------- + output : tvm.te.Tensor + 2-D with shape [M, N] + """ + # TODO(ANSHUMAN87): support BSR format too + assert len(sparse_data.shape) == 1, "only CSR format is supported" + return _sparse_add_csr(dense_data, sparse_data, sparse_indices, sparse_indptr) + + +def _sparse_add_csr(dense_data_inp, sparse_data_inp, sparse_indices_inp, sparse_indptr_inp): + oshape = get_const_tuple(dense_data_inp.shape) + + def _csr_add_ir(dense_data, sparse_data, sparse_indices, sparse_indptr, out_data): + irb = tvm.tir.ir_builder.create() + dense_data_ptr = irb.buffer_ptr(dense_data) + sparse_data_ptr = irb.buffer_ptr(sparse_data) + sparse_indices_ptr = irb.buffer_ptr(sparse_indices) + sparse_indptr_ptr = irb.buffer_ptr(sparse_indptr) + + out_data_ptr = irb.buffer_ptr(out_data) + + with irb.for_range(0, oshape[0], kind="vectorize", name="row") as row: + with irb.for_range(0, oshape[1], kind="parallel", name="col") as col: + out_data_ptr[row, col] = dense_data_ptr[row, col] + + with irb.for_range(0, oshape[0], kind="parallel", name="row") as row: + offset = sparse_indptr_ptr[row] + diff = sparse_indptr_ptr[row + 1] - sparse_indptr_ptr[row] + with irb.for_range(0, diff, kind="serial", name="idx") as idx: + real_idx = offset + idx + col = sparse_indices_ptr[real_idx] + out_data_ptr[row, col] = sparse_data_ptr[real_idx] + out_data_ptr[row, col] + + return irb.get() + + return 
te.extern( + shape=oshape, + inputs=[dense_data_inp, sparse_data_inp, sparse_indices_inp, sparse_indptr_inp], + fcompute=lambda ins, outs: _csr_add_ir(ins[0], ins[1], ins[2], ins[3], outs[0]), + tag="sparse_add_csr", + dtype=[ + dense_data_inp.dtype, + sparse_data_inp.dtype, + sparse_indices_inp.dtype, + sparse_indptr_inp.dtype, + ], + name="sparse_add_csr_output", + ) diff --git a/src/relay/op/nn/sparse.cc b/src/relay/op/nn/sparse.cc index 6322cfffd7c22..b1a16f18b6237 100644 --- a/src/relay/op/nn/sparse.cc +++ b/src/relay/op/nn/sparse.cc @@ -196,5 +196,46 @@ RELAY_REGISTER_OP("nn.sparse_transpose") .set_support_level(1) .add_type_rel("SparseTranspose", SparseTransposeRel); +// relay.nn.sparse_add +bool SparseAddRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + ICHECK_EQ(types.size(), 5) << "expecting 4 inputs and 1 output."; + const auto* dense_data = types[0].as(); + const auto* sparse_data = types[1].as(); + ICHECK(reporter->Assert(sparse_data->dtype == dense_data->dtype)) + << "sparse tensor and dense tensor datatype should match."; + ICHECK(reporter->Assert(sparse_data->shape.size() == 1)) << "sparse data tensor should be 1D."; + const auto* sparse_indices = types[2].as(); + ICHECK(reporter->Assert(sparse_indices->shape.size() == 1)) + << "sparse indices tensor should be 1D."; + + reporter->Assign(types[4], TensorType(dense_data->shape, dense_data->dtype)); + return true; +} + +Expr MakeSparseAdd(Expr dense_data, Expr sparse_data, Expr sparse_indices, Expr sparse_indptr) { + static const Op& op = Op::Get("nn.sparse_add"); + return Call(op, {dense_data, sparse_data, sparse_indices, sparse_indptr}, Attrs(), {}); +} + +TVM_REGISTER_GLOBAL("relay.op.nn._make.sparse_add").set_body_typed(MakeSparseAdd); + +RELAY_REGISTER_OP("nn.sparse_add") + .describe(R"code(Add a dense matrix X with sparse matrix Y. + +- **dense**: `(M, N)` +- **sparse**: `(M, N)` + +- **out**: `(M, N)`. 
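+Example: with sparse_data = [4., 8.], sparse_indices = [0, 2] and sparse_indptr = [0, 1, 2],
+element (0, 0) of the dense input is increased by 4 and element (1, 2) by 8, while all
+other elements pass through unchanged.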
+ +)code" TVM_ADD_FILELINE) + .set_num_inputs(4) + .add_argument("dense_data", "2D Tensor", "Dense data matrix.") + .add_argument("sparse_data", "1D Tensor", "Sparse data vector.") + .add_argument("sparse_indices", "1D Tensor", "Sparse indices vector.") + .add_argument("sparse_indptr", "1D Tensor", "Sparse index pointer vector.") + .set_support_level(1) + .add_type_rel("SparseAdd", SparseAddRel); + } // namespace relay } // namespace tvm diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index 81aeb5ef886c3..fa27dee37699b 100644 --- a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -2352,6 +2352,54 @@ def test_forward_sparse_to_dense_v2(): _test_sparse_to_dense_v2([[0, 0], [1, 3], [4, 3]], [3.0, 6.0, 9.0], [5, 5], "float32", 1.9) +####################################################################### +# tensorflow.sparse.add +# ---------------------------------- + + +def _test_sparse_add(indices, values, A_shape, B_shape, dtype, flip=False): + """ One iteration of tf.sparse.add """ + + # TODO(ANSHUMAN87): support cuda + # TODO(ANSHUMAN87): support both sparse input case + + with tf.Graph().as_default(): + A_sp = tf.sparse.SparseTensor( + indices=indices, values=np.array(values).astype(dtype), dense_shape=A_shape + ) + B = tf.placeholder(shape=B_shape, dtype=dtype, name="B") + + # TODO(ANSHUMAN87): support user input threashold values + if flip: + result = tf.sparse.add(B, A_sp, threshold=0) + else: + result = tf.sparse.add(A_sp, B, threshold=0) + + B_np = np.random.uniform(high=5.0, size=B_shape).astype(dtype) + + compare_tf_with_tvm([B_np], [B.name], result.name, no_gpu=True) + + +def test_sparse_add(): + """ sparse.add op test""" + ################################################################### + # + # In order to create a SparseTensor, it requires 3 input as below: + # SparseTensor(indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4]) + # + # Above Sparse can be represented in Dense as below : + # [[1, 0, 0, 0] + # [0, 0, 2, 0] + # [0, 0, 0, 0]] + # + # ------------------------------------------------------------------ + for dtype_inp in ["float32", "float64", "int32"]: + _test_sparse_add([[0, 0], [1, 2]], [4.0, 8.0], [3, 4], [3, 4], dtype_inp) + _test_sparse_add([[0, 0], [1, 2]], [4.0, 8.0], [3, 4], [3, 4], dtype_inp, True) + _test_sparse_add([[0, 0], [1, 3], [4, 3]], [3.0, 6.0, 9.0], [5, 5], [5, 5], dtype_inp) + _test_sparse_add([[0, 0], [1, 3], [4, 3]], [3.0, 6.0, 9.0], [5, 5], [5, 5], dtype_inp, True) + + ####################################################################### # StridedSlice # ------------ diff --git a/tests/python/topi/python/test_topi_sparse.py b/tests/python/topi/python/test_topi_sparse.py index d5bd7aa1a21ec..d84bd1530587f 100644 --- a/tests/python/topi/python/test_topi_sparse.py +++ b/tests/python/topi/python/test_topi_sparse.py @@ -526,6 +526,33 @@ def test_sparse_dense_padded_alter_op(): x = relay.build(tvm.IRModule.from_expr(f), target=tvm.target.Target("cuda")) +def test_sparse_add_csr(): + for indices_dtype in ["int32", "int64"]: + for data_dtype in ["float32", "float64"]: + M, K, density = 3, 49, 0.2 + X_np = np.random.randn(M, K).astype(data_dtype) + Y_sp_np = sp.random(M, K, density=density, format="csr", dtype=data_dtype) + Y_np = Y_sp_np.todense() + Z_np = X_np + Y_np + + Y_data = te.placeholder(shape=Y_sp_np.data.shape, dtype=data_dtype) + Y_indices = te.placeholder(shape=Y_sp_np.indices.shape, dtype=indices_dtype) + 
Y_indptr = te.placeholder(shape=Y_sp_np.indptr.shape, dtype=indices_dtype) + X = te.placeholder(shape=X_np.shape, dtype=data_dtype) + Z = topi.nn.sparse_add(X, Y_data, Y_indices, Y_indptr) + s = te.create_schedule(Z.op) + func = tvm.build(s, [X, Y_data, Y_indices, Y_indptr, Z]) + Z_tvm = tvm.nd.array(np.zeros(Z_np.shape, dtype=Z_np.dtype)) + func( + tvm.nd.array(X_np.astype(data_dtype)), + tvm.nd.array(Y_sp_np.data.astype(data_dtype)), + tvm.nd.array(Y_sp_np.indices.astype(indices_dtype)), + tvm.nd.array(Y_sp_np.indptr.astype(indices_dtype)), + Z_tvm, + ) + tvm.testing.assert_allclose(Z_tvm.asnumpy(), Z_np, atol=1e-4, rtol=1e-4) + + if __name__ == "__main__": test_csrmv() test_csrmm() @@ -537,3 +564,4 @@ def test_sparse_dense_padded_alter_op(): test_sparse_dense_padded_alter_op() test_sparse_dense_csr_reverse() test_sparse_dense_bsr_reverse() + test_sparse_add_csr() From e9e014b4685ea05885cb321f1e42d2d3f71a0407 Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Thu, 11 Mar 2021 14:53:48 -0800 Subject: [PATCH 09/84] [Relay][QNN] Simulated Quantize and Dequantize (#7613) * Add initial implementation of flexible simulated qnn ops. * Added proper topi testing and fixed qnn axis bug. * Add injective schedule wrapping. * Stuck on typerel problem. * Relay integration fully working. * Simulated quantize totally finished. * Change dtype to be a scalar rather than tensor. * Undo change to quantize. * formatting. * Fix attritubes. * Fix negative axis dequantize bug. * Add topi simulated dequantize. * Add simulated_dequantize op to topi and relay. * Formatting. * Test negative axis perchannel dequantization. * Lint formatting. * Change import order to make lint happy. * Fix pytest. * Directly return make call. * Clarify disable mode for simulated qnn ops and fix typos. * Line too long oops. 
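For reviewers, a minimal usage sketch of the two new ops (the shapes and
quantization parameters below are arbitrary illustrative values; the argument
names follow the docstrings added in this patch):

    import numpy as np
    from tvm import relay

    data = relay.var("data", shape=(2, 4), dtype="float32")
    scale = relay.const(np.array([0.5], dtype="float32"))
    zero_point = relay.const(np.array([0], dtype="int32"))
    # Both ops compute in float32; the dtype argument may be a string or a
    # runtime tensor code, which is what enables dynamic datatype selection.
    sq = relay.qnn.op.simulated_quantize(data, scale, zero_point, axis=-1, out_dtype="int8")
    sdq = relay.qnn.op.simulated_dequantize(sq, scale, zero_point, axis=-1, in_dtype="int8")
    func = relay.Function([data], sdq)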
Co-authored-by: Ubuntu --- include/tvm/relay/qnn/attrs.h | 12 ++ python/tvm/relay/qnn/op/__init__.py | 2 +- python/tvm/relay/qnn/op/_qnn.py | 52 +++++ python/tvm/relay/qnn/op/qnn.py | 72 ++++++- python/tvm/topi/nn/__init__.py | 1 + python/tvm/topi/nn/qnn.py | 190 ++++++++++++++++++ src/relay/qnn/op/dequantize.cc | 9 +- src/relay/qnn/op/quantize.cc | 13 +- src/relay/qnn/op/simulated_dequantize.cc | 80 ++++++++ src/relay/qnn/op/simulated_quantize.cc | 82 ++++++++ tests/python/relay/test_op_qnn_dequantize.py | 2 +- tests/python/relay/test_op_qnn_quantize.py | 2 +- .../relay/test_op_qnn_simulated_dequantize.py | 177 ++++++++++++++++ .../relay/test_op_qnn_simulated_quantize.py | 178 ++++++++++++++++ tests/python/topi/python/test_topi_qnn.py | 157 +++++++++++++++ 15 files changed, 1019 insertions(+), 10 deletions(-) create mode 100644 python/tvm/relay/qnn/op/_qnn.py create mode 100644 python/tvm/topi/nn/qnn.py create mode 100644 src/relay/qnn/op/simulated_dequantize.cc create mode 100644 src/relay/qnn/op/simulated_quantize.cc create mode 100644 tests/python/relay/test_op_qnn_simulated_dequantize.py create mode 100644 tests/python/relay/test_op_qnn_simulated_quantize.py create mode 100644 tests/python/topi/python/test_topi_qnn.py diff --git a/include/tvm/relay/qnn/attrs.h b/include/tvm/relay/qnn/attrs.h index c5213fe074713..f0280a90c604f 100644 --- a/include/tvm/relay/qnn/attrs.h +++ b/include/tvm/relay/qnn/attrs.h @@ -75,6 +75,18 @@ struct QuantizeAttrs : public tvm::AttrsNode { } }; +struct SimulatedQuantizeAttrs : public tvm::AttrsNode { + int axis; + + TVM_DECLARE_ATTRS(SimulatedQuantizeAttrs, "relay.attrs.SimulatedQuantizeAttrs") { + TVM_ATTR_FIELD(axis) + .describe( + "The output channel axis for channel wise quantization. Default value is -1," + "which corresponds to the last axis.") + .set_default(-1); + } +}; + /*! \brief Attribute for dequantize operator */ struct DequantizeAttrs : public tvm::AttrsNode { int axis; diff --git a/python/tvm/relay/qnn/op/__init__.py b/python/tvm/relay/qnn/op/__init__.py index 6d66e12eeafc1..848409360a9de 100644 --- a/python/tvm/relay/qnn/op/__init__.py +++ b/python/tvm/relay/qnn/op/__init__.py @@ -19,4 +19,4 @@ from __future__ import absolute_import as _abs from .qnn import * from .op import register_qnn_legalize -from . import legalizations, layout_conversions +from . import _qnn, legalizations, layout_conversions diff --git a/python/tvm/relay/qnn/op/_qnn.py b/python/tvm/relay/qnn/op/_qnn.py new file mode 100644 index 0000000000000..a059c293a0f83 --- /dev/null +++ b/python/tvm/relay/qnn/op/_qnn.py @@ -0,0 +1,52 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=invalid-name, unused-argument, len-as-condition +"""QNN operator feature registration""" + +from tvm import topi + +from ...op.op import register_compute +from ...op.op import register_injective_schedule +from ...op.op import register_pattern, OpPattern + + +@register_compute("qnn.simulated_quantize") +def simulated_quantize_compute(attrs, inputs, output_type): + assert len(inputs) == 4 + return [ + topi.nn.simulated_quantize( + inputs[0], inputs[1], inputs[2], inputs[3], axis=attrs.get_int("axis") + ) + ] + + +register_injective_schedule("qnn.simulated_quantize") +register_pattern("qnn.simulated_quantize", OpPattern.ELEMWISE) + + +@register_compute("qnn.simulated_dequantize") +def simulated_dequantize_compute(attrs, inputs, output_type): + assert len(inputs) == 4 + return [ + topi.nn.simulated_dequantize( + inputs[0], inputs[1], inputs[2], inputs[3], axis=attrs.get_int("axis") + ) + ] + + +register_injective_schedule("qnn.simulated_dequantize") +register_pattern("qnn.simulated_dequantize", OpPattern.ELEMWISE) diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py index a5892f331f060..f02f8227e14a2 100644 --- a/python/tvm/relay/qnn/op/qnn.py +++ b/python/tvm/relay/qnn/op/qnn.py @@ -18,8 +18,10 @@ """QNN dialect operators.""" from __future__ import absolute_import as _abs +from tvm import relay from tvm.relay.expr import Tuple, TupleWrapper from tvm.relay.op.nn.utils import get_pad_tuple2d +from tvm.topi.nn.qnn import SQNN_DTYPE_TO_CODE from . import _make from ... import op as reg from ...op import OpPattern @@ -118,6 +120,40 @@ def quantize(data, output_scale, output_zero_point, axis=-1, out_dtype="int8"): return _make.quantize(data, output_scale, output_zero_point, axis, out_dtype) +def simulated_quantize(data, output_scale, output_zero_point, axis=-1, out_dtype="int8"): + r"""Simulated Quantize op + Mimics the quantize op but has more flexibility in valid inputs and always + outputs the same type as the input. This can be useful for + calibrating or training a quantized network. + + Parameters + ---------- + data : tvm.relay.Expr + The input tensor to be quantized. Can be of type float32. + output_zero_point : tvm.relay.Expr + The output zero_point. + output_scale : tvm.relay.Expr + The output scale. + axis : int + The channel axis for quantization. Default value is -1 which corresponds to the last axis. + out_dtype : string or tvm.relay.Expr + A string or tensor indicating which datatype to quantize to. + + Returns + ------- + result : tvm.relay.Expr + The computed result. + """ + # Convert string dtype to a constant if needed. + if isinstance(out_dtype, str): + type_code = SQNN_DTYPE_TO_CODE[out_dtype] + out_dtype = relay.const(type_code, dtype="int32") + # Wrap reshapes around qnn parameter tensors to guarantee shape compatibility. + output_scale = relay.op.reshape(output_scale, [-1]) + output_zero_point = relay.op.reshape(output_zero_point, [-1]) + return _make.simulated_quantize(data, out_dtype, output_scale, output_zero_point, axis) + + def dequantize(data, input_scale, input_zero_point, axis=-1): r"""Dequantize op This operator takes quantized int8 and unit8 as input and produces @@ -127,7 +163,7 @@ def dequantize(data, input_scale, input_zero_point, axis=-1): Parameters ---------- data : tvm.relay.Expr - The input tensor to be dequantized. Can be of type [int8, uint8]. + The input tensor to be dequantized. Can be of type [int8, uint8, int32]. input_zero_point : tvm.relay.Expr The input zero_point. 
input_scale : tvm.relay.Expr @@ -143,6 +179,40 @@ def dequantize(data, input_scale, input_zero_point, axis=-1): return _make.dequantize(data, input_scale, input_zero_point, axis) +def simulated_dequantize(data, input_scale, input_zero_point, axis=-1, in_dtype="int8"): + r"""Simulated Dequantize op + Mimics the dequantize op but has more flexibility in valid inputs and always + outputs the same type as the input. This can be useful for calibrating or + training a quantized network. + + Parameters + ---------- + data : tvm.relay.Expr + The input tensor to be dequantized. + input_zero_point : tvm.relay.Expr + The input zero_point. + input_scale : tvm.relay.Expr + The input scale. + axis : int + The channel axis for quantization. Default value is -1 which corresponds to the last axis. + in_dtype : string or tvm.relay.Expr + A string or tensor indicating which datatype to dequantize from. + + Returns + ------- + result : tvm.relay.Expr + The computed result. + """ + # Convert string dtype to a constant if needed. + if isinstance(in_dtype, str): + type_code = SQNN_DTYPE_TO_CODE[in_dtype] + in_dtype = relay.const(type_code, dtype="int32") + # Wrap reshapes around qnn parameter tensors to guarantee shape compatibility. + input_scale = relay.op.reshape(input_scale, [-1]) + input_zero_point = relay.op.reshape(input_zero_point, [-1]) + return _make.simulated_dequantize(data, in_dtype, input_scale, input_zero_point, axis) + + def concatenate(data, input_scales, input_zero_points, output_scale, output_zero_point, axis): """Concatenate the quantized input tensors along the given axis. diff --git a/python/tvm/topi/nn/__init__.py b/python/tvm/topi/nn/__init__.py index 2ebbd1d67bd18..94a5b30c9b760 100644 --- a/python/tvm/topi/nn/__init__.py +++ b/python/tvm/topi/nn/__init__.py @@ -36,6 +36,7 @@ from .conv2d_transpose import * from .conv1d_transpose import * from .bnn import * +from .qnn import * from .upsampling import * from .local_response_norm import * from .bitserial_conv2d import * diff --git a/python/tvm/topi/nn/qnn.py b/python/tvm/topi/nn/qnn.py new file mode 100644 index 0000000000000..caed285800379 --- /dev/null +++ b/python/tvm/topi/nn/qnn.py @@ -0,0 +1,190 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""Quantized Neural Network (QNN) Operators""" +import tvm +from tvm import te, tir, topi + +SQNN_DISABLE = 0 +SQNN_INT8 = 1 +SQNN_UINT8 = 2 +SQNN_INT32 = 3 + +SQNN_DTYPE_TO_CODE = { + "disable": SQNN_DISABLE, + "int8": SQNN_INT8, + "uint8": SQNN_UINT8, + "int32": SQNN_INT32, +} + +SQNN_CODE_TO_DTYPE = {v: k for k, v in SQNN_DTYPE_TO_CODE.items()} + + +@tvm.te.tag_scope(tag=topi.tag.ELEMWISE) +def simulated_quantize(data, out_dtype, output_scale=None, output_zero_point=None, axis=-1): + """Simulated QNN quantize operator that mimics QNN outputs without changing datatype. + The benefit of this operator over true QNN quantize is that this operator allows dynamic + datatype selection and can operate on both per-channel and scalar scales and zero points while + QNN quantize requires both of these to be fixed at compile time. + + Parameters + ---------- + data: tvm.te.Tensor + An N-D input tensor to the operator. + + out_dtype: tvm.te.Tensor + A scalar variable that indicates which datatype to simulate quantization with. Use + SQNN_DTYPE_TO_CODE to convert a dtype string into the corresponding variable + value. + + output_scale: tvm.te.Tensor, optional + A scalar tensor representing the scale to use when quantizing to integer datatypes. + When it contains more than a single value, N must match the number of channels in data. + + output_zero_point: tvm.te.Tensor, optional + A 1-D tensor representing the zero point to use when quantizing to integer datatypes. + When it contains more than a single value, N must match the number of channels in data. + + axis: int, optional + The channel axis for quantization. Default value is -1 which corresponds to the last axis. + + """ + # When disabled, just pass through the input values. + def _compute_pass_through(value, *indices): + return value[indices] + + # Simulate quantization for arbitrary integer datatypes. The computation for all datatypes is: + # Q_output = clip((round(input_tensor/output_scale) + output_zero_point), + # out_dtype::min, + # out_dtype::max) + def _compute_intn(dtype, value, *indices): + assert output_scale is not None and output_zero_point is not None + const_min = tvm.tir.min_value(dtype) + const_max = tvm.tir.max_value(dtype) + # Use indexmod to handle both scalar and per-channel QNN parameters. + scale_idx = tir.indexmod(indices[axis], topi.shape(output_scale)[0]) + zp_idx = tir.indexmod(indices[axis], topi.shape(output_zero_point)[0]) + return te.max( + te.min( + te.round(value[indices] / output_scale[scale_idx]) + output_zero_point[zp_idx], + const_max, + ), + const_min, + ) + + # Use an if chain to dynamically return the proper quantization based on the input datatype. + # This allows the op to compile once but apply different quantization approaches + # using a variable datatype input. 
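+    # For example, with out_dtype == SQNN_INT8, output_scale = 0.5 and
+    # output_zero_point = 10 (illustrative values), an input of 3.0 becomes
+    # clip(round(3.0 / 0.5) + 10, -128, 127) = 16.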
+ def _dispatch_sim_quantize(value): + pass_through_value = te.compute( + data.shape, lambda *indices: _compute_pass_through(value, *indices) + ) + int8_value = te.compute( + data.shape, + lambda *indices: tir.if_then_else( + out_dtype.equal(SQNN_DTYPE_TO_CODE["int8"]), + _compute_intn("int8", value, *indices), + pass_through_value[indices], + ), + ) + uint8_value = te.compute( + data.shape, + lambda *indices: tir.if_then_else( + out_dtype.equal(SQNN_DTYPE_TO_CODE["uint8"]), + _compute_intn("uint8", value, *indices), + int8_value[indices], + ), + ) + int32_value = te.compute( + data.shape, + lambda *indices: tir.if_then_else( + out_dtype.equal(SQNN_DTYPE_TO_CODE["int32"]), + _compute_intn("int32", value, *indices), + uint8_value[indices], + ), + ) + + return int32_value + + return te.compute(data.shape, lambda *indices: _dispatch_sim_quantize(data)[indices]) + + +@tvm.te.tag_scope(tag=topi.tag.ELEMWISE) +def simulated_dequantize(data, in_dtype, input_scale=None, input_zero_point=None, axis=-1): + """Simulated QNN dequantize operator that mimics QNN outputs without changing datatype. + The benefit of this operator over true QNN dequantize is that this operator allows dynamic + datatype selection and can operate on both per-channel and scalar scales and zero points while + QNN dequantize requires both of these to be fixed at compile time. + + Parameters + ---------- + data: tvm.te.Tensor + An N-D input tensor to the operator. + + in_dtype: tvm.te.Tensor + A scalar variable that indicates which datatype to simulate dequantization with. Use + SQNN_DTYPE_TO_CODE to convert a dtype string into the corresponding variable + value. + + input_scale: tvm.te.Tensor, optional + A scalar tensor representing the scale to use when dequantizing from integer datatypes. + When it contains more than a single value, N must match the number of channels in data. + + input_zero_point: tvm.te.Tensor, optional + A 1-D tensor representing the zero point to use when dequantizing from integer datatypes. + When it contains more than a single value, N must match the number of channels in data. + + axis: int, optional + The channel axis for quantization. Default value is -1 which corresponds to the last axis. + + """ + # When disabled simply return the input tensor. + def _compute_pass_through(value, *indices): + return value[indices] + + # Simulate dequantization for arbitrary integer datatypes. The computation for all datatypes is: + # DQ_output = (input - zero_point) * scale + def _compute_intn(value, *indices): + assert input_scale is not None and input_zero_point is not None + # Use indexmod to handle both scalar and per-channel QNN parameters. + scale_idx = tir.indexmod(indices[axis], topi.shape(input_scale)[0]) + zp_idx = tir.indexmod(indices[axis], topi.shape(input_zero_point)[0]) + return (value[indices] - input_zero_point[zp_idx]) * input_scale[scale_idx] + + # Use an if chain to dynamically return the proper dequantization based on the input datatype. + # This allows the op to compile once but apply different quantization approaches + # using a variable datatype input. 
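+    # For example, with in_dtype == SQNN_INT8, input_scale = 0.5 and
+    # input_zero_point = 10 (illustrative values), a quantized value of 16
+    # dequantizes to (16 - 10) * 0.5 = 3.0, inverting the quantize formula.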
+ def _dispatch_sim_dequantize(value): + pass_through_value = te.compute( + data.shape, lambda *indices: _compute_pass_through(value, *indices) + ) + intn_condition = tvm.te.any( + in_dtype.equal(SQNN_DTYPE_TO_CODE["int8"]), + in_dtype.equal(SQNN_DTYPE_TO_CODE["uint8"]), + in_dtype.equal(SQNN_DTYPE_TO_CODE["int32"]), + ) + intn_value = te.compute( + data.shape, + lambda *indices: tir.if_then_else( + intn_condition, + _compute_intn(value, *indices), + pass_through_value[indices], + ), + ) + + return intn_value + + return te.compute(data.shape, lambda *indices: _dispatch_sim_dequantize(data)[indices]) diff --git a/src/relay/qnn/op/dequantize.cc b/src/relay/qnn/op/dequantize.cc index 724441e0c5231..b0fe9356a758a 100644 --- a/src/relay/qnn/op/dequantize.cc +++ b/src/relay/qnn/op/dequantize.cc @@ -53,7 +53,7 @@ bool DequantizeRel(const Array& types, int num_inputs, const Attrs& attrs, const auto* dequantize_attrs = attrs.as(); int axis = dequantize_attrs->axis; - axis = (axis == -1) ? data->shape.size() - 1 : axis; + axis = (axis < 0) ? data->shape.size() + axis : axis; ICHECK_LT(axis, static_cast(data->shape.size())) << "axis " << dequantize_attrs->axis << " is out of range"; ICHECK_GE(axis, 0) << "axis " << dequantize_attrs->axis << " is out of range"; @@ -81,7 +81,7 @@ Expr MakeDequantize(Expr data, Expr input_scale, Expr input_zero_point, int axis Expr DequantizeLower(const Expr& input_tensor, const Expr& input_scale, const Expr& input_zero_point, const Array& types, const DequantizeAttrs* attrs) { - const auto axis = attrs->axis; + auto axis = attrs->axis; ICHECK_EQ(types.size(), 4); auto in_type = types[0]; @@ -92,6 +92,11 @@ Expr DequantizeLower(const Expr& input_tensor, const Expr& input_scale, size_t n_dim = input_shape.size(); + // Wrap axis from negative to positive if needed. + if (axis < 0) { + axis = static_cast(n_dim) + axis; + } + // Expand scale and zero point if the input tensor is channel quantized auto expanded_input_scale = input_scale; if (!IsConstScalar(input_scale) && !IsScalarType(types[1])) { diff --git a/src/relay/qnn/op/quantize.cc b/src/relay/qnn/op/quantize.cc index 9829834f43a3b..751abfc5ca81d 100644 --- a/src/relay/qnn/op/quantize.cc +++ b/src/relay/qnn/op/quantize.cc @@ -19,8 +19,8 @@ /*! * \file src/relay/qnn/op/quantize.cc - * \brief QNN dequantize operator. Dequantize operator converts from quantized - * domain to unquantized domain. + * \brief QNN quantize operator. Quantize operator converts from unquantized + * domain to quantized domain. */ #include @@ -51,7 +51,7 @@ bool QuantizeRel(const Array& types, int num_inputs, const Attrs& attrs, const auto* quantize_attrs = attrs.as(); int axis = quantize_attrs->axis; - axis = (axis == -1) ? data->shape.size() - 1 : axis; + axis = (axis < 0) ? data->shape.size() + axis : axis; ICHECK_LT(axis, static_cast(data->shape.size())) << "axis " << quantize_attrs->axis << " is out of range"; ICHECK_GE(axis, 0) << "axis " << quantize_attrs->axis << " is out of range"; @@ -93,10 +93,15 @@ Expr QuantizeLower(const Expr& input_tensor, const Expr& output_scale, Array input_shape = in_tensor_type->shape; const auto out_dtype = attrs->out_dtype; - const auto axis = attrs->axis; + auto axis = attrs->axis; size_t n_dim = input_shape.size(); + // Wrap axis from negative to positive if needed. 
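+  // For example, with a rank-4 input (n_dim = 4), axis = -1 becomes axis = 3.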
+ if (axis < 0) { + axis = static_cast(n_dim) + axis; + } + auto expanded_output_scale = output_scale; if (!IsConstScalar(output_scale) && !IsScalarType(types[1])) { expanded_output_scale = ExpandBiasToMatchAxis(output_scale, n_dim, {axis}); diff --git a/src/relay/qnn/op/simulated_dequantize.cc b/src/relay/qnn/op/simulated_dequantize.cc new file mode 100644 index 0000000000000..e1fc47d700c9b --- /dev/null +++ b/src/relay/qnn/op/simulated_dequantize.cc @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/qnn/op/simulated_dequantize.cc + * \brief QNN simulated dequantize operator. Mimics the behavior + * of QNN dequantize in floating point with added flexibility. + */ + +#include +#include +#include + +#include "../../transforms/pattern_utils.h" +#include "../utils.h" + +namespace tvm { +namespace relay { +namespace qnn { + +bool SimulatedDequantizeRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + // types = [data_type, datatype_type, scale_type, zp_type, ret_type] + ICHECK_EQ(types.size(), 5); + const auto* data = types[0].as(); + const auto* dtype = types[1].as(); + + if ((data == nullptr) || (dtype == nullptr)) { + return false; + } + + // assign output type + reporter->Assign(types[4], TensorType(data->shape, data->dtype)); + return true; +} + +Expr MakeSimulatedDequantize(Expr data, Expr in_dtype, Expr input_scale, Expr input_zero_point, + int axis) { + auto attrs = make_object(); + attrs->axis = axis; + static const Op& op = Op::Get("qnn.simulated_dequantize"); + return Call(op, {data, in_dtype, input_scale, input_zero_point}, Attrs(attrs), {}); +} + +RELAY_REGISTER_OP("qnn.simulated_dequantize") + .describe(R"code(Simulates the functionality of qnn.dequantize but allows more flexible + dynamic input type conversion and always operates on float values. 
+)code" TVM_ADD_FILELINE) + .set_attrs_type() + .set_num_inputs(4) + .add_argument("data", "Tensor", "The tensor to dequantize.") + .add_argument("in_dtype", "Tensor", + "A code corresponding to the type of quantization to convert from.") + .add_argument("input_scale", "Tensor", "The quantization scale of the input tensor.") + .add_argument("input_zero_point", "Tensor", "The quantization zero_point of the input tensor.") + .set_support_level(11) + .add_type_rel("QNNSimulatedDequantize", SimulatedDequantizeRel); + +TVM_REGISTER_GLOBAL("relay.qnn.op._make.simulated_dequantize") + .set_body_typed(MakeSimulatedDequantize); + +} // namespace qnn +} // namespace relay +} // namespace tvm diff --git a/src/relay/qnn/op/simulated_quantize.cc b/src/relay/qnn/op/simulated_quantize.cc new file mode 100644 index 0000000000000..089762a6ade0d --- /dev/null +++ b/src/relay/qnn/op/simulated_quantize.cc @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/qnn/op/simulated_quantize.cc + * \brief QNN simulated quantize operator. Mimics the behavior + * of QNN quantize in floating point with added flexibility. + */ + +#include +#include +#include + +#include "../../transforms/pattern_utils.h" +#include "../utils.h" + +namespace tvm { +namespace relay { +namespace qnn { + +TVM_REGISTER_NODE_TYPE(SimulatedQuantizeAttrs); + +bool SimulatedQuantizeRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + // types = [data_type, datatype_type, scale_type, zp_type, ret_type] + ICHECK_EQ(types.size(), 5); + const auto* data = types[0].as(); + const auto* dtype = types[1].as(); + + if ((data == nullptr) || (dtype == nullptr)) { + return false; + } + + // assign output type + reporter->Assign(types[4], TensorType(data->shape, data->dtype)); + return true; +} + +Expr MakeSimulatedQuantize(Expr data, Expr out_dtype, Expr output_scale, Expr output_zero_point, + int axis) { + auto attrs = make_object(); + attrs->axis = axis; + static const Op& op = Op::Get("qnn.simulated_quantize"); + return Call(op, {data, out_dtype, output_scale, output_zero_point}, Attrs(attrs), {}); +} + +RELAY_REGISTER_OP("qnn.simulated_quantize") + .describe(R"code(Simulates the functionality of qnn.quantize but allows more flexible + dynamic input type conversion and always outputs float values. 
+)code" TVM_ADD_FILELINE) + .set_attrs_type() + .set_num_inputs(4) + .add_argument("data", "Tensor", "The tensor to quantize.") + .add_argument("out_dtype", "Tensor", + "A code corresponding to the type of quantization to apply.") + .add_argument("output_scale", "Tensor", "The quantization scale of the output tensor.") + .add_argument("output_zero_point", "Tensor", + "The quantization zero_point of the output tensor.") + .set_support_level(11) + .add_type_rel("QNNSimulatedQuantize", SimulatedQuantizeRel); + +TVM_REGISTER_GLOBAL("relay.qnn.op._make.simulated_quantize").set_body_typed(MakeSimulatedQuantize); + +} // namespace qnn +} // namespace relay +} // namespace tvm diff --git a/tests/python/relay/test_op_qnn_dequantize.py b/tests/python/relay/test_op_qnn_dequantize.py index e7fb161a13cb9..1833458fdb755 100644 --- a/tests/python/relay/test_op_qnn_dequantize.py +++ b/tests/python/relay/test_op_qnn_dequantize.py @@ -98,7 +98,7 @@ def test_channelwise_axis_1(): } dequantize_test_driver( - in_dtype="uint8", quant_args=quant_args, in_data=data, verify_output_data=output, axis=1 + in_dtype="uint8", quant_args=quant_args, in_data=data, verify_output_data=output, axis=-1 ) diff --git a/tests/python/relay/test_op_qnn_quantize.py b/tests/python/relay/test_op_qnn_quantize.py index 2ef298679904c..b300c56121747 100644 --- a/tests/python/relay/test_op_qnn_quantize.py +++ b/tests/python/relay/test_op_qnn_quantize.py @@ -127,7 +127,7 @@ def test_channelwise_axis_1(): quantize_test_driver( in_dtype="float32", quant_args=quant_args, - axis=1, + axis=-1, out_dtype="uint8", in_data=data, verify_output_data=output, diff --git a/tests/python/relay/test_op_qnn_simulated_dequantize.py b/tests/python/relay/test_op_qnn_simulated_dequantize.py new file mode 100644 index 0000000000000..0cc04e4998eb2 --- /dev/null +++ b/tests/python/relay/test_op_qnn_simulated_dequantize.py @@ -0,0 +1,177 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import tvm +from tvm import te +import numpy as np +from tvm import relay +from tvm.contrib import graph_runtime +from tvm.runtime.vm import VirtualMachine +from tvm.topi.nn.qnn import SQNN_DTYPE_TO_CODE + + +def dequantize_test_driver(in_dtype, quant_args, axis, in_data): + shape = in_data.shape + input_data = relay.var("input_data", shape=shape, dtype=in_dtype) + input_zero_point = relay.const(quant_args["in_zero_point"]) + input_scale = relay.const(quant_args["in_scale"]) + dequantized_output = relay.qnn.op.dequantize( + input_data, + input_scale=input_scale, + input_zero_point=input_zero_point, + axis=axis, + ) + mod = relay.Function(relay.analysis.free_vars(dequantized_output), dequantized_output) + mod = tvm.IRModule.from_expr(mod) + with tvm.transform.PassContext(opt_level=3): + graph, lib, params = relay.build(mod, "llvm", params=None) + rt_mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0)) + rt_mod.set_input(input_data=in_data) + rt_mod.set_input(**params) + rt_mod.run() + res = rt_mod.get_output(0).asnumpy() + return res + + +def build_simulated_dequantize(input_data, scale, zp, dtype, axis=-1): + sim_q = relay.qnn.op.simulated_dequantize( + input_data, + scale, + zp, + axis=axis, + in_dtype=dtype, + ) + mod = tvm.IRModule.from_expr(sim_q) + with tvm.transform.PassContext(opt_level=3): + vm_exec = relay.vm.compile(mod, "llvm", params=None) + vm = VirtualMachine(vm_exec, tvm.cpu(0)) + return vm + + +def verify_simulated_dequantize_simple(dtype): + data = np.random.uniform(low=-128, high=127, size=[2, 5]).astype(dtype) + data_fp = data.astype("float32") + scale_np = np.float32(0.5) + zp_np = np.int32(127) + dtype_np = np.int32(SQNN_DTYPE_TO_CODE[dtype]) + quant_args = {"in_zero_point": zp_np, "in_scale": scale_np} + dq_out = dequantize_test_driver( + in_dtype=dtype, + quant_args=quant_args, + axis=-1, + in_data=data, + ) + input_data = relay.var("input_data", shape=data.shape, dtype="float32") + scale = relay.var("scale", shape=[]) + zp = relay.var("zp", shape=[]) + dtype = relay.var("dtype", shape=[]) + vm = build_simulated_dequantize(input_data, scale, zp, dtype) + sim_dq_out = vm.invoke("main", input_data=data_fp, scale=scale_np, zp=zp_np, dtype=dtype_np) + np.testing.assert_equal(sim_dq_out.asnumpy(), dq_out) + + +def test_simulated_dequantize(): + verify_simulated_dequantize_simple("uint8") + verify_simulated_dequantize_simple("int8") + verify_simulated_dequantize_simple("int32") + + +def test_dynamic_channels(): + # Compile simulated quantize once but support either per-channel or scalar params. + data = np.random.uniform(low=-64, high=64, size=[2, 5]).astype("int8") + data_fp = data.astype("float32") + # Test scalar qnn params. + scale_np = np.asarray([0.5]).astype("float32") + zp_np = np.asarray([0]).astype("int32") + dtype_np = np.int32(SQNN_DTYPE_TO_CODE["int8"]) + quant_args = {"in_zero_point": zp_np[0], "in_scale": scale_np[0]} + dq_out = dequantize_test_driver( + in_dtype="int8", + quant_args=quant_args, + axis=0, + in_data=data, + ) + # Create variables with undefined shape and run with scalar inputs. 
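+    # shape=[relay.Any()] keeps the scale/zp length symbolic, so the VM compiled
+    # below accepts the length-1 (scalar) arrays here and the length-2 per-channel
+    # arrays later in this test without recompilation.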
+ input_data = relay.var("input_data", shape=data.shape, dtype="float32") + scale = relay.var("scale", shape=[relay.Any()], dtype="float32") + zp = relay.var("zp", shape=[relay.Any()], dtype="int32") + dtype = relay.var("dtype", shape=[]) + vm = build_simulated_dequantize(input_data, scale, zp, dtype, axis=0) + sim_dq_out = vm.invoke("main", input_data=data_fp, scale=scale_np, zp=zp_np, dtype=dtype_np) + np.testing.assert_equal(sim_dq_out.asnumpy(), dq_out) + + # Now get the perchannel quantize output and compare without recompiling. + scale_np = np.array([0.5, 0.25]).astype("float32") + zp_np = np.array([127, 123]).astype("int32") + + # Get the reference quantize output. + quant_args = {"in_zero_point": zp_np, "in_scale": scale_np} + dq_out = dequantize_test_driver( + in_dtype="int8", + quant_args=quant_args, + axis=0, + in_data=data, + ) + # Run the simulated quantize without recompiling and confirm results match. + sim_dq_out = vm.invoke("main", input_data=data_fp, scale=scale_np, zp=zp_np, dtype=dtype_np) + np.testing.assert_equal(sim_dq_out.asnumpy(), dq_out) + + +def test_dynamic_dtype(): + # Compile simulated quantize once but support any type of quantization. + data = np.random.uniform(low=0, high=255, size=[2, 5]).astype("uint8") + data_fp = data.astype("float32") + # Test scalar uint8 to fp32. + scale_np = np.asarray([0.5]).astype("float32") + zp_np = np.asarray([127]).astype("int32") + dtype_np = np.int32(SQNN_DTYPE_TO_CODE["uint8"]) + quant_args = {"in_zero_point": zp_np[0], "in_scale": scale_np[0]} + dq_out = dequantize_test_driver( + in_dtype="uint8", + quant_args=quant_args, + axis=-1, + in_data=data, + ) + # Create variables with undefined shape and run with scalar inputs. + input_data = relay.var("input_data", shape=data.shape, dtype="float32") + scale = relay.var("scale", shape=[relay.Any()], dtype="float32") + zp = relay.var("zp", shape=[relay.Any()], dtype="int32") + dtype = relay.var("dtype", shape=[]) + vm = build_simulated_dequantize(input_data, scale, zp, dtype) + sim_dq_out = vm.invoke("main", input_data=data_fp, scale=scale_np, zp=zp_np, dtype=dtype_np) + np.testing.assert_equal(sim_dq_out.asnumpy(), dq_out) + + # Now test int8 to float32 compilation. + data = np.random.uniform(low=0, high=255, size=[2, 5]).astype("int8") + data_fp = data.astype("float32") + # Get the reference quantize output. + dq_out = dequantize_test_driver( + in_dtype="int8", + quant_args=quant_args, + axis=-1, + in_data=data, + ) + # Run the simulated quantize without recompiling and confirm results match. + dtype_np = np.int32(SQNN_DTYPE_TO_CODE["int8"]) + sim_dq_out = vm.invoke("main", input_data=data_fp, scale=scale_np, zp=zp_np, dtype=dtype_np) + np.testing.assert_equal(sim_dq_out.asnumpy(), dq_out) + + +if __name__ == "__main__": + test_simulated_dequantize() + test_dynamic_channels() + test_dynamic_dtype() diff --git a/tests/python/relay/test_op_qnn_simulated_quantize.py b/tests/python/relay/test_op_qnn_simulated_quantize.py new file mode 100644 index 0000000000000..ee4ba209dcb8f --- /dev/null +++ b/tests/python/relay/test_op_qnn_simulated_quantize.py @@ -0,0 +1,178 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import tvm +from tvm import te +import numpy as np +from tvm import relay +from tvm.contrib import graph_runtime +from tvm.runtime.vm import VirtualMachine +from tvm.topi.nn.qnn import SQNN_DTYPE_TO_CODE + + +def quantize_test_driver(in_dtype, quant_args, axis, out_dtype, in_data): + shape = in_data.shape + input_data = relay.var("input_data", shape=shape, dtype=in_dtype) + output_zero_point = relay.const(quant_args["out_zero_point"]) + output_scale = relay.const(quant_args["out_scale"]) + quantized_output = relay.qnn.op.quantize( + input_data, + output_scale=output_scale, + output_zero_point=output_zero_point, + axis=axis, + out_dtype=out_dtype, + ) + mod = relay.Function(relay.analysis.free_vars(quantized_output), quantized_output) + mod = tvm.IRModule.from_expr(mod) + with tvm.transform.PassContext(opt_level=3): + graph, lib, params = relay.build(mod, "llvm", params=None) + rt_mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0)) + rt_mod.set_input(input_data=in_data) + rt_mod.set_input(**params) + rt_mod.run() + res = rt_mod.get_output(0).asnumpy() + return res + + +def build_simulated_quantize(input_data, scale, zp, dtype, axis=-1): + sim_q = relay.qnn.op.simulated_quantize( + input_data, + scale, + zp, + axis=axis, + out_dtype=dtype, + ) + mod = tvm.IRModule.from_expr(sim_q) + with tvm.transform.PassContext(opt_level=3): + vm_exec = relay.vm.compile(mod, "llvm", params=None) + vm = VirtualMachine(vm_exec, tvm.cpu(0)) + return vm + + +def verify_simulated_quantize_simple(dtype): + data = np.random.uniform(low=-128, high=127, size=[2, 5]).astype("float32") + scale_np = np.float32(0.5) + zp_np = np.int32(127) + dtype_np = np.int32(SQNN_DTYPE_TO_CODE[dtype]) + quant_args = {"out_zero_point": zp_np, "out_scale": scale_np} + q_out = quantize_test_driver( + in_dtype="float32", + quant_args=quant_args, + axis=-1, + out_dtype=dtype, + in_data=data, + ) + input_data = relay.var("input_data", shape=data.shape, dtype="float32") + scale = relay.var("scale", shape=[]) + zp = relay.var("zp", shape=[]) + dtype = relay.var("dtype", shape=[]) + vm = build_simulated_quantize(input_data, scale, zp, dtype) + sim_q_out = vm.invoke("main", input_data=data, scale=scale_np, zp=zp_np, dtype=dtype_np) + np.testing.assert_equal(sim_q_out.asnumpy(), q_out) + + +def test_simulated_quantize(): + verify_simulated_quantize_simple("uint8") + verify_simulated_quantize_simple("int8") + verify_simulated_quantize_simple("int32") + + +def test_dynamic_channels(): + # Compile simulated quantize once but support either per-channel or scalar params. + data = np.random.uniform(low=-64, high=64, size=[2, 5]).astype("float32") + # Test scalar qnn params. + scale_np = np.asarray([0.5]).astype("float32") + zp_np = np.asarray([127]).astype("int32") + dtype_np = np.int32(SQNN_DTYPE_TO_CODE["uint8"]) + quant_args = {"out_zero_point": zp_np[0], "out_scale": scale_np[0]} + q_out = quantize_test_driver( + in_dtype="float32", + quant_args=quant_args, + axis=0, + out_dtype="uint8", + in_data=data, + ) + # Create variables with undefined shape and run with scalar inputs. 
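+    # shape=[relay.Any()] leaves the qnn param length symbolic, so the single VM
+    # compiled below serves both the scalar case here and the per-channel case
+    # that follows.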
+ input_data = relay.var("input_data", shape=data.shape, dtype="float32") + scale = relay.var("scale", shape=[relay.Any()], dtype="float32") + zp = relay.var("zp", shape=[relay.Any()], dtype="int32") + dtype = relay.var("dtype", shape=[]) + vm = build_simulated_quantize(input_data, scale, zp, dtype, axis=0) + sim_q_out = vm.invoke("main", input_data=data, scale=scale_np, zp=zp_np, dtype=dtype_np) + np.testing.assert_equal(sim_q_out.asnumpy(), q_out) + + # Now get the perchannel quantize output and compare without recompiling. + scale_np = np.array([0.5, 0.25]).astype("float32") + zp_np = np.array([127, 123]).astype("int32") + + # Get the reference quantize output. + quant_args = {"out_zero_point": zp_np, "out_scale": scale_np} + q_out = quantize_test_driver( + in_dtype="float32", + quant_args=quant_args, + axis=0, + out_dtype="uint8", + in_data=data, + ) + # Run the simulated quantize without recompiling and confirm results match. + sim_q_out = vm.invoke("main", input_data=data, scale=scale_np, zp=zp_np, dtype=dtype_np) + np.testing.assert_equal(sim_q_out.asnumpy(), q_out) + + +def test_dynamic_dtype(): + # Compile simulated quantize once but support any type of quantization. + data = np.random.uniform(low=-64, high=64, size=[2, 5]).astype("float32") + # Test scalar float32 to uint8. + scale_np = np.asarray([0.5]).astype("float32") + zp_np = np.asarray([127]).astype("int32") + dtype_np = np.int32(SQNN_DTYPE_TO_CODE["uint8"]) + quant_args = {"out_zero_point": zp_np[0], "out_scale": scale_np[0]} + q_out = quantize_test_driver( + in_dtype="float32", + quant_args=quant_args, + axis=-1, + out_dtype="uint8", + in_data=data, + ) + # Create variables with undefined shape and run with scalar inputs. + input_data = relay.var("input_data", shape=data.shape, dtype="float32") + scale = relay.var("scale", shape=[relay.Any()], dtype="float32") + zp = relay.var("zp", shape=[relay.Any()], dtype="int32") + dtype = relay.var("dtype", shape=[]) + vm = build_simulated_quantize(input_data, scale, zp, dtype) + sim_q_out = vm.invoke("main", input_data=data, scale=scale_np, zp=zp_np, dtype=dtype_np) + np.testing.assert_equal(sim_q_out.asnumpy(), q_out) + + # Now test float32 to int32 compilation. + # Get the reference quantize output. + q_out = quantize_test_driver( + in_dtype="float32", + quant_args=quant_args, + axis=-1, + out_dtype="int32", + in_data=data, + ) + # Run the simulated quantize without recompiling and confirm results match. + dtype_np = np.int32(SQNN_DTYPE_TO_CODE["int32"]) + sim_q_out = vm.invoke("main", input_data=data, scale=scale_np, zp=zp_np, dtype=dtype_np) + np.testing.assert_equal(sim_q_out.asnumpy(), q_out) + + +if __name__ == "__main__": + test_simulated_quantize() + test_dynamic_channels() + test_dynamic_dtype() diff --git a/tests/python/topi/python/test_topi_qnn.py b/tests/python/topi/python/test_topi_qnn.py new file mode 100644 index 0000000000000..a63f34fe08d09 --- /dev/null +++ b/tests/python/topi/python/test_topi_qnn.py @@ -0,0 +1,157 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Test code for QNN operators.""" +import numpy as np +import tvm +from tvm import topi, relay, te +from tvm.contrib import graph_runtime +import tvm.topi.testing + + +def verify_simulated_quantize(data_shape, out_dtype, channels, axis): + # Create placeholder variables for all qnn inputs. + A = te.placeholder(data_shape, name="value", dtype="float32") + D = te.placeholder([], name="dtype", dtype="int32") + S = te.placeholder([te.size_var("scale_dim")], name="scale", dtype="float32") + Z = te.placeholder([te.size_var("zp_dim")], name="zp", dtype="int32") + SIM_Q = topi.nn.simulated_quantize(A, D, output_scale=S, output_zero_point=Z, axis=axis) + + # Create random numpy values to assign to inputs. + a_np = np.random.uniform(size=data_shape).astype("float32") + d_np = np.int32(topi.nn.SQNN_DTYPE_TO_CODE[out_dtype]) + s_np = np.random.uniform(low=1e-4, high=0.1, size=channels).astype("float32") + z_np = np.random.uniform(low=-10, high=10, size=channels).astype("int32") + q_np = np.zeros(shape=data_shape, dtype="float32") + + def check_device(device, ctx): + # Wrap the numpy arrays in nd arrays. + a = tvm.nd.array(a_np, ctx) + d = tvm.nd.array(d_np, ctx) + s = tvm.nd.array(s_np, ctx) + z = tvm.nd.array(z_np, ctx) + q = tvm.nd.array(q_np, ctx) + + # Construct equivalent relay graph. + per_channel = channels[0] != 1 + a_var = relay.var("a", shape=data_shape, dtype="float32") + if per_channel: + s_var = relay.const(s_np) + z_var = relay.const(z_np) + else: + s_var = relay.const(s_np[0]) + z_var = relay.const(z_np[0]) + real_q_op = relay.qnn.op.quantize(a_var, s_var, z_var, axis=axis, out_dtype=out_dtype) + with tvm.transform.PassContext(opt_level=3): + lib = relay.build(tvm.IRModule.from_expr(real_q_op), target=device) + + # Get real qnn quantize output. + m = graph_runtime.GraphModule(lib["default"](ctx)) + m.set_input("a", a_np) + + m.run() + real_q_out = m.get_output(0) + + # Compile the simulated quantize function. + with tvm.target.Target(device): + sched = tvm.topi.testing.get_injective_schedule(device)(SIM_Q) + func = tvm.build(sched, [A, D, S, Z, SIM_Q], device, name="sim_quantize") + func(a, d, s, z, q) + + # Check correctness against the true qnn output. + tvm.testing.assert_allclose(q.asnumpy(), real_q_out.asnumpy().astype("float32")) + + for target, ctx in tvm.testing.enabled_targets(): + check_device(target, ctx) + + +def test_simulated_quantize(): + verify_simulated_quantize([1], "int8", [1], -1) + verify_simulated_quantize([2, 5], "int8", [5], 1) + verify_simulated_quantize([1, 32, 32, 32], "int8", [32], -1) + verify_simulated_quantize([1, 32, 32, 32], "uint8", [32], -2) + verify_simulated_quantize([2, 5], "int32", [5], 1) + + +def verify_simulated_dequantize(data_shape, in_dtype, channels, axis): + # Create placeholder variables for all qnn inputs. 
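+    # D is a 0-d dtype-code tensor, while S and Z use symbolic lengths
+    # (te.size_var), so one compiled kernel serves scalar and per-channel params.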
+ A = te.placeholder(data_shape, name="value", dtype="float32") + D = te.placeholder([], name="dtype", dtype="int32") + S = te.placeholder([te.size_var("scale_dim")], name="scale", dtype="float32") + Z = te.placeholder([te.size_var("zp_dim")], name="zp", dtype="int32") + SIM_DQ = topi.nn.simulated_dequantize(A, D, input_scale=S, input_zero_point=Z, axis=axis) + + # Create random numpy values to assign to inputs. + a_np = np.random.uniform(low=-128, high=127, size=data_shape).astype(in_dtype) + a_np_f = a_np.astype("float32") + d_np = np.int32(topi.nn.SQNN_DTYPE_TO_CODE[in_dtype]) + s_np = np.random.uniform(low=1e-4, high=0.1, size=channels).astype("float32") + z_np = np.random.uniform(low=-10, high=10, size=channels).astype("int32") + dq_np = np.zeros(shape=data_shape, dtype="float32") + + def check_device(device, ctx): + # Wrap the numpy arrays in nd arrays. + a = tvm.nd.array(a_np_f, ctx) + d = tvm.nd.array(d_np, ctx) + s = tvm.nd.array(s_np, ctx) + z = tvm.nd.array(z_np, ctx) + dq = tvm.nd.array(dq_np, ctx) + + # Construct equivalent relay graph. + per_channel = channels[0] != 1 + a_var = relay.var("a", shape=data_shape, dtype=in_dtype) + if per_channel: + s_var = relay.const(s_np) + z_var = relay.const(z_np) + else: + s_var = relay.const(s_np[0]) + z_var = relay.const(z_np[0]) + real_dq_op = relay.qnn.op.dequantize(a_var, s_var, z_var, axis=axis) + with tvm.transform.PassContext(opt_level=3): + lib = relay.build(tvm.IRModule.from_expr(real_dq_op), target=device) + + # Get real qnn quantize output. + m = graph_runtime.GraphModule(lib["default"](ctx)) + m.set_input("a", a_np) + + m.run() + real_dq_out = m.get_output(0) + + # Compile the simulated quantize function. + with tvm.target.Target(device): + sched = tvm.topi.testing.get_injective_schedule(device)(SIM_DQ) + func = tvm.build(sched, [A, D, S, Z, SIM_DQ], device, name="sim_quantize") + func(a, d, s, z, dq) + + # Check correctness against the true qnn output. + tvm.testing.assert_allclose(dq.asnumpy(), real_dq_out.asnumpy().astype("float32")) + + for target, ctx in tvm.testing.enabled_targets(): + check_device(target, ctx) + + +def test_simulated_dequantize(): + verify_simulated_dequantize([1], "int8", [1], -1) + verify_simulated_dequantize([2, 5], "int8", [5], 1) + verify_simulated_dequantize([2, 5], "int8", [2], 0) + verify_simulated_dequantize([1, 32, 32, 32], "int8", [32], -1) + verify_simulated_dequantize([1, 32, 32, 32], "uint8", [32], -2) + verify_simulated_dequantize([2, 5], "int32", [5], 1) + + +if __name__ == "__main__": + test_simulated_quantize() + test_simulated_dequantize() From 68b81ade90d8011d4f2cdbdaf0230f4f9d49196f Mon Sep 17 00:00:00 2001 From: apeskov Date: Fri, 12 Mar 2021 04:54:35 +0300 Subject: [PATCH 10/84] Introduce Apple BNNS backend (#7299) * Introduce Apple BNNS backend This is simple JSON based runtime which offload execution of some operation into Accelerate frameworks via BNNS api. Works only for: * macOS 11.0 and later * iOS 14.0 and later Supported primitives: * conv2d and fusing with bias and relu * dense and fusing with bias and relu/gelu * batch_matmul Signed-off-by: Alexander Peskov * [BNNS] Add conv2d DW test Also fix some pylint issues Signed-off-by: Alexander Peskov * [BNNS] Fix clang-format issues Signed-off-by: Alexander Peskov * [BNNS] Refactoring. 
Add TView abstraction Signed-off-by: Alexander Peskov * [BNNS] Add several more onnx topologies into tests Signed-off-by: Alexander Peskov * [BNNS] Avoid redundant tensor allocation Signed-off-by: Alexander Peskov * [BNNS] Fix conv_splitter issue Signed-off-by: Alexander Peskov * [BNNS] Fix isse with bias {1,1,1,1} Signed-off-by: Alexander Peskov * [BNNS] Min. Rename file Signed-off-by: Alexander Peskov * Fix review comments. Initial Signed-off-by: Alexander Peskov * [BNNS] test refactoring Signed-off-by: Alexander Peskov * [BNNS] Fix cpplint issues Signed-off-by: Alexander Peskov * [BNNS] Fix clang-format issues Signed-off-by: Alexander Peskov * Fix python format Signed-off-by: Alexander Peskov * Fix pylint issues Signed-off-by: Alexander Peskov * [BNNS] Fix pylint. Second attempt Signed-off-by: Alexander Peskov * [BNNS] Add integration documentation * Check onnx import before use Signed-off-by: Alexander Peskov * [BNNS] Add instance normalization operator * Add fusing sigmoid activation after conv2d * min changes Signed-off-by: Alexander Peskov * Add pooling operations to BNNS runtime Supports `nn.max_pool2d`, `nn.avg_pool2d`, `nn.global_max_pool2d` and `nn.global_avg_pool2d` operations * Fix lint * Fix lint * Apply comments * Fix documentation * Fix comment to refer to BNNS Co-authored-by: dlexplorer Co-authored-by: Egor Churaev --- CMakeLists.txt | 2 + cmake/config.cmake | 3 + cmake/modules/contrib/BNNS.cmake | 30 + docs/deploy/bnns.rst | 183 ++++++ docs/deploy/index.rst | 1 + python/tvm/driver/tvmc/composite_target.py | 5 + python/tvm/relay/op/contrib/__init__.py | 1 + python/tvm/relay/op/contrib/bnns.py | 327 ++++++++++ src/relay/backend/contrib/bnns/codegen.cc | 215 +++++++ src/runtime/contrib/bnns/bnns_json_runtime.cc | 573 ++++++++++++++++++ src/runtime/contrib/bnns/bnns_wrp.h | 495 +++++++++++++++ tests/cpp/contrib/bnns.cc | 307 ++++++++++ tests/python/contrib/test_bnns/__init__.py | 17 + .../contrib/test_bnns/infrastructure.py | 330 ++++++++++ tests/python/contrib/test_bnns/test_conv2d.py | 177 ++++++ .../contrib/test_bnns/test_conv2d_patterns.py | 107 ++++ tests/python/contrib/test_bnns/test_dense.py | 190 ++++++ tests/python/contrib/test_bnns/test_matmul.py | 113 ++++ .../contrib/test_bnns/test_normalization.py | 201 ++++++ .../contrib/test_bnns/test_onnx_topologies.py | 140 +++++ .../python/contrib/test_bnns/test_pooling.py | 289 +++++++++ 21 files changed, 3706 insertions(+) create mode 100644 cmake/modules/contrib/BNNS.cmake create mode 100644 docs/deploy/bnns.rst create mode 100644 python/tvm/relay/op/contrib/bnns.py create mode 100644 src/relay/backend/contrib/bnns/codegen.cc create mode 100644 src/runtime/contrib/bnns/bnns_json_runtime.cc create mode 100644 src/runtime/contrib/bnns/bnns_wrp.h create mode 100644 tests/cpp/contrib/bnns.cc create mode 100644 tests/python/contrib/test_bnns/__init__.py create mode 100644 tests/python/contrib/test_bnns/infrastructure.py create mode 100644 tests/python/contrib/test_bnns/test_conv2d.py create mode 100644 tests/python/contrib/test_bnns/test_conv2d_patterns.py create mode 100644 tests/python/contrib/test_bnns/test_dense.py create mode 100644 tests/python/contrib/test_bnns/test_matmul.py create mode 100644 tests/python/contrib/test_bnns/test_normalization.py create mode 100644 tests/python/contrib/test_bnns/test_onnx_topologies.py create mode 100644 tests/python/contrib/test_bnns/test_pooling.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 56170c693e3c4..1d2e2bcb68c4b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -74,6 
+74,7 @@ tvm_option(USE_CPP_RPC "Build CPP RPC" OFF) tvm_option(USE_TFLITE "Build with tflite support" OFF) tvm_option(USE_TENSORFLOW_PATH "TensorFlow root path when use TFLite" none) tvm_option(USE_COREML "Build with coreml support" OFF) +tvm_option(USE_BNNS "Build with BNNS support" OFF) tvm_option(USE_TARGET_ONNX "Build with ONNX Codegen support" OFF) tvm_option(USE_ARM_COMPUTE_LIB "Build with Arm Compute Library" OFF) tvm_option(USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME "Build with Arm Compute Library graph runtime" OFF) @@ -354,6 +355,7 @@ include(cmake/modules/contrib/HybridDump.cmake) include(cmake/modules/contrib/TFLite.cmake) include(cmake/modules/contrib/TF_TVMDSOOP.cmake) include(cmake/modules/contrib/CoreML.cmake) +include(cmake/modules/contrib/BNNS.cmake) include(cmake/modules/contrib/ONNX.cmake) include(cmake/modules/contrib/ArmComputeLib.cmake) include(cmake/modules/contrib/TensorRT.cmake) diff --git a/cmake/config.cmake b/cmake/config.cmake index 30c21f707c08b..67370c6352099 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -272,3 +272,6 @@ set(USE_HEXAGON_SDK /path/to/sdk) # Whether to use ONNX codegen set(USE_TARGET_ONNX OFF) + +# Whether enable BNNS runtime +set(USE_BNNS OFF) diff --git a/cmake/modules/contrib/BNNS.cmake b/cmake/modules/contrib/BNNS.cmake new file mode 100644 index 0000000000000..e14aa2857ebcc --- /dev/null +++ b/cmake/modules/contrib/BNNS.cmake @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +if(USE_BNNS STREQUAL "ON") + add_definitions(-DUSE_JSON_RUNTIME=1) + file(GLOB BNNS_RELAY_CONTRIB_SRC src/relay/backend/contrib/bnns/*.cc) + list(APPEND COMPILER_SRCS ${BNNS_RELAY_CONTRIB_SRC}) + list(APPEND COMPILER_SRCS ${JSON_RELAY_CONTRIB_SRC}) + + list(APPEND TVM_RUNTIME_LINKER_LIBS "-framework Accelerate") + + file(GLOB BNNS_CONTRIB_SRC src/runtime/contrib/bnns/*.cc) + list(APPEND RUNTIME_SRCS ${BNNS_CONTRIB_SRC}) + message(STATUS "Build with BNNS JSON runtime: " ${EXTERN_LIBRARY_BNNS}) +endif() + diff --git a/docs/deploy/bnns.rst b/docs/deploy/bnns.rst new file mode 100644 index 0000000000000..cb15a4f3bd545 --- /dev/null +++ b/docs/deploy/bnns.rst @@ -0,0 +1,183 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. 
Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
+Relay BNNS Integration
+======================
+**Author**: `Egor Churaev `_
+
+Introduction
+------------
+
+The Apple BNNS library is a collection of functions that can be used to construct neural
+networks for inference (and training). It is supported on macOS, iOS, tvOS, and watchOS.
+BNNS provides primitives that execute on all CPUs supported by those platforms and that are
+optimized for high performance and low energy consumption. This integration offloads as many
+operators as possible from Relay to BNNS.
+
+The BNNS runtime is a part of the platform API and is available on all modern Apple operating
+systems. Applications using BNNS do not depend on any additional external libraries.
+
+BNNS functions use private Apple hardware capabilities that are not otherwise exposed, for
+example the AMX CPU extension.
+
+This guide demonstrates how to build TVM with BNNS codegen and runtime enabled. It also gives
+example code to compile and run models using the BNNS runtime. Finally, we document the
+supported operators.
+
+Building TVM with BNNS support
+------------------------------
+
+To turn on TVM BNNS codegen and the TVM BNNS runtime you only need to enable the USE_BNNS flag:
+
+* USE_BNNS=ON/OFF - This flag enables compiling a network with subgraphs offloaded to BNNS
+  primitives and links the TVM library to the BNNS runtime module.
+
+Enabling this flag makes the build search for the default Accelerate framework in the current
+target SDK. The minimum required SDK versions are macOS 11.0, iOS 14.0, tvOS 14.0, and
+watchOS 7.0.
+
+Example setting in the config.cmake file:
+
+.. code:: cmake
+
+   set(USE_BNNS ON)
+
+BNNS partitioning of Relay graph
+--------------------------------
+
+Operations to be offloaded to BNNS must be annotated before the module is passed for
+compilation. All ops annotated by `partition_for_bnns` will be offloaded for BNNS execution;
+the rest of the ops will go through the regular LLVM compilation and code generation.
+
+Important note: BNNS supports primitives only with constant weights. To satisfy this
+requirement we have to map constants to the related tensor abstractions in the Relay
+representation. To freeze tensors and operate on them as constants you may need to call the
+ONNX importer with the special flag "freeze_params=True" or perform the binding manually.
+In general, Relay importers don't do that by default. For your convenience,
+"partition_for_bnns" can do this for you if a params dictionary is passed as an argument.
+
+.. code:: python
+
+   from tvm.relay.op.contrib.bnns import partition_for_bnns
+   model = partition_for_bnns(model, params=params)
+
+
+Input data layout for operations to be offloaded to BNNS execution
+------------------------------------------------------------------
+
+BNNS kernels support only the planar format of input data. The partitioner requires the NCHW
+input layout for conv2d inputs.
+
+To use the BNNS integration for models with an interleaved input layout, convert them before
+passing the module to `partition_for_bnns`. The layout conversion happens only for explicitly
+enumerated op types. Depending on the topology, there may be regular data reorders around
+conv2d between the interleaved and planar layouts. These reorders carry a performance penalty
+and increase execution time. It is recommended to analyze the whole topology and extend the
+list below so that all intermediate tensors are converted to the NCHW data layout.
+
+Example of changing the input layout:
+
+.. code:: python
+
+   # For models with NHWC input layout
+   with tvm.transform.PassContext(opt_level=3):
+       mod = relay.transform.InferType()(mod)
+       mod = relay.transform.ConvertLayout({"nn.conv2d": ["NCHW", "default"],
+                                            "nn.bias_add": ["NCHW", "default"],
+                                            "nn.relu": ["NCHW"]})(mod)
+
+
+Example: Build and Deploy Mobilenet v2 1.0 with BNNS
+----------------------------------------------------
+
+Create a Relay graph from an MXNet Mobilenet v2 1.0 model.
+
+.. code:: python
+
+   import tvm
+   from tvm import relay
+   import mxnet
+   from mxnet.gluon.model_zoo.vision import get_model
+
+   dtype = "float32"
+   input_shape = (1, 3, 224, 224)
+   block = get_model('mobilenetv2_1.0', pretrained=True)
+   model, params = relay.frontend.from_mxnet(block, shape={'data': input_shape}, dtype=dtype)
+
+
+Mark up the parts of the graph to be offloaded to BNNS primitives. All ops supported by the
+BNNS integration will be handled by BNNS invocations; the rest of the ops will go through the
+regular TVM LLVM compilation and code generation.
+
+After that, compile the new module with a target corresponding to the required Apple platform:
+
+.. code:: python
+
+   from tvm.relay.op.contrib.bnns import partition_for_bnns
+
+   # target for macOS Big Sur 11.1:
+   target = "llvm -mtriple=x86_64-apple-darwin20.2.0"
+
+   model = partition_for_bnns(model, params=params)  # mark up ops to be offloaded to BNNS
+   with tvm.transform.PassContext(opt_level=3):
+       lib = relay.build(model, target=target, target_host=target, params=params)
+
+Export the module.
+
+.. code:: python
+
+   lib.export_library('compiled.dylib')
+
+
+Load the module and run inference on the target machine with TVM built with ``USE_BNNS``
+enabled.
+
+.. code:: python
+
+   import tvm
+   import numpy as np
+   from tvm.contrib import graph_runtime
+
+   ctx = tvm.cpu(0)
+   loaded_lib = tvm.runtime.load_module('compiled.dylib')
+   gen_module = tvm.contrib.graph_runtime.GraphModule(loaded_lib['default'](ctx))
+
+   dtype = "float32"
+   input_shape = (1, 3, 224, 224)
+   input_data = np.random.uniform(0, 1, input_shape).astype(dtype)
+   gen_module.run(data=input_data)
+
+
+Operator support
+----------------
+
++------------------------+------------------------------------------------------------------------------+
+| Relay Node             | Remarks                                                                      |
++========================+==============================================================================+
+| nn.conv2d              |                                                                              |
++------------------------+------------------------------------------------------------------------------+
+| nn.batch_norm          | Supported by BNNS integration only in nn.conv2d-batch_norm pattern           |
++------------------------+------------------------------------------------------------------------------+
+| nn.dense               |                                                                              |
++------------------------+------------------------------------------------------------------------------+
+| nn.batch_matmul        |                                                                              |
++------------------------+------------------------------------------------------------------------------+
+| nn.bias_add            | Supported by BNNS integration only as a bias part of nn.conv2d or nn.dense   |
+|                        | fusion                                                                       |
++------------------------+------------------------------------------------------------------------------+
+| add                    | Supported by BNNS integration only as a bias part of nn.conv2d or nn.dense   |
+|                        | fusion                                                                       |
++------------------------+------------------------------------------------------------------------------+
+| nn.relu                | Supported by BNNS integration only as a part of nn.conv2d or nn.dense fusion |
++------------------------+------------------------------------------------------------------------------+
+| nn.gelu                | Supported by BNNS integration only as a part of nn.conv2d or nn.dense fusion |
++------------------------+------------------------------------------------------------------------------+
diff --git a/docs/deploy/index.rst b/docs/deploy/index.rst
index 2b37f734c3c38..3cbbb10bd74b8 100644
--- a/docs/deploy/index.rst
+++ b/docs/deploy/index.rst
@@ -71,3 +71,4 @@ target device without relying on RPC. 
see the following resources on how to do s
    arm_compute_lib
    tensorrt
    vitis_ai
+   bnns
diff --git a/python/tvm/driver/tvmc/composite_target.py b/python/tvm/driver/tvmc/composite_target.py
index 0a25926856468..886160ad000c1 100644
--- a/python/tvm/driver/tvmc/composite_target.py
+++ b/python/tvm/driver/tvmc/composite_target.py
@@ -21,6 +21,7 @@

 from tvm.relay.op.contrib.arm_compute_lib import partition_for_arm_compute_lib
 from tvm.relay.op.contrib.ethosn import partition_for_ethosn
+from tvm.relay.op.contrib.bnns import partition_for_bnns

 from .common import TVMCException

@@ -40,6 +41,10 @@
         "config_key": "relay.ext.ethos-n.options",
         "pass_pipeline": partition_for_ethosn,
     },
+    "bnns": {
+        "config_key": None,
+        "pass_pipeline": partition_for_bnns,
+    },
 }

diff --git a/python/tvm/relay/op/contrib/__init__.py b/python/tvm/relay/op/contrib/__init__.py
index 49abf36134b4c..30c2db0ddf0b9 100644
--- a/python/tvm/relay/op/contrib/__init__.py
+++ b/python/tvm/relay/op/contrib/__init__.py
@@ -20,6 +20,7 @@

 from .arm_compute_lib import *
 from .dnnl import *
+from .bnns import *
 from .coreml import *
 from .ethosn import *
 from .tensorrt import *
diff --git a/python/tvm/relay/op/contrib/bnns.py b/python/tvm/relay/op/contrib/bnns.py
new file mode 100644
index 0000000000000..2ace502e6528e
--- /dev/null
+++ b/python/tvm/relay/op/contrib/bnns.py
@@ -0,0 +1,327 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name, unused-argument
+"""BNNS library supported operators.
+Part of the Accelerate framework on macOS/iOS platforms. Apple provides several
+APIs to handle tensor processing. Particularly:
+  * BNNS (basic neural network subroutines)
+  * vDSP (1D and 2D tensor processing)
+"""
+import math
+import tvm.ir
+
+from tvm.relay import transform
+from tvm.relay.expr import const
+from tvm.relay.build_module import bind_params_by_name
+
+from .register import register_pattern_table, get_pattern_table
+from ...dataflow_pattern import wildcard, is_op, is_expr
+
+
+def partition_for_bnns(mod, params=None):
+    """Partition the graph greedily, offloading supported
+    operators to BNNS.
+
+    Parameters
+    ----------
+    mod : Module
+        The module to run passes on.
+    params : Optional[Dict[str, NDArray]]
+        Constant input parameters.
+
+    Returns
+    -------
+    ret : Module
+        Annotated and partitioned module.
+    """
+    if params:
+        mod["main"] = bind_params_by_name(mod["main"], params)
+
+    seq = tvm.transform.Sequential(
+        [
+            transform.InferType(),
+            transform.FoldConstant(),
+            transform.FoldScaleAxis(),
+            transform.DynamicToStatic(),
+            transform.AlterOpLayout(),
+            # TODO(apeskov): WA. The AlterOpLayout call leads to constant shape
+            # transformations. Some expand_dims ops may appear after constants,
+            # which breaks BNNS fusing. So we have to call FoldConstant right
+            # before the BNNS composite passes.
+ transform.FoldConstant(), + transform.MergeComposite(get_pattern_table("bnns")), + transform.AnnotateTarget("bnns"), + # If you no need in per layer performance statistic you can + # uncomment next line + # transform.MergeCompilerRegions(), + transform.PartitionGraph(), + ] + ) + + return seq(mod) + + +def _register_external_op_helper(op_name, supported=True): + """The helper function to indicate that a given operator can be supported + by BNNS. + + Parameters + ---------- + op_name : Str + The name of supported operator that will be registered. + + Returns + ------- + f : callable + A function that returns if the operator is supported by BNNS. + """ + + @tvm.ir.register_op_attr(op_name, "target.bnns") + def _func_wrapper(expr): + return supported + + return _func_wrapper + + +_register_external_op_helper("nn.batch_matmul") + + +@tvm.ir.register_op_attr("nn.max_pool2d", "target.bnns") +def max_pool2d_check(expr): + """Check if the nn.max_pool2d can be executed in BNNS""" + attrs, args = expr.attrs, expr.args + data_typ = args[0].checked_type + rank = len(data_typ.shape) + if rank < 3 or rank > 4 or data_typ.dtype != "float32": + return False + if attrs.layout != "NCHW": + return False + return True + + +@tvm.ir.register_op_attr("nn.avg_pool2d", "target.bnns") +def avg_pool2d_check(expr): + """Check if the nn.avg_pool2d can be executed in BNNS""" + attrs, args = expr.attrs, expr.args + data_typ = args[0].checked_type + rank = len(data_typ.shape) + if rank < 3 or rank > 4 or data_typ.dtype != "float32": + return False + if attrs.layout != "NCHW": + return False + return True + + +@tvm.ir.register_op_attr("nn.global_max_pool2d", "target.bnns") +def global_max_pool2d_check(expr): + """Check if the nn.global_max_pool2d can be executed in BNNS""" + attrs, args = expr.attrs, expr.args + data_typ = args[0].checked_type + rank = len(data_typ.shape) + if rank < 3 or rank > 4 or data_typ.dtype != "float32": + return False + if attrs.layout != "NCHW": + return False + return True + + +@tvm.ir.register_op_attr("nn.global_avg_pool2d", "target.bnns") +def global_avg_pool2d_check(expr): + """Check if the nn.global_avg_pool2d can be executed in BNNS""" + attrs, args = expr.attrs, expr.args + data_typ = args[0].checked_type + rank = len(data_typ.shape) + if rank < 3 or rank > 4 or data_typ.dtype != "float32": + return False + if attrs.layout != "NCHW": + return False + return True + + +def dtype_is_supported(dtype): + """Check if data type is supported by BNNS backend""" + return dtype in ("", "float32") + + +@tvm.ir.register_op_attr("nn.conv2d", "target.bnns") +def conv2d_check(expr): + """Check if the conv2d can be executed in BNNS""" + attrs, args = expr.attrs, expr.args + data_typ = args[0].checked_type + if len(data_typ.shape) != 4 or data_typ.dtype != "float32": + return False + if not isinstance(args[1], tvm.relay.expr.Constant): + return False + kernel_typ = args[1].checked_type + if len(kernel_typ.shape) != 4 or kernel_typ.dtype != "float32": + return False + if attrs.data_layout != "NCHW": + return False + if not dtype_is_supported(attrs.out_dtype): + return False + return True + + +def bias_check(expr): + """Check is bias added through the correct dimension""" + attrs, args = expr.attrs, expr.args + if not isinstance(args[1], tvm.relay.expr.Constant): + return False + if expr.op.name == "nn.bias_add": + return attrs.axis == 1 + if expr.op.name == "add": + b_shape = args[1].checked_type.shape + if len(b_shape) == 4: + return bool(b_shape[0] == 1 and b_shape[2] == 1 and b_shape[3] == 1) + if 
len(b_shape) == 3: + return bool(b_shape[1] == 1 and b_shape[2] == 1) + + return False + + +@tvm.ir.register_op_attr("nn.dense", "target.bnns") +def dense(expr): + """Check if the dense can be used in BNNS.""" + attrs, args = expr.attrs, expr.args + data_typ = args[0].checked_type + if data_typ.dtype != "float32": + return False + if not isinstance(args[1], tvm.relay.expr.Constant): + return False + kernel_typ = args[1].checked_type + if len(kernel_typ.shape) != 2 or kernel_typ.dtype != "float32": + return False + if attrs.out_dtype != "float32" and attrs.out_dtype != "": + return False + return True + + +def make_conv_pattern(with_bias=True, activation="none"): + """Make pattern for bnns.conv2d primitive""" + data = wildcard() + weight = wildcard() + bias = wildcard() + pat = is_op("nn.conv2d")(data, weight) + if with_bias: + pat = is_op("add")(pat, bias) | is_op("nn.bias_add")(pat, bias) + if activation == "relu": + pat = is_op("nn.relu")(pat) + elif activation == "sigmoid": + pat = is_op("sigmoid")(pat) + return pat + + +def check_conv(extract): + """Check conv pattern is supported by BNNS.""" + bias_is_ok = True + call = extract + while call.op.name != "nn.conv2d": + if call.op.name in ("nn.bias_add", "add"): + bias_is_ok &= bias_check(call) + call = call.args[0] + return conv2d_check(call) and bias_is_ok + + +def make_dense_bias_pattern(): + """Make pattern for bnns.dense primitive""" + data = wildcard() + weight = wildcard() + bias = wildcard() + d = is_op("nn.dense")(data, weight) + return is_op("add")(d, bias) + + +def make_dense_bias_gelu_pattern(): + """Make pattern for bnns.dense primitive with fused bias and gelu activation""" + dense_bias = make_dense_bias_pattern() + const1 = is_expr(const(0.044715)) + const2 = is_expr(const(math.sqrt(2 / math.pi))) + + gelu = is_op("power")(dense_bias, is_expr(const(3, dtype="float32"))) + gelu = is_op("multiply")(gelu, const1) + gelu = is_op("add")(gelu, dense_bias) + gelu = is_op("multiply")(gelu, const2) + gelu = is_op("tanh")(gelu) + gelu = is_op("add")(gelu, is_expr(const(1, dtype="float32"))) + gelu = is_op("multiply")(gelu, is_expr(const(0.5))) + gelu = is_op("multiply")(gelu, dense_bias) + return gelu + + +def check_dense(extract): + """Check dense pattern is supported by BNNS.""" + call = extract + while call.op.name != "nn.dense": + call = call.args[0] + return dense(call) + + +@tvm.ir.register_op_attr("nn.instance_norm", "target.bnns") +def instance_norm_check(expr): + """Check if the nn.instance_norm can be executed in BNNS""" + attrs, args = expr.attrs, expr.args + data_typ = args[0].checked_type + rank = len(data_typ.shape) + if rank < 3 or rank > 4 or data_typ.dtype != "float32": + return False + if not isinstance(args[1], tvm.relay.expr.Constant) or not isinstance( + args[2], tvm.relay.expr.Constant + ): + return False + if attrs.axis == 0 and rank == 3 or attrs.axis == 1 and rank == 4: + return True + return False + + +@register_pattern_table("bnns") +def pattern_table(): + """Get BNNS specific fusing patterns collection""" + conv2d_bias_pat = ( + "bnns.conv2d_bias", + make_conv_pattern(with_bias=True), + check_conv, + ) + conv2d_bias_relu_pat = ( + "bnns.conv2d_bias_relu", + make_conv_pattern(with_bias=True, activation="relu"), + check_conv, + ) + conv2d_relu_pat = ( + "bnns.conv2d_relu", + make_conv_pattern(with_bias=False, activation="relu"), + check_conv, + ) + conv2d_bias_sigmoid_pat = ( + "bnns.conv2d_bias_sigmoid", + make_conv_pattern(with_bias=True, activation="sigmoid"), + check_conv, + ) + conv2d_sigmoid_pat = ( + 
"bnns.conv2d_sigmoid", + make_conv_pattern(with_bias=False, activation="sigmoid"), + check_conv, + ) + dense_bias_gelu = ("bnns.dense_bias_gelu", make_dense_bias_gelu_pattern(), check_dense) + dense_bias = ("bnns.dense_bias", make_dense_bias_pattern(), check_dense) + bnns_patterns = [ + conv2d_bias_relu_pat, + conv2d_relu_pat, + conv2d_bias_sigmoid_pat, + conv2d_sigmoid_pat, + conv2d_bias_pat, + dense_bias_gelu, + dense_bias, + ] + return bnns_patterns diff --git a/src/relay/backend/contrib/bnns/codegen.cc b/src/relay/backend/contrib/bnns/codegen.cc new file mode 100644 index 0000000000000..72c32fb5b19ee --- /dev/null +++ b/src/relay/backend/contrib/bnns/codegen.cc @@ -0,0 +1,215 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file + * \brief Implementation of BNNS codegen APIs. + */ + +#include +#include +#include +#include + +#include +#include + +#include "../../../../runtime/contrib/json/json_node.h" +#include "../../utils.h" +#include "../codegen_json/codegen_json.h" + +namespace tvm { +namespace relay { +namespace contrib { + +using namespace backend; + +/*! + * \brief Retrieve the expected "root" op nested inside a fused call, such as conv2d in + * relu(add(conv2d)) + * \param call A Relay call node. Typically nn.relu when called the first time. + * \param max_depth The maximum number of calls before the root op, counting from current_call. + * \param root_name The name of expected "root" op in this fused call. 
+ * \return A CallNode corresponding to the root op + */ +inline const CallNode* FindCallWithName(const CallNode* current_call, int max_depth, + const std::string& root_name) { + ICHECK(current_call && max_depth >= 0); + + if (max_depth == 0) { + ICHECK(current_call && IsOp(current_call, root_name)); + return current_call; + } + if (IsOp(current_call, root_name)) { + return current_call; + } + + ICHECK_GT(current_call->args.size(), 0); + + const auto* next_call = current_call->args[0].as(); + return FindCallWithName(next_call, max_depth - 1, root_name); +} + +class BNNSJSONSerializer : public backend::contrib::JSONSerializer { + using JSONGraphNode = tvm::runtime::json::JSONGraphNode; + using JSONGraphNodeEntry = tvm::runtime::json::JSONGraphNodeEntry; + + public: + BNNSJSONSerializer(const std::string& symbol, const Expr& expr) : JSONSerializer(symbol, expr) {} + + std::vector VisitExpr_(const CallNode* cn) override { + Expr expr = GetRef(cn); + std::string name; + const CallNode* call = cn; + if (const auto* op_node = cn->op.as()) { + name = op_node->name; + } else if (const auto* fn = cn->op.as()) { + auto comp = fn->GetAttr(attr::kComposite); + ICHECK(comp.defined()) << "BNNS JSON runtime only supports composite functions."; + name = comp.value(); + + auto body = fn->body.as(); + if (name == "bnns.conv2d_bias_relu") { + auto add_op_type = IsOp(body->args[0].as(), "add") ? "add" : "nn.bias_add"; + call = GetRootCall(body, 2, {"nn.conv2d", add_op_type, "nn.relu"}); + } else if (name == "bnns.conv2d_bias") { + auto add_op_type = IsOp(body, "add") ? "add" : "nn.bias_add"; + call = GetRootCall(body, 1, {"nn.conv2d", add_op_type}); + } else if (name == "bnns.conv2d_relu") { + call = GetRootCall(body, 1, {"nn.conv2d", "nn.relu"}); + ICHECK(call->op.as()) << "Not op node"; + } else if (name == "bnns.conv2d_bias_sigmoid") { + auto add_op_type = IsOp(body->args[0].as(), "add") ? "add" : "nn.bias_add"; + call = GetRootCall(body, 2, {"nn.conv2d", add_op_type, "sigmoid"}); + ICHECK(call->op.as()) << "Not op node"; + } else if (name == "bnns.conv2d_sigmoid") { + call = GetRootCall(body, 1, {"nn.conv2d", "sigmoid"}); + ICHECK(call->op.as()) << "Not op node"; + } else if (name == "bnns.dense_bias") { + call = GetRootCall(fn->body.as(), 1, {"nn.dense", "add"}); + } else if (name == "bnns.dense_bias_gelu") { + call = FindCallWithName(fn->body.as(), 10, "nn.dense"); + } else { + LOG(FATAL) << "Unrecognized BNNS pattern: " << name; + } + } else { + LOG(FATAL) << "BNNS JSON runtime does not support calls to " << cn->op->GetTypeKey(); + } + + std::vector inputs; + for (const auto& arg : cn->args) { + auto res = VisitExpr(arg); + inputs.insert(inputs.end(), res.begin(), res.end()); + } + auto node = std::make_shared(name, /* name_ */ + "kernel", /* op_type_ */ + inputs, 1 /* num_outputs_ */); + SetCallNodeAttribute(node, call); + return AddNode(node, GetRef(cn)); + } +}; + +/*! + * \brief The external compiler/codegen tool. It takes a Relay expression/module and + * compile it into a runtime module. 
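+ * The expression is serialized to a JSON graph by BNNSJSONSerializer above and
+ * wrapped into a runtime module via the "runtime.BNNSJSONRuntimeCreate" packed
+ * function.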
+ */
+runtime::Module BNNSCompiler(const ObjectRef& ref) {
+  ICHECK(ref->IsInstance<FunctionNode>());
+  auto func = Downcast<Function>(ref);
+  auto func_name = GetExtSymbol(func);
+  BNNSJSONSerializer serializer(func_name, func);
+  serializer.serialize();
+  std::string graph_json = serializer.GetJSON();
+  auto params = serializer.GetParams();
+
+  const auto* pf = runtime::Registry::Get("runtime.BNNSJSONRuntimeCreate");
+  ICHECK(pf != nullptr) << "Cannot find JSON runtime module to create";
+  auto mod = (*pf)(func_name, graph_json, params);
+  return mod;
+}
+
+TVM_REGISTER_GLOBAL("relay.ext.bnns").set_body_typed(BNNSCompiler);
+
+/**
+ * \brief A helper to expand the params by adding the ones used by the BNNS runtime
+ * for a given expression. Same as the default ConstantUpdater, but skips constants
+ * from essential BNNS composite function ops.
+ */
+struct BNNSConstantUpdater : public ConstantUpdater {
+ public:
+  BNNSConstantUpdater(const std::string& symbol,
+                      std::unordered_map<std::string, runtime::NDArray>* params,
+                      const std::vector<std::string>& skip_mask)
+      : ConstantUpdater(symbol, params), skip_mask_(skip_mask) {}
+  using ConstantUpdater::VisitExpr_;
+
+  /**
+   * Like the original implementation, but avoids visiting body nodes
+   * of BNNS-specific composite primitives.
+   */
+  void VisitExpr_(const FunctionNode* op) final {
+    this->VisitSpan(op->span);
+    for (auto param : op->params) {
+      this->VisitExpr(param);
+    }
+
+    if (!isBNNSSpecificCompositeFunc(op)) {
+      this->VisitExpr(op->body);
+    }
+  }
+
+ private:
+  bool isBNNSSpecificCompositeFunc(const FunctionNode* op) {
+    auto comp = op->GetAttr<String>(attr::kComposite);
+    if (!comp) return false;
+
+    auto comp_name = comp.value();
+
+    bool is_match = false;
+    for (const auto& mask : skip_mask_) {
+      if (std::string(comp_name).substr(0, mask.size()) == mask) {
+        is_match = true;
+        break;
+      }
+    }
+    return is_match;
+  }
+
+  std::vector<std::string> skip_mask_;
+};
+
+Map<String, runtime::NDArray> BNNSConstantUpdaterFunc(Expr expr, std::string symbol) {
+  std::vector<std::string> bnns_composite_filter = {"bnns."};
+
+  // Visit all suitable constant nodes
+  std::unordered_map<std::string, runtime::NDArray> res;
+  BNNSConstantUpdater const_updater(symbol, &res, bnns_composite_filter);
+  const_updater(expr);
+
+  // Convert to tvm::Map
+  Map<String, runtime::NDArray> ret;
+  for (const auto& kvp : res) ret.Set(kvp.first, kvp.second);
+  return ret;
+}
+
+TVM_REGISTER_GLOBAL("relay.ext.bnns.constant_updater").set_body_typed(BNNSConstantUpdaterFunc);
+
+}  // namespace contrib
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/runtime/contrib/bnns/bnns_json_runtime.cc b/src/runtime/contrib/bnns/bnns_json_runtime.cc
new file mode 100644
index 0000000000000..87b01567cd301
--- /dev/null
+++ b/src/runtime/contrib/bnns/bnns_json_runtime.cc
@@ -0,0 +1,573 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/**
+ * \file
+ * \brief Simple JSON runtime for Apple BNNS primitives
+ */
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include "../json/json_node.h"
+#include "../json/json_runtime.h"
+#include "bnns_wrp.h"
+
+namespace tvm {
+namespace runtime {
+namespace contrib {
+
+using namespace ::tvm::runtime;
+using namespace ::tvm::runtime::json;
+using namespace ::tvm::runtime::contrib::BNNS;
+
+struct ThreadingConfig {
+  /**
+   * Internal parallelism level of a BNNS primitive, specified via the
+   * BNNSFilterParameters struct. BNNS doesn't provide real control of internal
+   * threading, so it may be ignored by the BNNS implementation.
+   *
+   * Valid values:
+   *   0   use the default number of threads suggested by the BNNS implementation
+   *   >0  suggest to use this number of internal BNNS threads
+   */
+  size_t internalConcurrency = 0;
+
+  /**
+   * TVM-level parallelism for the BNNS runtime.
+   * The BNNS runtime will split a primitive into a set of independent sub primitives
+   * which can be executed in parallel. As a rule the splitting is performed through
+   * output channels, so the effective shape of the executed primitive is changed.
+   *
+   * Valid values:
+   *   0   do not use graph-level threading
+   *   >0  split into this number of primitives
+   */
+  size_t externalConcurrency = 0;
+};
+
+/**
+ * Depending on the platform hardware, the optimal ThreadingConfig may differ.
+ * This function contains a priori knowledge about some Apple platforms
+ * and their specifics.
+ *
+ * @return default ThreadingConfig suggested for this platform
+ */
+ThreadingConfig getDefaultThreadingConfig() {
+  // TODO(apeskov): have to implement CPU/iOS version check.
+  //   meanwhile will use {0, 2} stub to utilize big cores of A13/A14 CPU.
+  return {0, 2};
+}
+
+/**
+ * Main entry point to BNNS runtime
+ */
+class BNNSJSONRuntime : public JSONRuntimeBase {
+ public:
+  BNNSJSONRuntime(const std::string& symbol_name, const std::string& graph_json,
+                  const Array<String> const_names)
+      : JSONRuntimeBase(symbol_name, graph_json, const_names) {}
+
+  const char* type_key() const override { return "bnns_json"; }
+
+  void Init(const Array<NDArray>& consts) override {
+    ICHECK_EQ(consts.size(), const_idx_.size())
+        << "The number of input constants must match the number required.";
+
+    SetupConstants(consts);
+    BindInputsAndOutputs();
+    AllocateIntermediateTensors();
+    BuildEngine();
+  }
+
+  void Run() override {
+    // Wrap the external handler into the BNNS tensor representation
+    auto bind_ext_hdl_to_tensor = [this](uint32_t eid) {
+      const auto& ext_dlt = *data_entry_[eid];
+      auto& bnns_tensor = tensors_eid_[eid];
+      bnns_tensor->set_data_hdl(ext_dlt.data);
+    };
+
+    // Bind all input/output external data objects into internal abstractions
+    for (const auto& eid : input_var_eid_) bind_ext_hdl_to_tensor(eid);
+    for (const auto& out_entity : outputs_) bind_ext_hdl_to_tensor(EntryID(out_entity));
+
+    // Invoke primitives in topological order
+    for (const auto& prim : primitives_) prim->execute();
+  }
+
+ private:
+  /** Make corresponding input/output tensor stubs */
+  void BindInputsAndOutputs() {
+    tensors_eid_.resize(data_entry_.size());
+    auto createTensor = [&](JSONGraphNodeEntry entry) {
+      auto node = nodes_[entry.id_];
+      auto dlshape = node.GetOpShape()[entry.index_];
+      auto dltype = node.GetOpDataType()[entry.index_];
+      void* data = nullptr;
+      if (data_entry_[entry.id_] != nullptr) data = data_entry_[entry.id_]->data;
+      tensors_eid_[entry.id_] = std::make_shared<BNNS::Tensor>(
+          BNNS::Shape{dlshape.begin(), dlshape.end()}, convertToBNNS(dltype), data);
+    };
+
+    for (auto& id :
input_nodes_) { + auto eid = JSONGraphNodeEntry(id, 0); + createTensor(eid); + } + + for (auto entry : outputs_) { + createTensor(entry); + } + } + + /** Allocate intermediate tensors */ + void AllocateIntermediateTensors() { + for (int i = 0; i < nodes_.size(); ++i) { + auto eid = JSONGraphNodeEntry(i, 0); + if (tensors_eid_[eid.id_] != nullptr) continue; + auto node = nodes_[i]; + auto dlshape = node.GetOpShape()[0]; + auto dltype = node.GetOpDataType()[0]; + tensors_eid_[eid.id_] = std::make_shared( + BNNS::Shape{dlshape.begin(), dlshape.end()}, convertToBNNS(dltype), nullptr); + tensors_eid_[eid.id_]->allocate_memory(); + } + } + + // Build up the engine based on the input graph. + void BuildEngine() { + // Build subgraph engine. + for (size_t nid = 0; nid < nodes_.size(); ++nid) { + const auto& node = nodes_[nid]; + if (node.GetOpType() == "kernel") { + ICHECK_EQ(node.GetOpType(), "kernel"); + auto op_name = node.GetOpName(); + if ("nn.conv2d" == op_name) { + Conv2d(nid); + } else if ("bnns.conv2d_relu" == op_name) { + Conv2d(nid, false, "relu"); + } else if ("bnns.conv2d_bias_relu" == op_name) { + Conv2d(nid, true, "relu"); + } else if ("bnns.conv2d_sigmoid" == op_name) { + Conv2d(nid, false, "sigmoid"); + } else if ("bnns.conv2d_bias_sigmoid" == op_name) { + Conv2d(nid, true, "sigmoid"); + } else if ("bnns.conv2d_bias" == op_name) { + Conv2d(nid, true); + } else if ("nn.dense" == op_name) { + Dense(nid); + } else if ("bnns.dense_bias" == op_name) { + Dense(nid, true); + } else if ("bnns.dense_bias_gelu" == op_name) { + Dense(nid, true, true); + } else if ("nn.batch_matmul" == op_name) { + MatMul(nid); + } else if ("nn.instance_norm" == op_name) { + InstanceNormalization(nid); + } else if ("nn.max_pool2d" == op_name) { + Pooling(nid, false); + } else if ("nn.avg_pool2d" == op_name) { + Pooling(nid, true); + } else if ("nn.global_max_pool2d" == op_name) { + Pooling(nid, false, true); + } else if ("nn.global_avg_pool2d" == op_name) { + Pooling(nid, true, true); + } else { + LOG(FATAL) << "Unsupported op: " << op_name; + } + } + } + } + + // Get BNNS tensor. + std::shared_ptr GetBNNSTensor(const JSONGraphNodeEntry& entry) { + auto eid = EntryID(entry); + ICHECK(eid < tensors_eid_.size()); + return tensors_eid_[eid]; + } + + void Conv2d(const size_t& nid, const bool has_bias = false, + const std::string activation_type = "none") { + auto node = nodes_[nid]; + + // Setup attributes. 
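+    // Shapes come from the JSON graph node, while strides/dilation/padding/groups
+    // arrive as string attributes and are converted to integers below.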
+ auto src_entry = node.GetInputs()[0]; + auto wgh_entry = node.GetInputs()[1]; + auto dst_entry = JSONGraphNodeEntry(nid, 0); + + auto dl_input_shape = nodes_[src_entry.id_].GetOpShape()[src_entry.index_]; + auto dl_weight_shape = nodes_[wgh_entry.id_].GetOpShape()[wgh_entry.index_]; + BNNS::Shape input_shape{dl_input_shape.begin(), dl_input_shape.end()}; + BNNS::Shape weight_shape{dl_weight_shape.begin(), dl_weight_shape.end()}; + std::vector str_strides = node.GetAttr>("strides"); + std::vector str_dilation = node.GetAttr>("dilation"); + std::vector str_padding = node.GetAttr>("padding"); + BNNS::Dim groups = std::stoi(node.GetAttr>("groups")[0]); + + BNNS::Dim PH_L = std::stoi(str_padding[0]), // height padding: left + PH_R = std::stoi(str_padding[2]), // height padding: right + PW_L = std::stoi(str_padding[1]), // width padding: left + PW_R = std::stoi(str_padding[3]), // width padding: right + SH = std::stoi(str_strides[0]), // height-wise stride + SW = std::stoi(str_strides[1]), // weight-wise stride + DH = std::stoi(str_dilation[0]), // height kernel dilation + DW = std::stoi(str_dilation[1]); // width kernel dilation + + // Memory descriptions. + const auto& src_t = GetBNNSTensor(src_entry); + const auto& wgh_t = GetBNNSTensor(wgh_entry); + const auto& dst_t = GetBNNSTensor(dst_entry); + + auto src_view = TView::as_is(src_t).extract_outer_dim().with_layout(BNNSDataLayoutImageCHW); + auto wgh_view = TView::as_is(wgh_t).with_layout(BNNSDataLayoutConvolutionWeightsOIHW); + auto dst_view = TView::as_is(dst_t).extract_outer_dim().with_layout(BNNSDataLayoutImageCHW); + TView bias_view; + + if (has_bias) { + auto bias_entry = node.GetInputs()[2]; + + auto bias_t = GetBNNSTensor(bias_entry); + bias_view = TView::as_is(bias_t).squeeze().with_layout(BNNSDataLayoutVector); + } + + BNNSActivation activation = {BNNSActivationFunctionIdentity}; + if (activation_type == "relu") + activation = {BNNSActivationFunctionRectifiedLinear}; + else if (activation_type == "sigmoid") + activation = {BNNSActivationFunctionSigmoid}; + + BNNSLayerParametersConvolution conv_param = { + src_view.get_bnns_view(), + wgh_view.get_bnns_view(), + dst_view.get_bnns_view(), + bias_view.get_bnns_view(), + activation, + SW, /* x_stride */ + SH, /* y_stride */ + DW, /* x_dilation_stride */ + DH, /* y_dilation_stride */ + 0, /* x_padding, explicit pads will be used */ + 0, /* y_padding, explicit pads will be used */ + groups, /* groups */ + {PW_L, PW_R, PH_L, PH_R} /* explicit pad values */ + }; + + size_t num_sub_prim = default_thread_config.externalConcurrency; + std::vector params; + std::tie(params, src_view, dst_view) = + split_to_n(num_sub_prim, conv_param, src_view, wgh_view, bias_view, dst_view); + + std::vector filters(params.size(), nullptr); + for (int i = 0; i < params.size(); i++) { + auto common_filter_param = getCommonFilterParams(); + filters[i] = BNNSFilterCreateLayerConvolution(¶ms[i], &common_filter_param); + ICHECK(filters[i]) << "BNNS primitive was not created. Unsupported attributes configuration"; + } + + primitives_.emplace_back(std::make_shared(filters, src_view, dst_view)); + } + + void Dense(const size_t& nid, const bool has_bias = false, const bool has_gelu = false) { + auto node = nodes_[nid]; + + // Setup attributes. + auto src_entry = node.GetInputs()[0]; + auto weight_entry = node.GetInputs()[1]; + auto dst_entry = JSONGraphNodeEntry(nid, 0); + + // Memory descriptions. 
+ auto src_t = GetBNNSTensor(src_entry); + auto wgh_t = GetBNNSTensor(weight_entry); + auto dst_t = GetBNNSTensor(dst_entry); + + auto src_view = TView::as_is(src_t).extract_outer_dim().with_layout(BNNSDataLayoutVector); + auto wgh_view = TView::as_is(wgh_t).with_layout(BNNSDataLayoutRowMajorMatrix); + auto dst_view = TView::as_is(dst_t).extract_outer_dim().with_layout(BNNSDataLayoutVector); + + TView bias_view; + if (has_bias) { + auto bias_entry = node.GetInputs()[2]; + auto bias_md = GetBNNSTensor(bias_entry); + bias_view = TView::as_is(bias_md).with_layout(BNNSDataLayoutVector); + } + + BNNSActivation activation = {BNNSActivationFunctionIdentity}; + if (has_gelu) { + activation = {BNNSActivationFunctionGELUApproximation}; + activation.alpha = std::sqrt(2.0 / M_PI); + activation.beta = 0.044715; + } + + BNNSLayerParametersFullyConnected layerParameters = { + src_view.get_bnns_view(), + wgh_view.get_bnns_view(), + dst_view.get_bnns_view(), + bias_view.get_bnns_view(), + activation, + }; + + auto common_filter_param = getCommonFilterParams(); + auto filter = BNNSFilterCreateLayerFullyConnected(&layerParameters, &common_filter_param); + ICHECK(filter) << "BNNS primitive was not created. Unsupported attributes configuration"; + std::vector filters = {filter}; + primitives_.emplace_back(std::make_shared(filters, src_view, dst_view)); + } + + void MatMul(const size_t& nid) { + auto node = nodes_[nid]; + + // Setup attributes. + auto a_entry = node.GetInputs()[0]; + auto b_entry = node.GetInputs()[1]; + auto dst_entry = JSONGraphNodeEntry(nid, 0); + bool a_is_weighted = data_entry_[EntryID(a_entry)] != nullptr; + bool b_is_weighted = data_entry_[EntryID(b_entry)] != nullptr; + + // Memory descriptions. + auto a_t = GetBNNSTensor(a_entry); + auto b_t = GetBNNSTensor(b_entry); + auto dst_t = GetBNNSTensor(dst_entry); + + auto a_view = TView::as_is(a_t); + auto b_view = TView::as_is(b_t); + auto dst_view = TView::as_is(dst_t); + + BNNSLayerParametersBroadcastMatMul layerParameters = {1, // alpha + 0, // beta + false, // transA + true, // transB + false, // quadratic + a_is_weighted, + b_is_weighted, + a_view.get_bnns_view(), + b_view.get_bnns_view(), + dst_view.get_bnns_view()}; + + // BNNS limitation: MatMul use reverse dims values. However strides are calculated correctly + // based on BNNSNDArrayDescriptor::layout value. + std::reverse(layerParameters.iA_desc.size, layerParameters.iA_desc.size + 3); + std::reverse(layerParameters.iB_desc.size, layerParameters.iB_desc.size + 3); + std::reverse(layerParameters.o_desc.size, layerParameters.o_desc.size + 3); + + auto common_filter_param = getCommonFilterParams(); + auto filter = BNNSFilterCreateLayerBroadcastMatMul(&layerParameters, &common_filter_param); + ICHECK(filter) << "BNNS primitive was not created. Unsupported attributes configuration"; + + std::vector filters{filter}; + if (a_is_weighted || b_is_weighted) { + auto src_view = a_is_weighted ? b_view : a_view; + primitives_.emplace_back(std::make_shared(filters, src_view, dst_view)); + } else { + primitives_.emplace_back( + std::make_shared(filters, a_view, b_view, dst_view)); + } + } + + void InstanceNormalization(const size_t& nid) { + auto node = nodes_[nid]; + size_t axis = std::stoi(node.GetAttr>("axis")[0]); + float epsilon = std::stof(node.GetAttr>("epsilon")[0]); + bool center = std::stoi(node.GetAttr>("center")[0]); + bool scale = std::stoi(node.GetAttr>("scale")[0]); + + // Setup attributes. 
+ auto src_entry = node.GetInputs()[0]; + auto scale_entry = node.GetInputs()[1]; + auto bias_entry = node.GetInputs()[2]; + auto dst_entry = JSONGraphNodeEntry(nid, 0); + + // Memory descriptions. + auto src_t = GetBNNSTensor(src_entry); + auto scale_t = GetBNNSTensor(scale_entry); + auto bias_t = GetBNNSTensor(bias_entry); + auto dst_t = GetBNNSTensor(dst_entry); + + auto src_view = TView::as_is(src_t); + auto dst_view = TView::as_is(dst_t); + size_t src_rank = Tensor::getRank(src_view.get_bnns_view()); + size_t dst_rank = Tensor::getRank(dst_view.get_bnns_view()); + ICHECK_EQ(src_rank, dst_rank); + ICHECK_LE(src_rank, 4); + if (src_rank < 4) { + src_view = src_view.unsqueeze(4); + dst_view = dst_view.unsqueeze(4); + } + src_view = src_view.extract_outer_dim().with_layout(BNNSDataLayoutImageCHW); + dst_view = dst_view.extract_outer_dim().with_layout(BNNSDataLayoutImageCHW); + auto scale_view = TView::as_is(scale_t).with_layout(BNNSDataLayoutVector); + auto bias_view = TView::as_is(bias_t).with_layout(BNNSDataLayoutVector); + BNNSActivation activation = {BNNSActivationFunctionIdentity}; + + auto b_desc = bias_view.get_bnns_view(); + if (!center) b_desc = {}; + auto s_desc = scale_view.get_bnns_view(); + if (!scale) s_desc = {}; + + // NOTE: Axis option is ignored in BNNS. The result doesn't depends on value of axis. + BNNSLayerParametersNormalization layerParameters = {src_view.get_bnns_view(), // i_desc + dst_view.get_bnns_view(), // o_desc + b_desc, // beta_desc + s_desc, // gamma_desc + {}, // moving_mean_desc + {}, // moving_variance_desc + 1.f, // momentum + epsilon, // epsilon + activation, // activation + 1, // num_groups + axis}; // normalization_axis + + BNNSFilterType filter_type = BNNSInstanceNorm; + auto common_filter_param = getCommonFilterParams(); + auto filter = + BNNSFilterCreateLayerNormalization(filter_type, &layerParameters, &common_filter_param); + ICHECK(filter) << "BNNS primitive was not created. Unsupported attributes configuration"; + + std::vector filters{filter}; + primitives_.emplace_back(std::make_shared(filters, src_view, dst_view)); + } + + void Pooling(const size_t& nid, bool avg_pooling, bool global = false) { + auto node = nodes_[nid]; + + auto src_entry = node.GetInputs()[0]; + auto dst_entry = JSONGraphNodeEntry(nid, 0); + + // Memory descriptions. + auto src_t = GetBNNSTensor(src_entry); + auto dst_t = GetBNNSTensor(dst_entry); + + auto src_view = TView::as_is(src_t); + auto dst_view = TView::as_is(dst_t); + size_t src_rank = Tensor::getRank(src_view.get_bnns_view()); + size_t dst_rank = Tensor::getRank(dst_view.get_bnns_view()); + ICHECK_EQ(src_rank, dst_rank); + ICHECK_LE(src_rank, 4); + if (src_rank < 4) { + src_view = src_view.unsqueeze(4); + dst_view = dst_view.unsqueeze(4); + } + src_view = src_view.extract_outer_dim().with_layout(BNNSDataLayoutImageCHW); + dst_view = dst_view.extract_outer_dim().with_layout(BNNSDataLayoutImageCHW); + BNNSActivation activation = {BNNSActivationFunctionIdentity}; + BNNSPoolingFunction pf = {BNNSPoolingFunctionMax}; + if (avg_pooling) pf = {BNNSPoolingFunctionAverageCountExcludePadding}; + + // Setup attributes. 
+ size_t k_height = 0; + size_t k_width = 0; + size_t y_padding = 0; + size_t x_padding = 0; + size_t y_stride = 1; + size_t x_stride = 1; + if (!global) { + std::vector pool_size = node.GetAttr>("pool_size"); + std::vector padding = node.GetAttr>("padding"); + std::vector strides = node.GetAttr>("strides"); + k_height = std::stoi(pool_size[0]); + k_width = std::stoi(pool_size[1]); + y_padding = std::stoi(padding[0]); + x_padding = std::stoi(padding[1]); + y_stride = std::stoi(strides[0]); + x_stride = std::stoi(strides[1]); + } else { + auto sv = src_view.get_bnns_view(); + k_height = sv.size[1]; + k_width = sv.size[0]; + } + + BNNSLayerParametersPooling layerParameters = {src_view.get_bnns_view(), // i_desc + dst_view.get_bnns_view(), // o_desc + {}, // bias + activation, // activation + pf, // pooling_function + k_width, // k_width + k_height, // k_height + x_stride, // x_stride + y_stride, // y_stride + 0, // x_dilation_stride + 0, // y_dilation_stride + x_padding, // x_padding + y_padding, // y_padding + {}}; // pad left, right, up, down padding + + auto common_filter_param = getCommonFilterParams(); + auto filter = BNNSFilterCreateLayerPooling(&layerParameters, &common_filter_param); + ICHECK(filter) << "BNNS primitive was not created. Unsupported attributes configuration"; + + std::vector filters{filter}; + primitives_.emplace_back(std::make_shared(filters, src_view, dst_view)); + } + + BNNS::Dtype convertToBNNS(const DLDataType& dl_dtype) { + if (dl_dtype.code == DLDataTypeCode::kDLFloat) { + if (dl_dtype.bits == 32) return BNNSDataTypeFloat32; + if (dl_dtype.bits == 16) return BNNSDataTypeFloat16; + } + if (dl_dtype.code == DLDataTypeCode::kDLInt) { + if (dl_dtype.bits == 32) return BNNSDataTypeInt32; + if (dl_dtype.bits == 16) return BNNSDataTypeInt16; + if (dl_dtype.bits == 8) return BNNSDataTypeInt8; + } + if (dl_dtype.code == DLDataTypeCode::kDLUInt) { + if (dl_dtype.bits == 32) return BNNSDataTypeUInt32; + if (dl_dtype.bits == 16) return BNNSDataTypeUInt16; + if (dl_dtype.bits == 8) return BNNSDataTypeUInt8; + } + LOG(FATAL) << "Unsupported data type for BNNS runtime"; + return BNNS::Dtype(0); + } + + BNNSFilterParameters getCommonFilterParams() { + // NOTE: To force weights tensor copy on stage of filter create + // just change : BNNSFlagsUseClientPtr -> 0 + return {BNNSFlagsUseClientPtr, default_thread_config.internalConcurrency}; + } + + /** Default threading config. Should be used if there are + * no other threading specificator. */ + const ThreadingConfig default_thread_config = getDefaultThreadingConfig(); + + /** Collection of all primitives in topological order */ + std::vector> primitives_; + + /** Vector with BNNS tensors. Index of tensor matched with + * corresponding EntryID from base JSONRuntimeBase. 
*/ + std::vector tensors_eid_; +}; + +runtime::Module BNNSJSONRuntimeCreate(String symbol_name, String graph_json, + const Array& const_names) { + auto n = make_object(symbol_name, graph_json, const_names); + return runtime::Module(n); +} + +TVM_REGISTER_GLOBAL("runtime.BNNSJSONRuntimeCreate").set_body_typed(BNNSJSONRuntimeCreate); + +TVM_REGISTER_GLOBAL("runtime.module.loadbinary_bnns_json") + .set_body_typed(BNNSJSONRuntime::LoadFromBinary); + +} // namespace contrib +} // namespace runtime +} // namespace tvm diff --git a/src/runtime/contrib/bnns/bnns_wrp.h b/src/runtime/contrib/bnns/bnns_wrp.h new file mode 100644 index 0000000000000..b31e97e554dab --- /dev/null +++ b/src/runtime/contrib/bnns/bnns_wrp.h @@ -0,0 +1,495 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/** + * \file + * \brief C++ wrappers and helpers to handle BNNS objects + */ + +#ifndef TVM_RUNTIME_CONTRIB_BNNS_BNNS_WRP_H_ +#define TVM_RUNTIME_CONTRIB_BNNS_BNNS_WRP_H_ + +#include + +#include +#include +#include +#include +#include +#include + +namespace tvm { +namespace runtime { +namespace contrib { +namespace BNNS { + +using Dim = size_t; +using Shape = std::vector; +using Dtype = BNNSDataType; +using HDL = void*; + +void* default_alloc(size_t size) { return malloc(size); } + +void default_free(void* ptr) { free(ptr); } + +/** + * Main abstraction for tensor representation + * + * Contains buffer handler and common attributes like shape and dtype. 
+ */ +class Tensor { + public: + Tensor() = delete; + Tensor(Tensor&) = delete; + + Tensor(Shape shape, Dtype dtype, void* hdl) { + auto rank = shape.size(); + ICHECK(rank < BNNS_MAX_TENSOR_DIMENSION); + + desc_ = {BNNSNDArrayFlags(0), + getPlainLayout(rank), + {}, // shape + {}, // strides + hdl, // data handler + dtype, // data type + nullptr, // table_data (clustering case), is not used + dtype, + 1.f, + 0.f}; + std::copy(shape.rbegin(), shape.rend(), std::begin(desc_.size)); + + desc_.data = hdl; + is_external_data = true; + } + + ~Tensor() { + if (desc_.data && !is_external_data) { + default_free(desc_.data); + desc_.data = nullptr; + } + } + + void allocate_memory() { + if (desc_.data && !is_external_data) { + default_free(desc_.data); + } + const size_t buff_size = getSize(desc_) * getElementSize(desc_); + desc_.data = default_alloc(buff_size); + ICHECK(desc_.data); + is_external_data = false; + } + + void* get_data_hdl() const { return desc_.data; } + + void set_data_hdl(void* hdl) { + if (desc_.data && !is_external_data) { + default_free(desc_.data); + desc_.data = nullptr; + } + + desc_.data = hdl; + is_external_data = true; + } + + const BNNSNDArrayDescriptor& get_desc() const { return desc_; } + + static BNNSDataLayout getPlainLayout(size_t rank) { + ICHECK(rank <= BNNS_MAX_TENSOR_DIMENSION); + return static_cast((rank << 16) | 0x8001); + } + + static size_t getRank(BNNSDataLayout layout) { return (layout & 0xF0000) >> 16; } + + static size_t getRank(BNNSNDArrayDescriptor desc) { return getRank(desc.layout); } + + static size_t getSize(BNNSNDArrayDescriptor desc) { + auto rank = getRank(desc); + return std::accumulate(desc.size, desc.size + rank, 1, std::multiplies()); + } + + /** return size of element in bytes */ + static size_t getElementSize(Dtype dtype) { return (dtype & 0xFFFF) / 8; } + + /** return size of element in bytes */ + static size_t getElementSize(const BNNSNDArrayDescriptor& desc) { + return getElementSize(desc.data_type); + } + + private: + bool is_external_data = false; + BNNSNDArrayDescriptor desc_; +}; + +using TensorPtr = std::shared_ptr; + +/** + * Tensor View object which represent how provided BNNS::Tensor will be considered + * + * The single BNNS::Tensor can be treated in different form depend on particular primitive + * expectation. More other some primitive supports only external form of batching. So we have + * some abstraction to describe how primitive will handle provided tensor. + * + * Batched View + * View with extracted dimension as external batch value + * example: Tensor [2, 3, 224, 224] -> View [3, 224, 224] with ext batch 2 + * + * Party View + * The collection of view on the same tensor, can be the same view or with some stride + * example: Tensor [6, 5, 3, 3] -> 3 x View [2, 5, 3, 3] with stride 45 + */ +class TView { + public: + /** Make view on provided tensor as is */ + static TView as_is(const TensorPtr& origin) { + TView res; + res.origin_ = origin; + res.view_desc_ = origin->get_desc(); + return res; + } + + /** Extract outer dimension to separate batch field. 
TView will become a batched view */
+  TView extract_outer_dim() const {
+    auto rank = Tensor::getRank(view_desc_);
+    TView res = *this;
+    res.batch_size_ = view_desc_.size[rank - 1];
+    res.batch_stride_ =
+        std::accumulate(view_desc_.size, view_desc_.size + rank - 1, 1, std::multiplies<>());
+    res.view_desc_.size[rank - 1] = 0;
+    res.view_desc_.layout = Tensor::getPlainLayout(rank - 1);
+    return res;
+  }
+
+  /** Squeeze all dims equal to 1 */
+  TView squeeze(size_t min_rank = 1) const {
+    auto rank = Tensor::getRank(view_desc_);
+    size_t squeezed_shape[BNNS_MAX_TENSOR_DIMENSION] = {};
+    size_t squeezed_rank = 0;
+    for (int i = 0; i < rank; i++)
+      if (view_desc_.size[i] != 1) squeezed_shape[squeezed_rank++] = view_desc_.size[i];
+
+    if (min_rank > squeezed_rank) {
+      std::fill(squeezed_shape + squeezed_rank, squeezed_shape + min_rank, 1);
+      squeezed_rank = min_rank;
+    }
+
+    TView res = *this;
+    std::copy(squeezed_shape, squeezed_shape + squeezed_rank, res.view_desc_.size);
+    std::fill(res.view_desc_.size + squeezed_rank, res.view_desc_.size + rank, 0);
+    res.view_desc_.layout = Tensor::getPlainLayout(squeezed_rank);
+    return res;
+  }
+
+  /** Expand the shape of an array */
+  TView expand_dims(std::vector<size_t> axes) const {
+    auto rank = Tensor::getRank(view_desc_);
+    TView res = *this;
+    size_t unsqueezed_shape[BNNS_MAX_TENSOR_DIMENSION] = {};
+    size_t unsqueezed_rank = axes.size() + rank;
+    ICHECK_LE(unsqueezed_rank, BNNS_MAX_TENSOR_DIMENSION);
+    for (const auto& axis : axes) {
+      ICHECK_LT(axis, unsqueezed_rank);
+      unsqueezed_shape[axis] = 1;
+    }
+    for (int i = 0, orig_idx = 0; i < unsqueezed_rank; ++i) {
+      if (unsqueezed_shape[i] == 1) continue;
+      unsqueezed_shape[i] = view_desc_.size[orig_idx++];
+    }
+    std::copy(unsqueezed_shape, unsqueezed_shape + unsqueezed_rank, res.view_desc_.size);
+    res.view_desc_.layout = Tensor::getPlainLayout(unsqueezed_rank);
+    return res;
+  }
+
+  /** Unsqueeze tensor to a new rank */
+  TView unsqueeze(size_t new_rank) const {
+    ICHECK_LE(new_rank, BNNS_MAX_TENSOR_DIMENSION);
+    auto rank = Tensor::getRank(view_desc_);
+    ICHECK_GT(new_rank, rank);
+    std::vector<size_t> axes(new_rank - rank);
+    std::iota(axes.begin(), axes.end(), rank);
+    return expand_dims(axes);
+  }
+
+  /** Construct a new TView with the specified layout, if applicable */
+  TView with_layout(BNNSDataLayout layout) const {
+    ICHECK_EQ(Tensor::getRank(view_desc_), Tensor::getRank(layout));
+
+    TView res = *this;
+    res.view_desc_.layout = layout;
+    return res;
+  }
+
+  /** Construct a party TView by splitting the original TView into num parts */
+  TView party_split_n(size_t num) const {
+    ICHECK_EQ(party_size_, 1);
+
+    TView res = *this;
+    size_t rank = Tensor::getRank(view_desc_);
+    size_t size = Tensor::getSize(view_desc_);
+    res.party_size_ = num;
+    res.party_stride_ = size / num;
+
+    if (res.batch_size_ != 1) {
+      res.batch_size_ /= num;
+    } else {
+      res.view_desc_.size[rank - 1] /= num;
+      res.batch_stride_ /= num;
+    }
+    return res;
+  }
+
+  /** Construct a party TView by duplicating the original TView num times */
+  TView party_duplicate_n(size_t num) const {
+    ICHECK_EQ(party_size_, 1);
+
+    TView res = *this;
+    res.party_size_ = num;
+    res.party_stride_ = 0;
+
+    return res;
+  }
+
+  /** Return data buffer handler */
+  HDL get_data_hdl() const { return view_desc_.data; }
+
+  /** Return external batch dimension value */
+  size_t get_batch_size() const { return batch_size_; }
+
+  /** Return external batch dimension stride */
+  size_t get_stride() const { return batch_stride_; }
+
+  /** Return party element by index */
+  TView
operator[](size_t i) const {
+    ICHECK_LT(i, party_size_);
+
+    TView res = *this;
+    res.party_size_ = 1;
+    if (origin_) {
+      auto hdl = reinterpret_cast<uint8_t*>(origin_->get_data_hdl());
+      hdl += i * party_stride_ * Tensor::getElementSize(view_desc_.data_type);
+      res.view_desc_.data = hdl;
+    }
+    return res;
+  }
+
+  /** Check if the view is empty and doesn't refer to any tensor */
+  operator bool() const { return origin_ != nullptr; }
+
+  /** Get the BNNS descriptor for this particular view. Batch and Party attributes are ignored. */
+  const BNNSNDArrayDescriptor& get_bnns_view() const { return view_desc_; }
+
+ private:
+  /** Original tensor object to view on */
+  TensorPtr origin_;
+
+  /** Batched view parameters */
+  BNNSNDArrayDescriptor view_desc_ = {};
+  size_t batch_size_ = 1;
+  size_t batch_stride_ = 0;
+
+  /** Party representation parameters */
+  size_t party_size_ = 1;
+  size_t party_stride_ = 0;
+};
+
+/**
+ * Wrapper on top of BNNSFilter and src/dst TensorView.
+ *
+ * Supports a decomposed representation of the filter and can execute sub primitives in parallel.
+ */
+class Primitive {
+ public:
+  Primitive(const std::vector<BNNSFilter> fs, const TView& src, const TView& dst)
+      : filters(fs), src_view(src), dst_view(dst) {}
+
+  virtual ~Primitive() {
+    for (auto& filter : filters)
+      if (filter) {
+        BNNSFilterDestroy(filter);
+        filter = nullptr;
+      }
+  }
+
+  /** Execute the primitive using the specified src/dst */
+  void execute() {
+    auto res = TVMBackendParallelLaunch(run_task, this, filters.size());
+    ICHECK_EQ(res, 0) << "BNNS runtime. Primitive was not executed properly";
+  }
+
+ private:
+  virtual int execute_impl(int part_idx) {
+    const auto filter = this->filters[part_idx];
+    const auto src_view = this->src_view[part_idx];
+    const auto dst_view = this->dst_view[part_idx];
+
+    size_t mb = src_view.get_batch_size();
+
+    // NB! BNNS limitations
+    //   * Do not use simple BNNSFilterApply. There is a bug inside BNNS,
+    //     BNNSFilterApply doesn't work for grouped convolution.
+    //   * Group convolution doesn't support arbitrary stride for Batch dim.
+    //     The tensor should be dense.
+    return BNNSFilterApplyBatch(filter, mb, src_view.get_data_hdl(), src_view.get_stride(),
+                                dst_view.get_data_hdl(), dst_view.get_stride());
+  }
+
+  static int run_task(int task_id, TVMParallelGroupEnv* penv, void* cdata) {
+    auto prim = reinterpret_cast<Primitive*>(cdata);
+    return prim->execute_impl(task_id);
+  }
+
+ protected:
+  /** Collection of BNNS kernels/filters which will execute the primitive */
+  std::vector<BNNSFilter> filters = {};
+  const TView src_view;
+  const TView dst_view;
+};
+
+/**
+ * Wrapper on top of BNNS::Primitive
+ *
+ * This primitive should be used for executing a primitive with two inputs.
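+ * (e.g. nn.batch_matmul when neither argument is a constant weight, see
+ * BNNSJSONRuntime::MatMul).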
+ */ +class TwoInputPrimitive : public Primitive { + public: + TwoInputPrimitive(const std::vector fs, const TView& src, const TView& src2, + const TView& dst) + : Primitive(fs, src, dst), src2_view(src2) {} + + private: + int execute_impl(int task_id) override { + const auto filter = this->filters[task_id]; + const auto src_view = this->src_view[task_id]; + const auto src2_view = this->src2_view[task_id]; + const auto dst_view = this->dst_view[task_id]; + + size_t mb = src_view.get_batch_size(); + + return BNNSFilterApplyTwoInputBatch(filter, mb, src_view.get_data_hdl(), src_view.get_stride(), + src2_view.get_data_hdl(), src2_view.get_stride(), + dst_view.get_data_hdl(), dst_view.get_stride()); + } + + protected: + const TView src2_view; +}; + +/** + * Wrapper on top of BNNS::Primitive + * + * This primitive should be used for executing normalization filter + */ +class NormPrimitive : public Primitive { + public: + using Primitive::Primitive; + + private: + int execute_impl(int task_id) override { + const auto filter = this->filters[task_id]; + const auto src_view = this->src_view[task_id]; + const auto dst_view = this->dst_view[task_id]; + + size_t mb = src_view.get_batch_size(); + return BNNSNormalizationFilterApplyBatch(filter, mb, src_view.get_data_hdl(), + src_view.get_stride(), dst_view.get_data_hdl(), + dst_view.get_stride(), false); + } +}; + +/** + * Wrapper on top of BNNS::Primitive + * + * This primitive should be used for executing pooling filter + */ +class PoolingPrimitive : public Primitive { + public: + using Primitive::Primitive; + + private: + int execute_impl(int task_id) override { + const auto filter = this->filters[task_id]; + const auto src_view = this->src_view[task_id]; + const auto dst_view = this->dst_view[task_id]; + + size_t mb = src_view.get_batch_size(); + return BNNSPoolingFilterApplyBatch(filter, mb, src_view.get_data_hdl(), src_view.get_stride(), + dst_view.get_data_hdl(), dst_view.get_stride(), nullptr, 0); + } +}; + +/** + * Function which split primitive into sub primitives to parallel execution + * + * @param num requested num of sub primitives + * @param orig_conv_param original convolution descriptor + * @param src_view source tensor view + * @param wgh_view weight tensor view + * @param b_view bias tensor view + * @param dst_view destination tensor view + * @param num number of part to split into + * @return collection of Convolution descriptors plus corresponding src/dst tensors view + */ +static std::tuple, TView, TView> split_to_n( + size_t num, const BNNSLayerParametersConvolution& orig_conv_param, const TView& src_view, + const TView& wgh_view, const TView& b_view, const TView& dst_view) { + size_t batch = src_view.get_batch_size(); + size_t oc = dst_view.get_bnns_view().size[2]; + size_t groups = orig_conv_param.groups; + + BNNS::TView src_view_new; + BNNS::TView wgh_view_new; + BNNS::TView b_view_new; + BNNS::TView dst_view_new; + + // TODO(apeskov): Add split by batch dim. Meanwhile we just disable it... 
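+  // A sketch of the splitting that is supported, assuming num == 2:
+  //   * regular conv, oc == 16    -> two sub convolutions with oc == 8; weights, bias and
+  //     dst are split through output channels, src is duplicated between sub primitives;
+  //   * grouped conv, groups == 4 -> two sub convolutions with groups == 2; src is split
+  //     as well, since its channels are partitioned between the groups.
+  // Everything else (batch > 1, oc % num != 0, groups % num != 0) falls back to the
+  // single original primitive below.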
+ if (batch > 1 || oc % num != 0 || (groups > 1 && groups % num != 0)) { + return {{orig_conv_param}, src_view, dst_view}; + } + + // if groups > 1 split only by groups + // otherwise split inside one convolution by output channels + if (groups > 1) { + src_view_new = src_view.party_split_n(num); + groups = groups / num; + } else { + src_view_new = src_view.party_duplicate_n(num); + } + + wgh_view_new = wgh_view.party_split_n(num); + b_view_new = b_view.party_split_n(num); + dst_view_new = dst_view.party_split_n(num); + + std::vector res(num); + for (size_t i = 0; i < num; i++) { + auto& cur = res[i]; + cur = orig_conv_param; + + cur.i_desc = src_view_new[i].get_bnns_view(); + cur.o_desc = dst_view_new[i].get_bnns_view(); + cur.w_desc = wgh_view_new[i].get_bnns_view(); + cur.bias = b_view_new[i].get_bnns_view(); + cur.groups = groups; + } + return {res, src_view_new, dst_view_new}; +} + +} // namespace BNNS +} // namespace contrib +} // namespace runtime +} // namespace tvm +#endif // TVM_RUNTIME_CONTRIB_BNNS_BNNS_WRP_H_ diff --git a/tests/cpp/contrib/bnns.cc b/tests/cpp/contrib/bnns.cc new file mode 100644 index 0000000000000..1efd487caff9f --- /dev/null +++ b/tests/cpp/contrib/bnns.cc @@ -0,0 +1,307 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +TEST(PackedFunc, Basic) { + using namespace tvm; + using namespace tvm::tir; + using namespace tvm::runtime; + int x = 0; + void* handle = &x; + DLTensor a; + + Var v = PackedFunc([&](TVMArgs args, TVMRetValue* rv) { + ICHECK(args.num_args == 3); + ICHECK(args.values[0].v_float64 == 1.0); + ICHECK(args.type_codes[0] == kDLFloat); + ICHECK(args.values[1].v_handle == &a); + ICHECK(args.type_codes[1] == kTVMDLTensorHandle); + ICHECK(args.values[2].v_handle == &x); + ICHECK(args.type_codes[2] == kTVMOpaqueHandle); + *rv = Var("a"); + })(1.0, &a, handle); + ICHECK(v->name_hint == "a"); +} + +TEST(PackedFunc, Node) { + using namespace tvm; + using namespace tvm::tir; + using namespace tvm::runtime; + Var x; + Var t = PackedFunc([&](TVMArgs args, TVMRetValue* rv) { + ICHECK(args.num_args == 1); + ICHECK(args[0].IsObjectRef()); + Var b = args[0]; + ICHECK(x.same_as(b)); + *rv = b; + })(x); + ICHECK(t.same_as(x)); +} + +TEST(PackedFunc, NDArray) { + using namespace tvm; + using namespace tvm::runtime; + auto x = NDArray::Empty({}, String2DLDataType("float32"), TVMContext{kDLCPU, 0}); + reinterpret_cast(x->data)[0] = 10.0f; + ICHECK(x.use_count() == 1); + + PackedFunc forward([&](TVMArgs args, TVMRetValue* rv) { *rv = args[0]; }); + + NDArray ret = PackedFunc([&](TVMArgs args, TVMRetValue* rv) { + NDArray y = args[0]; + DLTensor* ptr = args[0]; + ICHECK(ptr == x.operator->()); + ICHECK(x.same_as(y)); + ICHECK(x.use_count() == 2); + *rv = forward(y); + })(x); + ICHECK(ret.use_count() == 2); + ICHECK(ret.same_as(x)); +} + +TEST(PackedFunc, str) { + using namespace tvm; + using namespace tvm::runtime; + PackedFunc([&](TVMArgs args, TVMRetValue* rv) { + ICHECK(args.num_args == 1); + std::string x = args[0]; + ICHECK(x == "hello"); + String y = args[0]; + ICHECK(y == "hello"); + *rv = x; + })("hello"); + + PackedFunc([&](TVMArgs args, TVMRetValue* rv) { + ICHECK(args.num_args == 1); + runtime::String s = args[0]; + ICHECK(s == "hello"); + })(runtime::String("hello")); +} + +TEST(PackedFunc, func) { + using namespace tvm; + using namespace tvm::runtime; + PackedFunc addone([&](TVMArgs args, TVMRetValue* rv) { *rv = args[0].operator int() + 1; }); + // function as arguments + int r0 = PackedFunc([](TVMArgs args, TVMRetValue* rv) { + PackedFunc f = args[0]; + // TVMArgValue -> Arguments as function + *rv = f(args[1]).operator int(); + })(addone, 1); + ICHECK_EQ(r0, 2); + + int r1 = PackedFunc([](TVMArgs args, TVMRetValue* rv) { + // TVMArgValue -> TVMRetValue + *rv = args[1]; + })(2, 100); + ICHECK_EQ(r1, 100); + + int r2 = PackedFunc([&](TVMArgs args, TVMRetValue* rv) { + // re-assignment + *rv = args[0]; + // TVMRetValue -> Function argument + *rv = addone(args[0].operator PackedFunc()(args[1], 1)); + })(addone, 100); + ICHECK_EQ(r2, 102); +} + +TEST(PackedFunc, Expr) { + using namespace tvm; + using namespace tvm::runtime; + // automatic conversion of int to expr + PackedFunc addone([](TVMArgs args, TVMRetValue* rv) { + PrimExpr x = args[0]; + *rv = x.as()->value + 1; + }); + int r0 = PackedFunc([](TVMArgs args, TVMRetValue* rv) { + PackedFunc f = args[0]; + // TVMArgValue -> Arguments as function + *rv = f(args[1]).operator int(); + })(addone, 1); + ICHECK_EQ(r0, 2); +} + +TEST(PackedFunc, Type) { + using namespace tvm; + using namespace tvm::runtime; + auto get_type = PackedFunc([](TVMArgs args, TVMRetValue* rv) { + DataType x = args[0]; + *rv = x; + }); + auto get_type2 = PackedFunc([](TVMArgs args, TVMRetValue* rv) { *rv = args[0]; }); + ICHECK(get_type("int32").operator DataType() == 
DataType::Int(32)); + ICHECK(get_type("float").operator DataType() == DataType::Float(32)); + ICHECK(get_type2("float32x2").operator DataType() == DataType::Float(32, 2)); +} + +TEST(TypedPackedFunc, HighOrder) { + using namespace tvm; + using namespace tvm::runtime; + using Int1Func = TypedPackedFunc; + using Int2Func = TypedPackedFunc; + using BindFunc = TypedPackedFunc; + BindFunc ftyped; + ftyped = [](Int2Func f1, int value) -> Int1Func { + auto binded = [f1, value](int x) { return f1(value, x); }; + Int1Func x(binded); + return x; + }; + auto add = [](int x, int y) { return x + y; }; + ICHECK_EQ(ftyped(Int2Func(add), 1)(2), 3); + PackedFunc f = ftyped(Int2Func(add), 1); + ICHECK_EQ(f(3).operator int(), 4); + // call the type erased version. + Int1Func f1 = ftyped.packed()(Int2Func(add), 1); + ICHECK_EQ(f1(3), 4); +} + +TEST(TypedPackedFunc, Deduce) { + using namespace tvm::runtime; + using tvm::runtime::detail::function_signature; + + TypedPackedFunc x; + auto f = [](int x) -> int { return x + 1; }; + std::function y; + + static_assert(std::is_same::FType, int(float)>::value, + "invariant1"); + static_assert(std::is_same::FType, int(int)>::value, + "invariant2"); + static_assert(std::is_same::FType, void(float)>::value, + "invariant3"); +} + +TEST(PackedFunc, ObjectConversion) { + using namespace tvm; + using namespace tvm::tir; + using namespace tvm::runtime; + TVMRetValue rv; + auto x = NDArray::Empty({}, String2DLDataType("float32"), TVMContext{kDLCPU, 0}); + // assign null + rv = ObjectRef(); + ICHECK_EQ(rv.type_code(), kTVMNullptr); + + // Can assign NDArray to ret type + rv = x; + ICHECK_EQ(rv.type_code(), kTVMNDArrayHandle); + // Even if we assign base type it still shows as NDArray + rv = ObjectRef(x); + ICHECK_EQ(rv.type_code(), kTVMNDArrayHandle); + // Check convert back + ICHECK(rv.operator NDArray().same_as(x)); + ICHECK(rv.operator ObjectRef().same_as(x)); + ICHECK(!rv.IsObjectRef()); + + auto pf1 = PackedFunc([&](TVMArgs args, TVMRetValue* rv) { + ICHECK_EQ(args[0].type_code(), kTVMNDArrayHandle); + ICHECK(args[0].operator NDArray().same_as(x)); + ICHECK(args[0].operator ObjectRef().same_as(x)); + ICHECK(args[1].operator ObjectRef().get() == nullptr); + ICHECK(args[1].operator NDArray().get() == nullptr); + ICHECK(args[1].operator Module().get() == nullptr); + ICHECK(args[1].operator Array().get() == nullptr); + ICHECK(!args[0].IsObjectRef()); + }); + pf1(x, ObjectRef()); + pf1(ObjectRef(x), NDArray()); + + // testcases for modules + auto* pf = tvm::runtime::Registry::Get("runtime.SourceModuleCreate"); + ICHECK(pf != nullptr); + Module m = (*pf)("", "xyz"); + rv = m; + ICHECK_EQ(rv.type_code(), kTVMModuleHandle); + // Even if we assign base type it still shows as NDArray + rv = ObjectRef(m); + ICHECK_EQ(rv.type_code(), kTVMModuleHandle); + // Check convert back + ICHECK(rv.operator Module().same_as(m)); + ICHECK(rv.operator ObjectRef().same_as(m)); + ICHECK(!rv.IsObjectRef()); + + auto pf2 = PackedFunc([&](TVMArgs args, TVMRetValue* rv) { + ICHECK_EQ(args[0].type_code(), kTVMModuleHandle); + ICHECK(args[0].operator Module().same_as(m)); + ICHECK(args[0].operator ObjectRef().same_as(m)); + ICHECK(args[1].operator ObjectRef().get() == nullptr); + ICHECK(args[1].operator NDArray().get() == nullptr); + ICHECK(args[1].operator Module().get() == nullptr); + ICHECK(!args[0].IsObjectRef()); + }); + pf2(m, ObjectRef()); + pf2(ObjectRef(m), Module()); +} + +TEST(TypedPackedFunc, RValue) { + using namespace tvm; + using namespace tvm::runtime; + { + auto inspect = [](TVMArgs args, 
TVMRetValue* rv) { + for (int i = 0; i < args.size(); ++i) { + ICHECK_EQ(args[0].type_code(), kTVMObjectRValueRefArg); + } + }; + PackedFunc finspect(inspect); + finspect(tir::Var("x")); + } + { + auto f = [](tir::Var x, bool move) { + if (move) { + ICHECK(x.unique()); + } else { + ICHECK(!x.unique()); + } + ICHECK(x->name_hint == "x"); + return x; + }; + TypedPackedFunc tf(f); + + tir::Var var("x"); + ICHECK(var.unique()); + tf(var, false); + // move the result to the function. + tir::Var ret = tf(std::move(var), true); + ICHECK(!var.defined()); + } + + { + // pass child class. + auto f = [](PrimExpr x, bool move) { + if (move) { + ICHECK(x.unique()); + } else { + ICHECK(!x.unique()); + } + return x; + }; + TypedPackedFunc tf(f); + + tir::Var var("x"); + ICHECK(var.unique()); + tf(var, false); + tf(std::move(var), true); + // auto conversion. + tf(1, true); + } +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + testing::FLAGS_gtest_death_test_style = "threadsafe"; + return RUN_ALL_TESTS(); +} diff --git a/tests/python/contrib/test_bnns/__init__.py b/tests/python/contrib/test_bnns/__init__.py new file mode 100644 index 0000000000000..724b23f1378b5 --- /dev/null +++ b/tests/python/contrib/test_bnns/__init__.py @@ -0,0 +1,17 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Infrastructure and tests for BNNS""" diff --git a/tests/python/contrib/test_bnns/infrastructure.py b/tests/python/contrib/test_bnns/infrastructure.py new file mode 100644 index 0000000000000..0107de54a04f8 --- /dev/null +++ b/tests/python/contrib/test_bnns/infrastructure.py @@ -0,0 +1,330 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
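+
+# A typical test in this package follows the flow sketched below (a minimal example;
+# it assumes a macOS host where the BNNS runtime and codegen are enabled):
+#
+#   from .infrastructure import compare_inference_with_ref
+#   func, params = ...  # a relay function built from BNNS-supported ops
+#   compare_inference_with_ref(func, params)
+#
+# compare_inference_with_ref() compiles the function twice, with and without BNNS
+# offload, runs both builds and checks that the outputs match within tolerances.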
+
+from itertools import zip_longest, combinations
+import json
+import os
+import warnings
+
+import numpy as np
+
+import tvm
+from tvm import relay
+from tvm import rpc
+from tvm.contrib import graph_runtime
+from tvm.relay.op.contrib.bnns import partition_for_bnns
+from tvm.contrib import utils
+from tvm.autotvm.measure import request_remote
+from tvm.relay.analysis import analysis
+
+
+class Device:
+    """
+    Common device configuration for python tests.
+
+    Check tests/python/contrib/test_bnns/ for the presence of a test_config.json file.
+    This file can be used to override the default configuration here, which will attempt
+    to run the BNNS runtime tests locally if the runtime is available. Changing the
+    configuration will allow these runtime tests to be offloaded to a remote device
+    with BNNS via a tracker, for example.
+
+    Notes
+    -----
+    The test configuration will be loaded once when the class is created. If the configuration
+    changes between tests, any changes will not be picked up.
+
+
+    Attributes
+    ----------
+    connection_type : str
+        Details the type of RPC connection to use. Options:
+        local - Use the local device,
+        tracker - Connect to a tracker to request a remote device,
+        remote - Connect to a remote device directly.
+    host : str
+        Specify IP address or hostname of remote target.
+    port : int
+        Specify port number of remote target.
+    target : str
+        The compilation target.
+    device_key : str
+        The device key of the remote target. Use when connecting to a remote device via a tracker.
+    cross_compile : str
+        Specify path to cross compiler to use when connecting a remote device from a non-arm
+        platform.
+    """
+
+    connection_type = "local"
+    host = "localhost"
+    port = 9090
+    target = "llvm"
+    device_key = ""
+    cross_compile = ""
+
+    def __init__(self):
+        """Keep remote device for lifetime of object."""
+        self.device = self._get_remote()
+
+    @classmethod
+    def _get_remote(cls):
+        """Get a remote (or local) device to use for testing."""
+        if cls.connection_type == "tracker":
+            device = request_remote(cls.device_key, cls.host, cls.port, timeout=1000)
+        elif cls.connection_type == "remote":
+            device = rpc.connect(cls.host, cls.port)
+        elif cls.connection_type == "local":
+            device = rpc.LocalSession()
+        else:
+            raise ValueError(
+                "connection_type in test_config.json should be one of: local, tracker, remote."
+            )
+
+        return device
+
+    @classmethod
+    def load(cls, file_name):
+        """Load test config
+
+        Load the test configuration by looking for file_name relative
+        to the test_bnns directory.
+        """
+        location = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+        config_file = os.path.join(location, file_name)
+        if not os.path.exists(config_file):
+            warnings.warn("Config file doesn't exist, resuming tests with default config.")
+            return
+        with open(config_file, mode="r") as config:
+            test_config = json.load(config)
+
+        cls.connection_type = test_config["connection_type"]
+        cls.host = test_config["host"]
+        cls.port = test_config["port"]
+        cls.target = test_config["target"]
+        cls.device_key = test_config.get("device_key") or ""
+        cls.cross_compile = test_config.get("cross_compile") or ""
+
+
+Device.target = "llvm"
+
+
+def skip_runtime_test():
+    """Skip test if it requires the BNNS runtime and it's not present."""
+    # BNNS codegen not present.
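+    # Note: the second argument of tvm.get_global_func is allow_missing, so the call
+    # returns None instead of raising when the function is not registered.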
+ if not tvm.get_global_func("relay.ext.bnns", True): + print("Skip because BNNS codegen is not available.") + return True + return False + + +def skip_codegen_test(): + """Skip test if it requires the BNNS codegen and it's not present.""" + if not tvm.get_global_func("relay.ext.bnns", True): + print("Skip because BNNS codegen is not available.") + return True + + +def build_module(mod, target, params=None, enable_bnns=True, tvm_ops=0): + """Build module with option to build for BNNS.""" + if isinstance(mod, tvm.relay.expr.Call): + mod = tvm.IRModule.from_expr(mod) + with tvm.transform.PassContext(opt_level=3): + if enable_bnns: + mod = partition_for_bnns(mod) + relay.backend.compile_engine.get().clear() + return relay.build(mod, target=target, target_host=target, params=params) + + +def build_and_run( + mod, + inputs, + outputs, + params, + device, + enable_bnns=True, + no_runs=1, + tvm_ops=0, + config=None, +): + """Build and run the relay module.""" + if config is None: + config = {} + + try: + lib = build_module(mod, device.target, params, enable_bnns, tvm_ops) + except Exception as e: + err_msg = "The module could not be built.\n" + if config: + err_msg += f"The test failed with the following parameters: {config}\n" + err_msg += str(e) + raise Exception(err_msg) + + lib = update_lib(lib, device.device, device.cross_compile) + gen_module = graph_runtime.GraphModule(lib["default"](device.device.cpu(0))) + gen_module.set_input(**inputs) + out = [] + for _ in range(no_runs): + gen_module.run() + out.append([gen_module.get_output(i) for i in range(outputs)]) + return out + + +def update_lib(lib, device, cross_compile): + """Export the library to the remote/local device.""" + lib_name = "mod.so" + temp = utils.tempdir() + lib_path = temp.relpath(lib_name) + if cross_compile: + lib.export_library(lib_path, cc=cross_compile) + else: + lib.export_library(lib_path) + device.upload(lib_path) + lib = device.load_module(lib_name) + return lib + + +def extract_bnns_modules(module): + """Get the BNNS module(s) from llvm module.""" + return list(filter(lambda mod: mod.type_key == "bnns_json", module.get_lib().imported_modules)) + + +def verify(answers, atol, rtol, verify_saturation=False, config=None): + """Compare the array of answers. 
Each entry is a list of outputs.""" + if config is None: + config = {} + + if len(answers) < 2: + raise RuntimeError(f"No results to compare: expected at least two, found {len(answers)}") + for answer in zip_longest(*answers): + for outs in combinations(answer, 2): + try: + if verify_saturation: + assert ( + np.count_nonzero(outs[0].asnumpy() == 255) < 0.25 * outs[0].asnumpy().size + ), "Output is saturated: {}".format(outs[0]) + assert ( + np.count_nonzero(outs[0].asnumpy() == 0) < 0.25 * outs[0].asnumpy().size + ), "Output is saturated: {}".format(outs[0]) + tvm.testing.assert_allclose( + outs[0].asnumpy(), outs[1].asnumpy(), rtol=rtol, atol=atol + ) + except AssertionError as e: + err_msg = "Results not within the acceptable tolerance.\n" + if config: + err_msg += f"The test failed with the following parameters: {config}\n" + err_msg += str(e) + raise AssertionError(err_msg) + + +def verify_codegen( + module, + known_good_codegen, + num_bnns_modules, + tvm_ops=0, + target=Device.target, +): + """Check BNNS codegen against a known good output.""" + module = build_module(module, target, tvm_ops=tvm_ops) + bnns_modules = extract_bnns_modules(module) + + assert len(bnns_modules) == num_bnns_modules, ( + f"The number of BNNS modules produced ({len(bnns_modules)}) does not " + f"match the expected value ({num_bnns_modules})." + ) + + for mod in bnns_modules: + source = mod.get_source("json") + codegen = json.loads(source)["nodes"] + # remove input and const names as these cannot be predetermined + for node in range(len(codegen)): + if codegen[node]["op"] == "input" or codegen[node]["op"] == "const": + codegen[node]["name"] = "" + codegen_str = json.dumps(codegen, sort_keys=True, indent=2) + known_good_codegen_str = json.dumps(known_good_codegen, sort_keys=True, indent=2) + + assert codegen_str == known_good_codegen_str, ( + f"The JSON produced by codegen does not match the expected result. \n" + f"Actual={codegen_str} \n" + f"Expected={known_good_codegen_str}" + ) + + +def compare_inference_with_ref(func, params, atol=0.002, rtol=0.007): + """Compare scoring results for compilation with and without BNNS. + + Provided function will be compiled two times with and without BNNS. + The scoring results for both type of compilation will be compared + with provided atol and rtol. The input data will be automatically + generated based of shape and dtype info provided for var nodes. + + """ + # Generate input tensor values + inputs = {} + for free_param in analysis.free_vars(func): + name = free_param.name_hint + dtype = free_param.type_annotation.dtype + shape = [s.value for s in free_param.type_annotation.shape] + inputs[name] = tvm.nd.array(np.random.uniform(0, 127, shape).astype(dtype)) + + # Run for both type of compilation + device = Device() + outputs = [] + for bnns in [False, True]: + outputs.append(build_and_run(func, inputs, 1, params, device, enable_bnns=bnns)[0]) + + # Compare result tensors + verify(outputs, atol=atol, rtol=rtol) + + +def generate_trials(space, r_factor=3): + """Generates a series of trials. + + This algorithm generates a series of non-deterministic trials given a + space of options to test. A trial is generated by pulling a value from + each option in the space. On some occasions the values are shuffled to + ensure a different trial on each r_factor iteration. The algorithm ensures + that each value from an option is used at least once. The total number of + trials is determined by the r_factor * the option with the largest number + of values. 
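+
+    For example, space=[[3, 5], ["a", "b", "c"]] with r_factor=3 produces
+    3 * 3 = 9 trials, each pairing one value from the first option with one
+    value from the second.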
+
+    Parameters
+    ----------
+    space: List[List[Any]]
+        A list of different options with varying values to test.
+    r_factor: Optional[int]
+        The repeat factor.
+
+    Returns
+    -------
+    result: List[List]
+        A list of trials specifying values for each option.
+
+    """
+    np.random.seed(0)
+    max_len = 1
+    for option in space:
+        max_len = max(max_len, len(option))
+
+    num_trials = r_factor * max_len
+    trials = []
+    for i in range(num_trials):
+        trial = []
+        for option in space:
+            if i % len(option) == 0:
+                np.random.shuffle(option)
+            trial.append(option[i % len(option)])
+
+        trials.append(trial)
+
+    return trials
diff --git a/tests/python/contrib/test_bnns/test_conv2d.py b/tests/python/contrib/test_bnns/test_conv2d.py
new file mode 100644
index 0000000000000..886958cf30760
--- /dev/null
+++ b/tests/python/contrib/test_bnns/test_conv2d.py
@@ -0,0 +1,177 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""BNNS integration conv2d tests."""
+
+import numpy as np
+import pytest
+import tvm
+from tvm import relay
+
+from .infrastructure import skip_runtime_test, compare_inference_with_ref, generate_trials
+
+# TODO: Missing cases
+# 1. Bias as add with a 3D const tensor. Leads to an additional unsqueeze op in the graph.
+# 2. Check unsupported cases of fusion, e.g. bias add with axis != 1, or add with broadcast over spatial dims.
+# 3. Check the case where bias/weights are not constants. Should fall back to LLVM or be decomposed.
+# 4. Check the case where bias/weights are constant exprs. Should still work.
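+#
+# For illustration (this mirrors _get_model below, it is not an extra test
+# case), the bias variants exercised here correspond to the following Relay
+# patterns, with C = out_channels:
+#   "bias_add": relay.nn.bias_add(conv, const of shape (C,), axis=1)
+#   "add_3d":   relay.add(conv, const of shape (C, 1, 1))
+#   "add_4d":   relay.add(conv, const of shape (1, C, 1, 1))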
+
+
+def _get_model(
+    shape,
+    kernel=(3, 3),
+    padding=(1, 1),
+    strides=(1, 1),
+    dilation=(1, 1),
+    groups=1,
+    dtype="float32",
+    channels=-1,  # -1 means same as input channels
+    bias_type="none",
+    activation_type="none",
+):
+    """Return a model and any parameters it may have"""
+    if channels == -1:
+        channels = shape[1]
+
+    a = relay.var("a", shape=shape, dtype=dtype)
+    weight_shape = (channels, shape[1] // groups, *kernel)
+    w = tvm.nd.array(np.random.uniform(-128, 127, weight_shape).astype(dtype))
+    weights = relay.const(w, dtype)
+    out = relay.nn.conv2d(
+        a,
+        weights,
+        kernel_size=kernel,
+        dilation=dilation,
+        strides=strides,
+        padding=padding,
+        groups=groups,
+        channels=channels,
+        out_dtype=dtype,
+    )
+    params = {"w": w}
+    if bias_type == "bias_add":
+        b = tvm.nd.array(np.random.uniform(-10, 10, weight_shape[0]).astype(dtype))
+        biasc = relay.const(b, dtype)
+        out = relay.nn.bias_add(out, biasc, axis=1)
+        params["b"] = b
+    elif bias_type == "add_3d" or bias_type == "add_4d":
+        bias_shape = (
+            (weight_shape[0], 1, 1) if bias_type == "add_3d" else (1, weight_shape[0], 1, 1)
+        )
+        b = tvm.nd.array(np.random.uniform(-10, 10, bias_shape).astype(dtype))
+        biasc = relay.const(b, dtype)
+        out = relay.add(out, biasc)
+        params["b"] = b
+
+    if activation_type == "relu":
+        out = relay.nn.relu(out)
+    elif activation_type == "sigmoid":
+        out = relay.op.sigmoid(out)
+    return out, params
+
+
+@pytest.mark.skipif(skip_runtime_test(), reason="Skip because BNNS codegen is not available")
+def test_conv2d():
+    np.random.seed(0)
+
+    kernel_hs = [1, 2, 3, 5]
+    kernel_ws = [1, 2, 3, 5]
+    pad = [(1, 1), (2, 2), (2, 1)]
+    strides = [(1, 1), (2, 2)]
+    dilation = [(1, 1)]
+    out_channels = [1, 4, 8, 16]
+    input_shapes = [(10, 10, 14), (12, 15, 16), (20, 20, 20)]
+    batches = [1, 2]
+    groups = [1, 2]
+    bias_kind = ["none", "add_3d", "add_4d", "bias_add"]
+    activation_kind = ["none", "relu", "sigmoid"]
+    trials = generate_trials(
+        [
+            kernel_hs,
+            kernel_ws,
+            pad,
+            strides,
+            dilation,
+            out_channels,
+            input_shapes,
+            groups,
+            batches,
+            bias_kind,
+            activation_kind,
+        ],
+        3,
+    )
+
+    for (
+        kernel_h,
+        kernel_w,
+        pad,
+        stride,
+        dilation,
+        out_channels,
+        input_shapes,
+        group,
+        batch,
+        bias,
+        activation,
+    ) in trials:
+        if out_channels % group != 0:
+            continue
+        func, params = _get_model(
+            shape=(batch, *input_shapes),
+            kernel=(kernel_h, kernel_w),
+            padding=pad,
+            strides=stride,
+            dilation=dilation,
+            groups=group,
+            channels=out_channels,
+            bias_type=bias,
+            activation_type=activation,
+        )
+        compare_inference_with_ref(func, params)
+
+
+@pytest.mark.skipif(skip_runtime_test(), reason="Skip because BNNS codegen is not available")
+def test_conv2d_dw():
+    if skip_runtime_test():
+        return
+
+    np.random.seed(0)
+    shape = [4, 5, 5]
+
+    for batch in [1, 2]:
+        mod, params = _get_model(shape=(batch, *shape), groups=shape[0])
+        compare_inference_with_ref(mod, params)
+
+
+@pytest.mark.skipif(skip_runtime_test(), reason="Skip because BNNS codegen is not available")
+def test_conv2d_with_oc1():
+    if skip_runtime_test():
+        return
+
+    np.random.seed(0)
+    shape = [3, 5, 5]
+
+    for batch in [1, 2]:
+        for bias in ["none", "add_4d"]:
+            mod, params = _get_model(shape=(batch, *shape), channels=1, bias_type=bias)
+            compare_inference_with_ref(mod, params)
+
+
+if __name__ == "__main__":
+    test_conv2d()
+    test_conv2d_dw()
+    test_conv2d_with_oc1()
diff --git a/tests/python/contrib/test_bnns/test_conv2d_patterns.py b/tests/python/contrib/test_bnns/test_conv2d_patterns.py
new file mode 100644
index
0000000000000..b10504bbc9611 --- /dev/null +++ b/tests/python/contrib/test_bnns/test_conv2d_patterns.py @@ -0,0 +1,107 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""BNNS pattern detection check""" + +import tvm +from tvm import relay +import numpy as np + +from tvm.relay.op.contrib.bnns import partition_for_bnns + +fp32 = "float32" + + +def partition(exp): + """Apply BNNS specific partitioning transformation""" + mod = tvm.IRModule.from_expr(exp) + with tvm.transform.PassContext(opt_level=3): + mod = partition_for_bnns(mod) + return mod + + +def is_op_fused(func, op_name): + is_fused = False + + def visit(op): + if ( + isinstance(op, tvm.relay.function.Function) + and op_name in op.attrs["PartitionedFromPattern"] + ): + nonlocal is_fused + is_fused = True + + tvm.relay.analysis.post_order_visit(func.body, visit) + return is_fused + + +def test_pattern_conv2d_with_bias_add(): + for axis in (1, 2): + a = relay.var("a", shape=(2, 7, 8, 8), dtype=fp32) + w = relay.const(np.random.uniform(-10, 10, (8, 7, 3, 3)).astype(fp32)) + res = relay.nn.conv2d(a, w, kernel_size=(3, 3), padding=(1, 1), channels=8, out_dtype=fp32) + b = relay.const(np.random.uniform(-10, 10, 8).astype(fp32)) + res = relay.nn.bias_add(res, b, axis=axis) + + mod = partition(res) + bias_is_fused = is_op_fused(mod["bnns_0"], "nn.bias_add") + + assert bias_is_fused if axis == 1 else not bias_is_fused + + +def test_pattern_conv2d_with_add(): + workloads = {8: False, (8, 1): False, (8, 1, 1): True, (1, 8, 1, 1): True} + + for b_shape, should_be_fused in workloads.items(): + a = relay.var("a", shape=(2, 7, 8, 8), dtype=fp32) + w = relay.const(np.random.uniform(-10, 10, (8, 7, 3, 3)).astype(fp32)) + res = relay.nn.conv2d(a, w, kernel_size=(3, 3), padding=(1, 1), channels=8, out_dtype=fp32) + b = relay.const(np.random.uniform(-10, 10, b_shape).astype(fp32)) + res = relay.add(res, b) + + mod = partition(res) + bias_is_fused = is_op_fused(mod["bnns_0"], "add") + + assert bias_is_fused == should_be_fused + + +def test_pattern_conv2d_with_non_cons_weights(): + for const_weights in (True, False): + a = relay.var("a", shape=(2, 7, 8, 8), dtype=fp32) + if const_weights: + w = relay.const(np.random.uniform(-10, 10, (8, 7, 3, 3)).astype(fp32)) + else: + w = relay.var("w", shape=(8, 7, 3, 3), dtype=fp32) + + res = relay.nn.conv2d(a, w, kernel_size=(3, 3), padding=(1, 1), channels=8, out_dtype=fp32) + + mod = partition(res) + use_bnns = len(mod.get_global_vars()) == 2 # GlobalVar: "main" and "bnns_0" + + assert use_bnns == const_weights + + +def test_pattern_conv2d_with_non_cons_bias(): + a = relay.var("a", shape=[2, 7, 8, 8], dtype=fp32) + w = relay.const(np.random.uniform(-10, 10, (8, 7, 3, 3)).astype(fp32)) + res = relay.nn.conv2d(a, w, kernel_size=(3, 3), padding=(1, 1), channels=8, out_dtype=fp32) + 
b = relay.var("b", shape=[8], dtype=fp32) + res = relay.nn.bias_add(res, b, axis=1) + + mod = partition(res) + bias_is_fused = is_op_fused(mod["bnns_0"], "nn.bias_add") + + assert not bias_is_fused diff --git a/tests/python/contrib/test_bnns/test_dense.py b/tests/python/contrib/test_bnns/test_dense.py new file mode 100644 index 0000000000000..c2cf9bf71373b --- /dev/null +++ b/tests/python/contrib/test_bnns/test_dense.py @@ -0,0 +1,190 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""BNNS integration dense tests.""" + +import numpy as np +import math +import pytest +import tvm +from tvm import relay +from .infrastructure import ( + Device, + skip_runtime_test, + skip_codegen_test, + build_and_run, + verify, + verify_codegen, + generate_trials, +) + + +def _get_model(shape, weight_shape, units, dtype, var_names, has_bias=False, has_gelu=False): + """Return a model and any parameters it may have""" + a = relay.var(next(var_names), shape=shape, dtype=dtype) + w = tvm.nd.array(np.random.uniform(-128, 127, weight_shape).astype(dtype)) + weights = relay.const(w, dtype) + out = relay.nn.dense(a, weights, units=units, out_dtype=dtype) + params = {"w": w} + if has_bias: + b = tvm.nd.array(np.random.randint(-128, 127, weight_shape[0]).astype(dtype)) + biasc = relay.const(b, dtype) + out = relay.op.add(out, biasc) + params["b"] = b + if has_gelu: + const1 = relay.const(0.044715) + const2 = relay.const(math.sqrt(2 / math.pi)) + bias = out + out = relay.op.power(bias, relay.const(3.0, "float32")) + out = relay.op.multiply(out, const1) + out = relay.op.add(out, bias) + out = relay.op.multiply(out, const2) + out = relay.op.tanh(out) + out = relay.op.add(out, relay.const(1, "float32")) + out = relay.op.multiply(out, relay.const(0.5)) + out = relay.op.multiply(out, bias) + return out, params + + +def _get_expected_codegen(shape, weight_shape, units, dtype, has_bias=False, has_gelu=False): + output_shape = (shape[0], units) + name = "nn.dense" + if has_bias is True: + name = "bnns.dense_bias" + if has_bias is True and has_gelu is True: + name = "bnns.dense_bias_gelu" + + node = { + "op": "kernel", + "name": name, + "inputs": [], + "attrs": { + "num_outputs": "1", + "out_dtype": [["float32"]], + "shape": [[list(output_shape)]], + "dtype": [[dtype]], + "units": [[str(units)]], + }, + } + + inputs = [ + {"op": "input", "name": "", "attrs": {"shape": [[list(shape)]], "dtype": [[str(dtype)]]}}, + { + "op": "const", + "name": "", + "attrs": {"shape": [[list(weight_shape)]], "dtype": [[str(dtype)]]}, + }, + ] + + if has_bias: + inputs.append( + { + "op": "const", + "name": "", + "attrs": {"shape": [[[weight_shape[0]]]], "dtype": [["float32"]]}, + } + ) + + input_idx = 0 + for _ in range(len(inputs)): + node["inputs"].append([input_idx, 0, 0]) + input_idx += 1 + 
node["attrs"]["num_inputs"] = str(len(inputs)) + inputs.append(node) + return inputs + + +@pytest.mark.skipif(skip_runtime_test(), reason="Skip because BNNS codegen is not available") +def test_dense(): + device = Device() + np.random.seed(0) + + dtype = ["float32"] + shape = [ + ((1, 128), (16, 128), 16), + ((32, 32), (32, 32), 32), + ((1, 64), (1, 64), 1), + ((11, 2), (2, 2), 2), + ((2, 2), (1, 2), 1), + ] + composite = [False, True] + trials = generate_trials([dtype, shape, composite, composite], 3) + + for dtype, (shape, weight_shape, units), with_bias, with_gelu in trials: + outputs = [] + inputs = {"a": tvm.nd.array(np.random.uniform(-128, 127, shape).astype(dtype))} + func, params = _get_model( + shape, + weight_shape, + units, + dtype, + var_names=iter(inputs), + has_bias=with_bias, + has_gelu=with_gelu, + ) + for bnns in [False, True]: + outputs.append( + build_and_run( + func, + inputs, + 1, + params, + device, + enable_bnns=bnns, + )[0] + ) + + config = { + "shape": shape, + "weight_shape": weight_shape, + "units": units, + "dtype": dtype, + "with_bias": with_bias, + "with_gelu": with_gelu, + } + verify(outputs, atol=0.001, rtol=0.01, config=config) + + +@pytest.mark.skipif(skip_codegen_test(), reason="Skip because BNNS codegen is not available") +def test_codegen_dense(): + np.random.seed(0) + + dtype = ["float32"] + shape = [ + ((1, 128), (16, 128), 16), + ((32, 32), (32, 32), 32), + ((1, 64), (1, 64), 1), + ((11, 2), (2, 2), 2), + ((2, 2), (1, 2), 1), + ] + composite = [False, True] + trials = generate_trials([dtype, shape, composite, composite], 3) + + for dtype, (shape, weight_shape, units), with_bias, with_gelu in trials: + inputs = {"a"} + + args = (shape, weight_shape, units, dtype) + + func, params = _get_model( + *args, var_names=iter(inputs), has_bias=with_bias, has_gelu=with_gelu + ) + exp_codegen = _get_expected_codegen(*args, has_bias=with_bias, has_gelu=with_gelu) + verify_codegen(func, exp_codegen, 1) + + +if __name__ == "__main__": + test_dense() + test_codegen_dense() diff --git a/tests/python/contrib/test_bnns/test_matmul.py b/tests/python/contrib/test_bnns/test_matmul.py new file mode 100644 index 0000000000000..7bf4d48f8e881 --- /dev/null +++ b/tests/python/contrib/test_bnns/test_matmul.py @@ -0,0 +1,113 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""BNNS integration dense tests.""" + +import numpy as np +import math +import pytest +import tvm +from tvm import relay +from tvm import testing +from .infrastructure import ( + Device, + skip_runtime_test, + skip_codegen_test, + verify_codegen, + build_and_run, + verify, + generate_trials, +) + + +def _get_model(a_shape, b_shape, dtype, var_names, is_a_constant=False, is_b_constant=False): + """Return a model and any parameters it may have""" + a = relay.var(next(var_names), shape=a_shape, dtype=dtype) + b = relay.var(next(var_names), shape=b_shape, dtype=dtype) + params = {} + if is_b_constant is True: + b = tvm.nd.array(np.random.uniform(-128, 127, b_shape).astype(dtype)) + params["b"] = b + b = relay.const(b, dtype) + if is_a_constant is True: + a = tvm.nd.array(np.random.uniform(-128, 127, a_shape).astype(dtype)) + params["a"] = a + a = relay.const(a, dtype) + out = relay.nn.batch_matmul(a, b) + return out, params + + +@pytest.mark.skipif(skip_runtime_test(), reason="Skip because BNNS codegen is not available") +def test_matmul(): + device = Device() + np.random.seed(0) + dtype = "float32" + + # C[N, I, J] = A[N, I, K] * B[N, J, K] + shapes_config = [ + # B, I, J, K + [1, 4, 4, 3], + [1, 16, 32, 32], + [2, 1, 1, 3], + [2, 16, 32, 32], + [5, 1, 1, 3], + ] + data_config = [ + # A_is_constant, B_is_constant + [False, True], + [True, False], + [False, False], + ] + + for N, I, J, K in shapes_config: + a_shape = [N, I, K] + b_shape = [N, J, K] + for is_a_constant, is_b_constant in data_config: + outputs = [] + inputs = { + "a": tvm.nd.array(np.random.uniform(-128, 127, a_shape).astype(dtype)), + "b": tvm.nd.array(np.random.uniform(-128, 127, b_shape).astype(dtype)), + } + func, params = _get_model( + a_shape, + b_shape, + dtype, + var_names=iter(inputs), + is_a_constant=is_a_constant, + is_b_constant=is_b_constant, + ) + for enable_bnns in [False, True]: + outputs.append( + build_and_run( + func, + inputs, + 1, + params, + device, + enable_bnns=enable_bnns, + )[0] + ) + + config = { + "a_shape": a_shape, + "b_shape": b_shape, + "dtype": dtype, + } + verify(outputs, atol=0.001, rtol=0.01, config=config) + + +if __name__ == "__main__": + test_matmul() diff --git a/tests/python/contrib/test_bnns/test_normalization.py b/tests/python/contrib/test_bnns/test_normalization.py new file mode 100644 index 0000000000000..094cfb041c3c8 --- /dev/null +++ b/tests/python/contrib/test_bnns/test_normalization.py @@ -0,0 +1,201 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""BNNS integration normalization tests.""" + +import numpy as np +import math +import pytest +import tvm +from tvm import relay +from tvm import testing +from .infrastructure import ( + Device, + skip_runtime_test, + skip_codegen_test, + verify_codegen, + build_and_run, + verify, + generate_trials, +) + + +def _get_model( + shape, b_shape, s_shape, dtype, var_names, axis=1, epsilon=1e-5, center=True, scale=True +): + """Return a model and any parameters it may have""" + src = relay.var(next(var_names), shape=shape, dtype=dtype) + params = {} + b = tvm.nd.array(np.random.uniform(-128, 127, b_shape).astype(dtype)) + params["b"] = b + b = relay.const(b, dtype) + s = tvm.nd.array(np.random.uniform(-128, 127, b_shape).astype(dtype)) + params["b"] = s + s = relay.const(s, dtype) + out = relay.nn.instance_norm(src, s, b, axis, epsilon, center, scale) + + return out, params + + +def _get_expected_codegen(shape, axis, center, scale, dtype, offload_on_bnns): + output_shape = shape + name = "nn.instance_norm" + + node = { + "op": "kernel", + "name": name, + "inputs": [], + "attrs": { + "num_outputs": "1", + "axis": [[str(axis)]], + "center": [[str(int(center))]], + "scale": [[str(int(scale))]], + "shape": [[list(output_shape)]], + "dtype": [[dtype]], + "epsilon": [["1.0000000000000001e-05"]], + }, + } + + inputs = [ + {"op": "input", "name": "", "attrs": {"shape": [[list(shape)]], "dtype": [[str(dtype)]]}}, + { + "op": "const", + "name": "", + "attrs": {"shape": [[[shape[axis]]]], "dtype": [[str(dtype)]]}, + }, + { + "op": "const", + "name": "", + "attrs": {"shape": [[[shape[axis]]]], "dtype": [[str(dtype)]]}, + }, + ] + + input_idx = 0 + for _ in range(len(inputs)): + node["inputs"].append([input_idx, 0, 0]) + input_idx += 1 + node["attrs"]["num_inputs"] = str(len(inputs)) + inputs.append(node) + return inputs + + +@pytest.mark.skipif(skip_runtime_test(), reason="Skip because BNNS codegen is not available") +def test_normalization(): + device = Device() + np.random.seed(0) + dtype = "float32" + + shapes_config = [ + [1, 2, 3, 4], + [3, 2, 3, 4], + [2, 2, 3], + [16, 32, 32], + [5, 3], + ] + axes = [-1, 0, 1, 2] + + for shape in shapes_config: + for axis in axes: + if len(shape) == 2 and axis != 0: + continue + for center in [False, True]: + for scale in [False, True]: + outputs = [] + inputs = { + "src": tvm.nd.array(np.random.uniform(-128, 127, shape).astype(dtype)), + } + func, params = _get_model( + shape, + [shape[axis]], + [shape[axis]], + dtype, + var_names=iter(inputs), + axis=axis, + center=center, + scale=scale, + ) + for enable_bnns in [False, True]: + outputs.append( + build_and_run( + func, + inputs, + 1, + params, + device, + enable_bnns=enable_bnns, + )[0] + ) + + config = { + "dtype": dtype, + } + verify(outputs, atol=0.001, rtol=0.01, config=config) + + +@pytest.mark.skipif(skip_codegen_test(), reason="Skip because BNNS codegen is not available") +def test_codegen_normalization(): + np.random.seed(0) + + dtype = "float32" + shapes_config = [ + [1, 2, 3, 4], + [3, 2, 3, 4], + [2, 2, 3], + [16, 32, 32], + [5, 3], + ] + axes = [-1, 0, 1, 2] + + def check_normalization(rank, axis): + if rank < 3 or rank > 4: + return False + if axis == 0 and rank == 3 or axis == 1 and rank == 4: + return True + return False + + for shape in shapes_config: + for axis in axes: + if len(shape) == 2 and axis != 0: + continue + for center in [False, True]: + for scale in [False, True]: + inputs = {"src"} + + args = (shape, axis, center, scale, dtype) + + func, params = _get_model( + shape, + [shape[axis]], + 
+                        dtype,
+                        var_names=iter(inputs),
+                        axis=axis,
+                        center=center,
+                        scale=scale,
+                    )
+
+                    offload_on_bnns = check_normalization(len(shape), axis)
+                    bnns_blocks = 1 if offload_on_bnns else 0
+                    exp_codegen = _get_expected_codegen(*args, offload_on_bnns)
+                    verify_codegen(func, exp_codegen, bnns_blocks)
+
+
+if __name__ == "__main__":
+    test_normalization()
+    test_codegen_normalization()
diff --git a/tests/python/contrib/test_bnns/test_onnx_topologies.py b/tests/python/contrib/test_bnns/test_onnx_topologies.py
new file mode 100644
index 0000000000000..86f98eb6e8de0
--- /dev/null
+++ b/tests/python/contrib/test_bnns/test_onnx_topologies.py
@@ -0,0 +1,140 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""BNNS integration tests for whole ONNX topologies."""
+
+import pytest
+
+import tvm
+from tvm import relay
+from tvm.relay import transform
+from tvm.contrib import utils, graph_runtime
+from tvm.contrib.download import download_testdata
+from tvm.relay.op.contrib.bnns import partition_for_bnns
+
+import numpy as np
+
+onnx = pytest.importorskip("onnx")
+
+bnns_is_absent = tvm.get_global_func("relay.ext.bnns", True) is None
+
+TARGET = "llvm"
+INPUT_SHAPE = [1, 3, 224, 224]
+
+BASE_MODEL_URL = "https://github.com/onnx/models/raw/master/"
+MODEL_URL_COLLECTION = {
+    "BERT": "text/machine_comprehension/bert-squad/model/bertsquad-10.onnx",
+    "MobileNet-v2": "vision/classification/mobilenet/model/mobilenetv2-7.onnx",
+    "ResNet50-v1": "vision/classification/resnet/model/resnet50-v1-7.onnx",
+    "ResNet50-v2": "vision/classification/resnet/model/resnet50-v2-7.onnx",
+    "SqueezeNet-v1.1": "vision/classification/squeezenet/model/squeezenet1.1-7.onnx",
+    "SqueezeNet-v1.0": "vision/classification/squeezenet/model/squeezenet1.0-7.onnx",
+    "Inception-v1": "vision/classification/inception_and_googlenet/inception_v1/model/inception-v1-7.onnx",
+    "Inception-v2": "vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-7.onnx",
+}
+
+
+def get_onnx_input_name(model):
+    inputs = [node.name for node in model.graph.input]
+    initializer = [node.name for node in model.graph.initializer]
+
+    inputs = list(set(inputs) - set(initializer))
+    return inputs
+
+
+def get_model_url(model_name):
+    return BASE_MODEL_URL + MODEL_URL_COLLECTION[model_name]
+
+
+def get_name_from_url(url):
+    return url[url.rfind("/") + 1 :].strip()
+
+
+def find_or_download(model_name):
+    model_url = get_model_url(model_name)
+    model_file_name = get_name_from_url(model_url)
+    return download_testdata(model_url, model_file_name, module="models")
+
+
+def get_model(model_name):
+    model_path = find_or_download(model_name)
+    onnx_model = onnx.load(model_path)
+    input_names = get_onnx_input_name(onnx_model)
+    input_dict = {}
+    for name in input_names:
+        input_dict[name] =
INPUT_SHAPE # TODO: hardcode + mod, params = relay.frontend.from_onnx(onnx_model, input_dict, freeze_params=True) + return mod, params, input_dict + + +def simplify_model(mod): + """ + Simplify execution graph + + At least merge BatchNorm into convolution. For this purpose decompose BN primitive + into simple operation which can be calculated as const expr and after that merged + into nearest conv/dense primitive. + """ + seq = tvm.transform.Sequential( + [ + transform.InferType(), + transform.FoldConstant(), + transform.SimplifyInference(), + transform.FoldScaleAxis(), + ] + ) + return seq(mod) + + +def process(model_name): + temp = utils.tempdir() + model, params, input_dict = get_model(model_name) + + def run(mod, target, simplify=True, with_bnns=False): + with tvm.transform.PassContext(opt_level=3): + if simplify: + mod = simplify_model(mod) + if with_bnns: + mod = partition_for_bnns(mod) + graph_module = relay.build(mod, target=target, target_host=target, params=params) + + lib_name = "deploy.tar" + path_dso = temp.relpath(lib_name) + graph_module.export_library(path_dso) + + ctx = tvm.cpu(0) + loaded_lib = tvm.runtime.load_module(path_dso) + + module = graph_runtime.GraphModule(loaded_lib["default"](ctx)) + module.run() + return module.get_output(0).asnumpy() + + res_llvm = run(model, TARGET, simplify=True, with_bnns=False) + res_bnns = run(model, TARGET, simplify=True, with_bnns=True) + + tvm.testing.assert_allclose( + res_llvm, + res_bnns, + atol=0.002, + rtol=0.007, + ) + + +@pytest.mark.skip(reason="Manually disabled because of huge complexity") +@pytest.mark.skipif(bnns_is_absent, reason="BNNS runtime is absent") +@pytest.mark.parametrize("model_name", MODEL_URL_COLLECTION.keys()) +def test_topology(model_name): + process(model_name) diff --git a/tests/python/contrib/test_bnns/test_pooling.py b/tests/python/contrib/test_bnns/test_pooling.py new file mode 100644 index 0000000000000..77a78d4bf7e1f --- /dev/null +++ b/tests/python/contrib/test_bnns/test_pooling.py @@ -0,0 +1,289 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""BNNS integration pooling tests.""" + +import numpy as np +import pytest +import tvm +from tvm import relay +from tvm import testing +from .infrastructure import ( + skip_runtime_test, + skip_codegen_test, + build_and_run, + verify, + verify_codegen, +) +from .infrastructure import Device + + +def _calculate_output_shape(shape, sizes, padding, strides): + """Calculate pooling output shape.""" + output_height = ((shape[2] - sizes[0] + padding[0] + padding[2]) / strides[0]) + 1 + output_width = ((shape[3] - sizes[1] + padding[1] + padding[3]) / strides[1]) + 1 + return 1, shape[1], int(output_height), int(output_width) + + +def _get_pooling_model( + shape, dtype, typef, sizes, strides, padding, ceil_mode, count_include_pad, var_names +): + """Return a model and any parameters it may have.""" + if len(padding) == 2: + padding = (padding[0], padding[1], padding[0], padding[1]) + out = relay.var(next(var_names), shape=shape, dtype=dtype) + + if typef == "nn.max_pool2d": + out = relay.nn.max_pool2d( + out, + pool_size=sizes, + strides=strides, + padding=padding, + ceil_mode=ceil_mode, + ) + elif typef == "nn.avg_pool2d": + out = relay.nn.avg_pool2d( + out, + pool_size=sizes, + strides=strides, + padding=padding, + ceil_mode=ceil_mode, + count_include_pad=count_include_pad, + ) + else: + raise ValueError("Function not supported") + + return out + + +def _get_global_pooling_model(shape, dtype, typef, var_names): + """Return a model and any parameters it may have.""" + out = relay.var(next(var_names), shape=shape, dtype=dtype) + + if typef == "nn.global_max_pool2d": + out = relay.nn.global_max_pool2d(out) + elif typef == "nn.global_avg_pool2d": + out = relay.nn.global_avg_pool2d(out) + else: + raise ValueError("Function not supported") + + return out + + +def _get_expected_pooling_codegen( + shape, dtype, typef, sizes, strides, padding, ceil_mode, count_include_pad +): + if len(padding) == 2: + padding = (padding[0], padding[1], padding[0], padding[1]) + output_shape = _calculate_output_shape(shape, sizes, padding, strides) + + node = { + "op": "kernel", + "name": typef, + "inputs": [[0, 0, 0]], + "attrs": { + "num_inputs": "1", + "num_outputs": "1", + "layout": [["NCHW"]], + "shape": [[list(output_shape)]], + "dtype": [[dtype]], + "padding": [[str(p) for p in padding]], + "strides": [[str(s) for s in strides]], + "pool_size": [[str(s) for s in sizes]], + "ceil_mode": [[str(1 if ceil_mode else 0)]], + }, + } + + if typef == "nn.avg_pool2d" or typef == "nn.l2_pool2d": + node["attrs"]["count_include_pad"] = [["1" if count_include_pad else "0"]] + + input = {"op": "input", "name": "", "attrs": {"shape": [[list(shape)]], "dtype": [[dtype]]}} + return [input, node] + + +def _get_expected_global_pooling_codegen(shape, dtype, typef): + node = { + "op": "kernel", + "name": typef, + "inputs": [[0, 0, 0]], + "attrs": { + "num_inputs": "1", + "num_outputs": "1", + "layout": [["NCHW"]], + "shape": [[[1, shape[1], 1, 1]]], + "dtype": [[dtype]], + }, + } + + input = {"op": "input", "name": "", "attrs": {"shape": [[list(shape)]], "dtype": [[dtype]]}} + return [input, node] + + +@pytest.mark.skipif(skip_runtime_test(), reason="Skip because BNNS codegen is not available") +def test_pooling(): + device = Device() + np.random.seed(0) + + dtype = "float32" + trials = [ + ["nn.max_pool2d", (3, 3), (2, 2), (0, 0), False, False, (27, 27, 512)], + ["nn.max_pool2d", (2, 2), (2, 2), (0, 0), False, True, (16, 16, 16)], + ["nn.max_pool2d", (3, 3), (2, 2), (1, 1), True, True, (15, 15, 16)], + ["nn.max_pool2d", (2, 2), (2, 
2), (0, 1), False, False, (16, 16, 16)], + ["nn.avg_pool2d", (2, 2), (2, 2), (1, 1), False, False, (16, 16, 16)], + ["nn.avg_pool2d", (2, 2), (2, 2), (0, 0), False, True, (16, 16, 16)], + ["nn.avg_pool2d", (3, 3), (2, 2), (0, 1), True, False, (15, 15, 16)], + ] + + for ( + typef, + size, + stride, + pad, + ceil_mode, + count_include_pad, + input_shape, + ) in trials: + shape = (1, *input_shape) + outputs = [] + inputs = { + "a": tvm.nd.array(np.random.uniform(-127, 128, shape).astype(dtype)), + } + + func = _get_pooling_model( + shape, dtype, typef, size, stride, pad, ceil_mode, count_include_pad, iter(inputs) + ) + + config = { + "size": size, + "stride": stride, + "shape": shape, + "pooling type": typef, + "dtype": dtype, + "padding": pad, + "ceil_mode": ceil_mode, + "count_include_pad": count_include_pad, + "inputs": inputs, + } + + params = None + for enable_bnns in [False, True]: + outputs.append( + build_and_run( + func, inputs, 1, params, device, enable_bnns=enable_bnns, config=config + )[0] + ) + + verify(outputs, atol=0.001, rtol=0.001, config=config) + + +@pytest.mark.skipif(skip_runtime_test(), reason="Skip because BNNS codegen is not available") +def test_global_pooling(): + device = Device() + np.random.seed(0) + + dtype = "float32" + + trials = [ + ["nn.global_max_pool2d", (8, 8, 16)], + ["nn.global_max_pool2d", (9, 9, 16)], + ["nn.global_max_pool2d", (8, 8, 16)], + ["nn.global_avg_pool2d", (8, 8, 16)], + ["nn.global_avg_pool2d", (8, 8, 16)], + ["nn.global_avg_pool2d", (9, 9, 16)], + ] + + for typef, input_shape in trials: + shape = (1, *input_shape) + outputs = [] + inputs = { + "a": tvm.nd.array(np.random.uniform(-127, 128, shape).astype(dtype)), + } + + func = _get_global_pooling_model(shape, dtype, typef, iter(inputs)) + config = { + "shape": shape, + "pooling type": typef, + "dtype": dtype, + } + + for enable_bnns in [False, True]: + outputs.append( + build_and_run( + func, inputs, 1, None, device, enable_bnns=enable_bnns, config=config + )[0] + ) + + verify(outputs, atol=0.001, rtol=0.001, config=config) + + +@pytest.mark.skipif(skip_codegen_test(), reason="Skip because BNNS codegen is not available") +def test_codegen_pooling(): + dtype = "float32" + + trials = [ + ["nn.max_pool2d", (2, 2), (2, 2), (0, 0), False, True, (16, 16, 16)], + ["nn.max_pool2d", (3, 3), (2, 2), (1, 1), True, True, (15, 15, 16)], + ["nn.max_pool2d", (2, 2), (2, 2), (0, 1), False, False, (16, 16, 16)], + ["nn.avg_pool2d", (2, 2), (2, 2), (1, 1), False, False, (16, 16, 16)], + ["nn.avg_pool2d", (2, 2), (2, 2), (0, 0), False, True, (16, 16, 16)], + ["nn.avg_pool2d", (3, 3), (2, 2), (0, 1), True, False, (15, 15, 16)], + ] + + for ( + typef, + size, + stride, + pad, + ceil_mode, + count_include_pad, + input_shape, + ) in trials: + shape = (1, *input_shape) + inputs = {"a"} + args = (shape, dtype, typef, size, stride, pad, False, False) + func = _get_pooling_model(*args, iter(inputs)) + exp_codegen = _get_expected_pooling_codegen(*args) + verify_codegen(func, exp_codegen, 1) + + +@pytest.mark.skipif(skip_codegen_test(), reason="Skip because BNNS codegen is not available") +def test_codegen_global_pooling(): + dtype = "float32" + + trials = [ + ["nn.global_max_pool2d", (8, 8, 16)], + ["nn.global_max_pool2d", (9, 9, 16)], + ["nn.global_max_pool2d", (8, 8, 16)], + ["nn.global_avg_pool2d", (8, 8, 16)], + ["nn.global_avg_pool2d", (8, 8, 16)], + ["nn.global_avg_pool2d", (9, 9, 16)], + ] + + for typef, input_shape in trials: + shape = (1, *input_shape) + inputs = {"a"} + args = (shape, dtype, typef) + func = 
_get_global_pooling_model(*args, iter(inputs)) + exp_codegen = _get_expected_global_pooling_codegen(*args) + verify_codegen(func, exp_codegen, 1) + + +if __name__ == "__main__": + test_pooling() + test_global_pooling() + test_codegen_pooling() + test_codegen_global_pooling() From 8d08b21ec47900aef6db0c4b3fb2415776aa3e29 Mon Sep 17 00:00:00 2001 From: Tristan Konolige Date: Thu, 11 Mar 2021 18:53:15 -0800 Subject: [PATCH 11/84] [PROFILING] Combine USE_VM_PROFILER and USE_GRAPH_RUNTIME_DEBUG into a single flag USE_PROFILER (#7637) --- CMakeLists.txt | 36 ++++++++++--------- cmake/config.cmake | 7 ++-- tests/scripts/task_config_build_arm.sh | 3 +- tests/scripts/task_config_build_cpu.sh | 3 +- tests/scripts/task_config_build_gpu.sh | 3 +- tests/scripts/task_config_build_gpu_vulkan.sh | 3 +- tests/scripts/task_config_build_i386.sh | 3 +- tests/scripts/task_config_build_wasm.sh | 3 +- 8 files changed, 27 insertions(+), 34 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1d2e2bcb68c4b..451b6a7ee2c28 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -35,7 +35,7 @@ tvm_option(USE_THREADS "Build with thread support" ON) tvm_option(USE_LLVM "Build with LLVM, can be set to specific llvm-config path" OFF) tvm_option(USE_STACKVM_RUNTIME "Include stackvm into the runtime" OFF) tvm_option(USE_GRAPH_RUNTIME "Build with tiny graph runtime" ON) -tvm_option(USE_GRAPH_RUNTIME_DEBUG "Build with tiny graph runtime debug mode" OFF) +tvm_option(USE_PROFILER "Build profiler for the VM and graph runtime" ON) tvm_option(USE_OPENMP "Build with OpenMP thread pool implementation" OFF) tvm_option(USE_RELAY_DEBUG "Building Relay in debug mode..." OFF) tvm_option(USE_RTTI "Build with RTTI" ON) @@ -262,13 +262,6 @@ list(APPEND COMPILER_SRCS ${RELAY_BACKEND_SRCS}) list(APPEND COMPILER_SRCS ${RELAY_IR_SRCS}) list(APPEND COMPILER_SRCS ${RELAY_QNN_SRCS}) - -if(USE_VM_PROFILER) - message(STATUS "Build compiler with Relay VM profiler support...") - file(GLOB BACKEND_VM_PROFILER_SRCS src/relay/backend/vm/profiler/*.cc) - list(APPEND COMPILER_SRCS ${BACKEND_VM_PROFILER_SRCS}) -endif(USE_VM_PROFILER) - file(GLOB DATATYPE_SRCS src/target/datatype/*.cc) list(APPEND COMPILER_SRCS ${DATATYPE_SRCS}) list(APPEND COMPILER_SRCS "src/target/datatype/myfloat/myfloat.cc") @@ -315,20 +308,29 @@ if(USE_GRAPH_RUNTIME) file(GLOB RUNTIME_GRAPH_SRCS src/runtime/graph/*.cc) list(APPEND RUNTIME_SRCS ${RUNTIME_GRAPH_SRCS}) - if(USE_GRAPH_RUNTIME_DEBUG) - message(STATUS "Build with Graph runtime debug support...") - file(GLOB RUNTIME_GRAPH_DEBUG_SRCS src/runtime/graph/debug/*.cc) - list(APPEND RUNTIME_SRCS ${RUNTIME_GRAPH_DEBUG_SRCS}) - set_source_files_properties(${RUNTIME_GRAPH_SRCS} - PROPERTIES COMPILE_DEFINITIONS "TVM_GRAPH_RUNTIME_DEBUG") - endif(USE_GRAPH_RUNTIME_DEBUG) endif(USE_GRAPH_RUNTIME) +# convert old options for profiler +if(USE_GRAPH_RUNTIME_DEBUG) + unset(USE_GRAPH_RUNTIME_DEBUG CACHE) + set(USE_PROFILER ON) +endif() if(USE_VM_PROFILER) - message(STATUS "Build with Relay VM profiler support...") + unset(USE_VM_PROFILER CACHE) + set(USE_PROFILER ON) +endif() + +if(USE_PROFILER) + message(STATUS "Build with profiler...") + + file(GLOB RUNTIME_GRAPH_DEBUG_SRCS src/runtime/graph/debug/*.cc) + list(APPEND RUNTIME_SRCS ${RUNTIME_GRAPH_DEBUG_SRCS}) + set_source_files_properties(${RUNTIME_GRAPH_SRCS} + PROPERTIES COMPILE_DEFINITIONS "TVM_GRAPH_RUNTIME_DEBUG") + file(GLOB RUNTIME_VM_PROFILER_SRCS src/runtime/vm/profiler/*.cc) list(APPEND RUNTIME_SRCS ${RUNTIME_VM_PROFILER_SRCS}) -endif(USE_VM_PROFILER) +endif(USE_PROFILER) # 
Module rules include(cmake/modules/VTA.cmake) diff --git a/cmake/config.cmake b/cmake/config.cmake index 67370c6352099..eee07c356ae6b 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -99,11 +99,8 @@ set(USE_STACKVM_RUNTIME OFF) # Whether enable tiny embedded graph runtime. set(USE_GRAPH_RUNTIME ON) -# Whether enable additional graph debug functions -set(USE_GRAPH_RUNTIME_DEBUG OFF) - -# Whether enable additional vm profiler functions -set(USE_VM_PROFILER OFF) +# Whether to enable the profiler for the graph runtime and vm +set(USE_PROFILER ON) # Whether enable uTVM standalone runtime set(USE_MICRO_STANDALONE_RUNTIME OFF) diff --git a/tests/scripts/task_config_build_arm.sh b/tests/scripts/task_config_build_arm.sh index 80527466c71e9..b3a084aef3713 100755 --- a/tests/scripts/task_config_build_arm.sh +++ b/tests/scripts/task_config_build_arm.sh @@ -25,10 +25,9 @@ cp ../cmake/config.cmake . echo set\(USE_SORT ON\) >> config.cmake echo set\(USE_RPC ON\) >> config.cmake -echo set\(USE_GRAPH_RUNTIME_DEBUG ON\) >> config.cmake echo set\(USE_MICRO ON\) >> config.cmake echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake -echo set\(USE_VM_PROFILER ON\) >> config.cmake +echo set\(USE_PROFILER ON\) >> config.cmake echo set\(USE_LLVM llvm-config-8\) >> config.cmake echo set\(CMAKE_CXX_COMPILER g++\) >> config.cmake echo set\(CMAKE_CXX_FLAGS -Werror\) >> config.cmake diff --git a/tests/scripts/task_config_build_cpu.sh b/tests/scripts/task_config_build_cpu.sh index db636063b9e3c..aa5581b0e11ab 100755 --- a/tests/scripts/task_config_build_cpu.sh +++ b/tests/scripts/task_config_build_cpu.sh @@ -26,8 +26,7 @@ cp ../cmake/config.cmake . echo set\(USE_SORT ON\) >> config.cmake echo set\(USE_MICRO ON\) >> config.cmake echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake -echo set\(USE_GRAPH_RUNTIME_DEBUG ON\) >> config.cmake -echo set\(USE_VM_PROFILER ON\) >> config.cmake +echo set\(USE_PROFILER ON\) >> config.cmake echo set\(USE_DNNL_CODEGEN ON\) >> config.cmake echo set\(USE_ARM_COMPUTE_LIB ON\) >> config.cmake echo set\(USE_LLVM llvm-config-11\) >> config.cmake diff --git a/tests/scripts/task_config_build_gpu.sh b/tests/scripts/task_config_build_gpu.sh index 155bac80533f0..13dfb4136547b 100755 --- a/tests/scripts/task_config_build_gpu.sh +++ b/tests/scripts/task_config_build_gpu.sh @@ -36,8 +36,7 @@ echo set\(USE_RPC ON\) >> config.cmake echo set\(USE_SORT ON\) >> config.cmake echo set\(USE_GRAPH_RUNTIME ON\) >> config.cmake echo set\(USE_STACKVM_RUNTIME ON\) >> config.cmake -echo set\(USE_GRAPH_RUNTIME_DEBUG ON\) >> config.cmake -echo set\(USE_VM_PROFILER ON\) >> config.cmake +echo set\(USE_PROFILER ON\) >> config.cmake echo set\(USE_ANTLR ON\) >> config.cmake echo set\(USE_VTA_TSIM ON\) >> config.cmake echo set\(USE_VTA_FSIM ON\) >> config.cmake diff --git a/tests/scripts/task_config_build_gpu_vulkan.sh b/tests/scripts/task_config_build_gpu_vulkan.sh index 74096b1a97606..5865dc9699584 100755 --- a/tests/scripts/task_config_build_gpu_vulkan.sh +++ b/tests/scripts/task_config_build_gpu_vulkan.sh @@ -27,7 +27,6 @@ echo set\(USE_OPENCL ON\) >> config.cmake echo set\(USE_ROCM ON\) >> config.cmake echo set\(USE_VULKAN ON\) >> config.cmake echo set\(USE_MICRO ON\) >> config.cmake -echo set\(USE_GRAPH_RUNTIME_DEBUG ON\) >> config.cmake -echo set\(USE_VM_PROFILER ON\) >> config.cmake +echo set\(USE_PROFILER ON\) >> config.cmake echo set\(CMAKE_CXX_COMPILER clang-7\) >> config.cmake echo set\(CMAKE_CXX_FLAGS -Werror\) >> config.cmake diff --git a/tests/scripts/task_config_build_i386.sh 
b/tests/scripts/task_config_build_i386.sh index 68e61c6a039c8..05acbb0221246 100755 --- a/tests/scripts/task_config_build_i386.sh +++ b/tests/scripts/task_config_build_i386.sh @@ -25,10 +25,9 @@ cp ../cmake/config.cmake . echo set\(USE_SORT ON\) >> config.cmake echo set\(USE_RPC ON\) >> config.cmake -echo set\(USE_GRAPH_RUNTIME_DEBUG ON\) >> config.cmake echo set\(USE_MICRO ON\) >> config.cmake echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake -echo set\(USE_VM_PROFILER ON\) >> config.cmake +echo set\(USE_PROFILER ON\) >> config.cmake echo set\(USE_LLVM llvm-config-4.0\) >> config.cmake echo set\(CMAKE_CXX_COMPILER g++\) >> config.cmake echo set\(CMAKE_CXX_FLAGS -Werror\) >> config.cmake diff --git a/tests/scripts/task_config_build_wasm.sh b/tests/scripts/task_config_build_wasm.sh index c37a119b05902..78dc7550028bb 100755 --- a/tests/scripts/task_config_build_wasm.sh +++ b/tests/scripts/task_config_build_wasm.sh @@ -26,8 +26,7 @@ cp ../cmake/config.cmake . echo set\(USE_SORT ON\) >> config.cmake echo set\(USE_MICRO ON\) >> config.cmake echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake -echo set\(USE_GRAPH_RUNTIME_DEBUG ON\) >> config.cmake -echo set\(USE_VM_PROFILER ON\) >> config.cmake +echo set\(USE_PROFILER ON\) >> config.cmake echo set\(USE_LLVM llvm-config-11\) >> config.cmake echo set\(USE_ANTLR ON\) >> config.cmake echo set\(CMAKE_CXX_COMPILER g++\) >> config.cmake From 9d72bd051bdc02ab678d223a794ab6cb607866ba Mon Sep 17 00:00:00 2001 From: Tristan Konolige Date: Thu, 11 Mar 2021 20:25:35 -0800 Subject: [PATCH 12/84] [RUNTIME] Switch time evaluator to use device specific timing. (#7631) --- src/runtime/rpc/rpc_module.cc | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc index 4f721e122a4cc..34691415c1a47 100644 --- a/src/runtime/rpc/rpc_module.cc +++ b/src/runtime/rpc/rpc_module.cc @@ -23,6 +23,7 @@ */ #include #include +#include #include #include @@ -364,8 +365,6 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repe if (f_preproc != nullptr) { f_preproc.CallPacked(args, &temp); } - std::chrono::time_point tbegin, - tend; double duration_ms = 0.0; do { @@ -374,20 +373,17 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repe number * 1.618)); // 1.618 is chosen by random } - tbegin = std::chrono::high_resolution_clock::now(); + Timer t = Timer::Start(ctx); // start timing for (int i = 0; i < number; ++i) { pf.CallPacked(args, &temp); } - DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr); - tend = std::chrono::high_resolution_clock::now(); - - duration_ms = - std::chrono::duration_cast>(tend - tbegin).count() * 1000; + t->Stop(); + int64_t t_nanos = t->SyncAndGetElapsedNanos(); + duration_ms = t_nanos / 1e6; } while (duration_ms < min_repeat_ms); - double speed = - std::chrono::duration_cast>(tend - tbegin).count() / number; + double speed = duration_ms / 1e3 / number; os.write(reinterpret_cast(&speed), sizeof(speed)); } From 1a506791ba7b7f041de318b6a83c2a7f080ee1cd Mon Sep 17 00:00:00 2001 From: eleflea Date: Fri, 12 Mar 2021 14:22:04 +0800 Subject: [PATCH 13/84] fix missing qparams in aten::upsample_nearest2d (#7646) --- python/tvm/relay/frontend/qnn_torch.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/tvm/relay/frontend/qnn_torch.py b/python/tvm/relay/frontend/qnn_torch.py index 2b85a1f3a1be2..2dd84b650bd27 100644 --- a/python/tvm/relay/frontend/qnn_torch.py +++ 
b/python/tvm/relay/frontend/qnn_torch.py @@ -353,6 +353,7 @@ def add_input_quant_params_to_op_inputs(graph): "quantized::mul": 2, "aten::dequantize": 1, "aten::mean": 1, + "aten::upsample_nearest2d": 1, "aten::upsample_bilinear2d": 1, "aten::relu_": 1, "aten::relu": 1, From b80e777f1cafe61d8886d7567906e9b018788447 Mon Sep 17 00:00:00 2001 From: Chris Hoge Date: Fri, 12 Mar 2021 09:39:06 -0800 Subject: [PATCH 14/84] [docs] Getting Started with TVM: Auto Scheduler and matmul (#7644) Moves the auto scheduler with matmul example into the tutorial, expands to follow the flow of the larger getting started tutorial. Indended to follow the AutoTVM tutorial on matrix multiplication. --- docs/conf.py | 5 +- .../tune_matmul_x86.py | 136 ++++++++++-------- 2 files changed, 83 insertions(+), 58 deletions(-) rename tutorials/{auto_scheduler => get_started}/tune_matmul_x86.py (55%) diff --git a/docs/conf.py b/docs/conf.py index ad838f767f806..c9c68706998bd 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -210,10 +210,11 @@ # The unlisted files always appear after listed files. within_subsection_order = { "get_started": [ - "relay_quick_start.py", - "tensor_expr_get_started.py", "tvmc_command_line_driver.py", + "tensor_expr_get_started.py", + "autoschedule_matmul.py", "cross_compilation_and_rpc.py", + "relay_quick_start.py", ], "frontend": [ "from_pytorch.py", diff --git a/tutorials/auto_scheduler/tune_matmul_x86.py b/tutorials/get_started/tune_matmul_x86.py similarity index 55% rename from tutorials/auto_scheduler/tune_matmul_x86.py rename to tutorials/get_started/tune_matmul_x86.py index 084f5ae67518d..a51f01115f311 100644 --- a/tutorials/auto_scheduler/tune_matmul_x86.py +++ b/tutorials/get_started/tune_matmul_x86.py @@ -15,24 +15,27 @@ # specific language governing permissions and limitations # under the License. """ -Auto-scheduling Matrix Multiplication for CPU -============================================= +Optimizing Operators with Auto-scheduling +========================================= **Author**: `Lianmin Zheng `_, \ `Chengfan Jia `_ -This is a tutorial on how to use the auto-scheduler for CPUs. +In this tutorial, we will show how TVM's Auto Scheduling feature can find +optimal schedules without the need for writing a custom template. -Different from the template-based :ref:`autotvm ` which relies on -manual templates to define the search space, the auto-scheduler does not require any templates. -Users only need to write the computation declaration without any schedule commands or templates. -The auto-scheduler can automatically generate a large search space and -find a good schedule in the space. +Different from the template-based :ref:`` which relies on +manual templates to define the search space, the auto-scheduler does not +require any templates. Users only need to write the computation declaration +without any schedule commands or templates. The auto-scheduler can +automatically generate a large search space and find a good schedule in the +space. We use matrix multiplication as an example in this tutorial. -Note that this tutorial will not run on Windows or recent versions of macOS. To -get it to run, you will need to wrap the body of this tutorial in a :code:`if -__name__ == "__main__":` block. +.. note:: + Note that this tutorial will not run on Windows or recent versions of macOS. To + get it to run, you will need to wrap the body of this tutorial in a :code:`if + __name__ == "__main__":` block. 
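+
+   A minimal sketch of such a wrapper (hypothetical layout, shown only for
+   illustration)::
+
+       def main():
+           ...  # the body of this tutorial
+
+       if __name__ == "__main__":
+           main()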
""" import os @@ -41,15 +44,18 @@ import tvm from tvm import te, auto_scheduler -###################################################################### -# Define the computation -# ^^^^^^^^^^^^^^^^^^^^^^ -# To begin with, let us define the computation of a matmul with bias add. -# The function should return the list of input/output tensors. -# From these tensors, the auto-scheduler can get the whole computational graph. +################################################################################ +# Defining the Matrix Multiplication +# ---------------------------------- +# To start, we define a matrix multiplication with a bias addition. Note that +# this uses standard operations available in TVMs Tensor Expression language. +# The major difference is the use of the `auto_sceduler` decorator at the top +# of the function definition. The function should return a list of +# input/output tensors. From these tensors, the auto-scheduler can get the +# whole computational graph. -@auto_scheduler.register_workload +@auto_scheduler.register_workload # Note the auto_scheduler decorator def matmul_add(N, L, M, dtype): A = te.placeholder((N, L), name="A", dtype=dtype) B = te.placeholder((L, M), name="B", dtype=dtype) @@ -67,12 +73,17 @@ def matmul_add(N, L, M, dtype): return [A, B, C, out] -###################################################################### +################################################################################ # Create the search task -# ^^^^^^^^^^^^^^^^^^^^^^ -# We then create a search task with N=L=M=1024 and dtype="float32" -# If your machine supports avx instructions, you can +# ---------------------- +# With the function defined, we can now create the task for the auto_scheduler +# to search against. We specify the particular parameters for this matrix +# multiplication, in this case a multiplication of to square matricies of size +# 1024x1024. We then create a search task with N=L=M=1024 and dtype="float32" # +# .. note:: Improve performance with custom targets +# In order for TVM to take full advantage of specific hardware platforms, +# you will want to manuall specify your CPU capabilities. For example: # - replace "llvm" below with "llvm -mcpu=core-avx2" to enable AVX2 # - replace "llvm" below with "llvm -mcpu=skylake-avx512" to enable AVX-512 @@ -84,15 +95,18 @@ def matmul_add(N, L, M, dtype): print("Computational DAG:") print(task.compute_dag) -###################################################################### +################################################################################ +# Set Parameters for Auto-Scheduler +# --------------------------------- # Next, we set parameters for the auto-scheduler. # -# * :code:`num_measure_trials` is the number of measurement trials we can use during the search. -# We only make 10 trials in this tutorial for a fast demonstration. In practice, 1000 is a -# good value for the search to converge. You can do more trials according to your time budget. -# * In addition, we use :code:`RecordToFile` to dump measurement records into a file `matmul.json`. -# The measurement records can be used to query the history best, resume the search, -# and do more analyses later. +# * :code:`num_measure_trials` is the number of measurement trials we can use +# during the search. We only make 10 trials in this tutorial for a fast +# demonstration. In practice, 1000 is a good value for the search to converge. +# You can do more trials according to your time budget. 
+# * In addition, we use :code:`RecordToFile` to log measurement records into a +# file `matmul.json`. The measurement records can be used to query the history +# best, resume the search, and do more analyses later. # * see :any:`auto_scheduler.TuningOptions` for more parameters log_file = "matmul.json" @@ -102,30 +116,32 @@ def matmul_add(N, L, M, dtype): verbose=2, ) -###################################################################### +################################################################################ # Run the search -# ^^^^^^^^^^^^^^ -# Now we get all inputs ready. Pretty simple, isn't it? -# We can kick off the search and let the auto-scheduler do its magic. -# After some measurement trials, we can load the best schedule from the log -# file and apply it. +# -------------- +# Now we get all inputs ready. Pretty simple, isn't it? We can kick off the +# search and let the auto-scheduler do its magic. After some measurement +# trials, we can load the best schedule from the log file and apply it. # Run auto-tuning (search) task.tune(tune_option) # Apply the best schedule sch, args = task.apply_best(log_file) -###################################################################### -# We can lower the schedule to see the IR after auto-scheduling. -# The auto-scheduler correctly performs optimizations including multi-level tiling, -# layout transformation, parallelization, vectorization, unrolling, and operator fusion. +################################################################################ +# Inspecting the Optimized Schedule +# --------------------------------- +# We can lower the schedule to see the IR after auto-scheduling. The +# auto-scheduler correctly performs optimizations including multi-level tiling, +# layout transformation, parallelization, vectorization, unrolling, and +# operator fusion. print("Lowered TIR:") print(tvm.lower(sch, args, simple_mode=True)) -###################################################################### +################################################################################ # Check correctness and evaluate performance -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# ------------------------------------------ # We build the binary and check its correctness and performance. func = tvm.build(sch, args, target) @@ -152,26 +168,25 @@ def matmul_add(N, L, M, dtype): ) -###################################################################### +################################################################################ # Using the record file -# ^^^^^^^^^^^^^^^^^^^^^ -# During the search, all measurement records are dumped into the record -# file "matmul.json". The measurement records can be used to re-apply search results, -# resume the search, and perform other analyses. - -###################################################################### -# Here is an example where we load the best schedule from a file, -# and print the equivalent python schedule API. This can be used for -# debugging and learning the behavior of the auto-scheduler. +# --------------------- +# During the search, all measurement records are logged into the record file +# "matmul.json". The measurement records can be used to re-apply search +# results, resume the search, and perform other analyses. +# +# Here is an example where we load the best schedule from a file, and print the +# equivalent python schedule API. This can be used for debugging and learning +# the behavior of the auto-scheduler. 
print("Equivalent python schedule:") print(task.print_best(log_file)) -###################################################################### -# A more complicated example is to resume the search. -# In this case, we need to create the search policy and cost model by ourselves -# and resume the status of search policy and cost model with the log file. -# In the example below we resume the status and do more 5 trials. +################################################################################ +# A more complicated example is to resume the search. In this case, we need to +# create the search policy and cost model by ourselves and resume the status of +# search policy and cost model with the log file. In the example below we +# resume the status and do more 5 trials. def resume_search(task, log_file): @@ -188,3 +203,12 @@ def resume_search(task, log_file): resume_search(task, log_file) + +################################################################################ +# Final Notes and Summary +# ----------------------- +# In this tutorial, we have shown how to use the TVM Auto-Scheduler to +# automatically optimize a matrix multiplication, without the need to specify a +# search template. It ends a series of examples that starts from the Tensor +# Expression (TE) language that demonstrates how TVM can optimize computational +# operations. From fe25b9e7c5f9c95d211f63ae544a9532eb50b398 Mon Sep 17 00:00:00 2001 From: Leandro Nunes Date: Fri, 12 Mar 2021 18:03:53 +0000 Subject: [PATCH 15/84] [TVMC] Allow options on --target to contain dots. (#7651) * Allow tvmc compile --target options to accept dots * Adds testing for dot separator in quoted and unquoted values * Add an "unquoting" conditional so that quoted and unquoted strings look the same when parsed --- python/tvm/driver/tvmc/common.py | 7 ++++++- tests/python/driver/tvmc/test_tvmc_common.py | 21 ++++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/python/tvm/driver/tvmc/common.py b/python/tvm/driver/tvmc/common.py index 71bf42ae1e5cb..c5cb5f29031f2 100644 --- a/python/tvm/driver/tvmc/common.py +++ b/python/tvm/driver/tvmc/common.py @@ -133,7 +133,7 @@ def tokenize_target(target): target_pattern = ( r"(\-{0,2}[\w\-]+\=?" - r"(?:[\w\+\-]+(?:,[\w\+\-])*|[\'][\w\+\-,\s]+[\']|[\"][\w\+\-,\s]+[\"])*|,)" + r"(?:[\w\+\-\.]+(?:,[\w\+\-\.])*|[\'][\w\+\-,\s\.]+[\']|[\"][\w\+\-,\s\.]+[\"])*|,)" ) return re.findall(target_pattern, target) @@ -223,6 +223,11 @@ def parse_target(target): else: opt = opt[1:] if opt.startswith("-") else opt opt_name, opt_value = opt.split("=", maxsplit=1) + + # remove quotes from the value: quotes are only parsed if they match, + # so it is safe to assume that if the string starts with quote, it ends + # with quote. 
+ opt_value = opt_value[1:-1] if opt_value[0] in ('"', "'") else opt_value except ValueError: raise ValueError(f"Error when parsing '{opt}'") diff --git a/tests/python/driver/tvmc/test_tvmc_common.py b/tests/python/driver/tvmc/test_tvmc_common.py index b272ceccea394..23ea4f46b2ff2 100644 --- a/tests/python/driver/tvmc/test_tvmc_common.py +++ b/tests/python/driver/tvmc/test_tvmc_common.py @@ -273,3 +273,24 @@ def test_parse_multiple_target_with_opts(): assert "myopt" in targets[0]["opts"] assert "value" == targets[0]["opts"]["myopt"] assert "llvm" == targets[1]["name"] + + +def test_parse_multiple_separators_on_target(): + targets = tvmc.common.parse_target("foo -option1=+v1.0x,+value,+bar") + + assert len(targets) == 1 + assert "+v1.0x,+value,+bar" == targets[0]["opts"]["option1"] + + +def test_parse_single_quoted_multiple_separators_on_target(): + targets = tvmc.common.parse_target("foo -option1='+v1.0x,+value'") + + assert len(targets) == 1 + assert "+v1.0x,+value" == targets[0]["opts"]["option1"] + + +def test_parse_double_quoted_multiple_separators_on_target(): + targets = tvmc.common.parse_target('foo -option1="+v1.0x,+value"') + + assert len(targets) == 1 + assert "+v1.0x,+value" == targets[0]["opts"]["option1"] From 692da0a32bc4c6d6bdc5d219182a00d9e7945c68 Mon Sep 17 00:00:00 2001 From: YubinCao <61700216+YubinCao@users.noreply.github.com> Date: Mon, 15 Mar 2021 15:22:35 +0800 Subject: [PATCH 16/84] [docker] fixed ci-gpu docker environment path typo. (#7648) --- docker/Dockerfile.ci_gpu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile.ci_gpu b/docker/Dockerfile.ci_gpu index ac76af6b0a1eb..a44677f5ce568 100644 --- a/docker/Dockerfile.ci_gpu +++ b/docker/Dockerfile.ci_gpu @@ -107,8 +107,8 @@ ENV PATH=/usr/local/nvidia/bin:${PATH} ENV PATH=/usr/local/cuda/bin:${PATH} ENV CPLUS_INCLUDE_PATH=/usr/local/cuda/include:${CPLUS_INCLUDE_PATH} ENV C_INCLUDE_PATH=/usr/local/cuda/include:${C_INCLUDE_PATH} -ENV LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/compact:${LIBRARY_PATH} -ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/compact:${LD_LIBRARY_PATH} +ENV LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/compat:${LIBRARY_PATH} +ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/compat:${LD_LIBRARY_PATH} ENV LD_LIBRARY_PATH=/opt/rocm/lib:${LD_LIBRARY_PATH} ENV PATH=/node_modules/.bin:${PATH} From 3beec22264f56f734de7d14cd6382b96e83e280a Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Mon, 15 Mar 2021 08:32:33 -0700 Subject: [PATCH 17/84] Fix issue when group attribute isnt defined in convtranspose. 
(#7655) --- python/tvm/relay/frontend/onnx.py | 2 +- tests/python/frontend/onnx/test_forward.py | 68 +++++++++------------- 2 files changed, 30 insertions(+), 40 deletions(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index f31b8c927f8f1..4a0a1ed09b9fa 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -446,7 +446,7 @@ def _impl_v1(cls, inputs, attr, params): # get number of channels channels = infer_channels(inputs[1], True) attr["channels"] = channels - groups = attr.pop("group") + groups = attr.get("group", 1) attr["groups"] = groups # infer pads for auto_pad data = inputs[0] diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index 1e1341640ea04..177bed66f466b 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -2489,42 +2489,27 @@ def verify_convtranspose_with_padding( dilations, auto_pad="NOTSET", unset_pad=False, + group=1, ): - if unset_pad: - node = helper.make_node( - "ConvTranspose", - inputs=["x", "W"], - outputs=["y"], - kernel_shape=kernel_shape, - # Default values for other attributes: - strides=strides, - dilations=dilations, - group=1, - ) - elif padding is None: - node = helper.make_node( - "ConvTranspose", - inputs=["x", "W"], - outputs=["y"], - kernel_shape=kernel_shape, - # Default values for other attributes: - strides=strides, - dilations=dilations, - group=1, - auto_pad=auto_pad, - ) - else: - node = helper.make_node( - "ConvTranspose", - inputs=["x", "W"], - outputs=["y"], - kernel_shape=kernel_shape, - # Default values for other attributes: - strides=strides, - dilations=dilations, - group=1, - pads=padding, - ) + node = helper.make_node( + "ConvTranspose", + inputs=["x", "W"], + outputs=["y"], + kernel_shape=kernel_shape, + # Default values for other attributes: + strides=strides, + dilations=dilations, + ) + if not unset_pad: + if padding is None: + pad_attr = helper.make_attribute("auto_pad", auto_pad) + else: + pad_attr = helper.make_attribute("pads", padding) + node.attribute.append(pad_attr) + + if group is not None: + group_attr = helper.make_attribute("group", group) + node.attribute.append(group_attr) graph = helper.make_graph( [node], @@ -2536,22 +2521,25 @@ def verify_convtranspose_with_padding( outputs=[helper.make_tensor_value_info("y", TensorProto.FLOAT, list(y_shape))], ) - model = helper.make_model(graph, producer_name="conv_test") + model = helper.make_model(graph, producer_name="convtranspose_pad_test") verify_with_ort(model, [x_shape, w_shape], [y_shape], use_vm=True, convert_to_static=True) -def verify_convtranspose(x_shape, w_shape, y_shape, p): +def verify_convtranspose(x_shape, w_shape, y_shape, p, group=1): node = onnx.helper.make_node( "ConvTranspose", inputs=["x", "W"], outputs=["y"], strides=[3, 2], - group=1, kernel_shape=[3, 3], pads=p, ) + if group is not None: + group_attr = helper.make_attribute("group", group) + node.attribute.append(group_attr) + graph = helper.make_graph( [node], "verify_convtranspose_test", @@ -2562,7 +2550,7 @@ def verify_convtranspose(x_shape, w_shape, y_shape, p): outputs=[helper.make_tensor_value_info("y", TensorProto.FLOAT, list(y_shape))], ) - model = helper.make_model(graph, producer_name="convtranspose_trest") + model = helper.make_model(graph, producer_name="convtranspose_test") verify_with_ort(model, [x_shape, w_shape], y_shape) @@ -2574,6 +2562,8 @@ def test_convtranspose(): # (1, 2, 7, 3) output tensor # [1, 2, 1, 2] 
list for pads verify_convtranspose((1, 1, 3, 3), (1, 2, 3, 3), (1, 2, 7, 3), [1, 2, 1, 2]) + # Test undefined groups. + verify_convtranspose((1, 1, 3, 3), (1, 2, 3, 3), (1, 2, 7, 3), [1, 2, 1, 2], group=None) def repeat(N, D): return tuple([N for _ in range(D)]) From 1878889d92bfe84c5ed9e1ca87f258fdc9826c4a Mon Sep 17 00:00:00 2001 From: Lei Wang <34334180+LeiWang1999@users.noreply.github.com> Date: Mon, 15 Mar 2021 23:38:21 +0800 Subject: [PATCH 18/84] revert SET_LLVM flag (#7657) Co-authored-by: Lei Wang <34334180+NjtechPrinceling@users.noreply.github.com> --- cmake/config.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/config.cmake b/cmake/config.cmake index eee07c356ae6b..65859566a6640 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -113,7 +113,7 @@ set(USE_MICRO_STANDALONE_RUNTIME OFF) # - OFF: disable llvm, note this will disable CPU codegen # which is needed for most cases # - /path/to/llvm-config: enable specific LLVM when multiple llvm-dev is available. -set(USE_LLVM ON) +set(USE_LLVM OFF) #--------------------------------------------- # Contrib libraries From d373d25aa29038e6d7535fb620d5303a117010cb Mon Sep 17 00:00:00 2001 From: eric Date: Tue, 16 Mar 2021 00:39:33 +0900 Subject: [PATCH 19/84] fix build break for android_rpc (#7664) --- apps/android_rpc/app/src/main/jni/tvm_runtime.h | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/android_rpc/app/src/main/jni/tvm_runtime.h b/apps/android_rpc/app/src/main/jni/tvm_runtime.h index 2005568c608c4..fb5993066448f 100644 --- a/apps/android_rpc/app/src/main/jni/tvm_runtime.h +++ b/apps/android_rpc/app/src/main/jni/tvm_runtime.h @@ -47,6 +47,7 @@ #include "../src/runtime/module.cc" #include "../src/runtime/ndarray.cc" #include "../src/runtime/object.cc" +#include "../src/runtime/profiling.cc" #include "../src/runtime/registry.cc" #include "../src/runtime/rpc/rpc_channel.cc" #include "../src/runtime/rpc/rpc_endpoint.cc" From 10f5d17a668a1bb8fbd021cadbe6d052a180f706 Mon Sep 17 00:00:00 2001 From: Leandro Nunes Date: Mon, 15 Mar 2021 15:41:31 +0000 Subject: [PATCH 20/84] [TVMC] Refactoring to document the --target regex and simplify test cases (#7654) * Adds comments to document the regex being used to parse the --target=value string * Concatenate test cases without reducing the number of asserts or number of actual tests --- python/tvm/driver/tvmc/common.py | 12 ++++++++- tests/python/driver/tvmc/test_tvmc_common.py | 26 ++++++++------------ 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/python/tvm/driver/tvmc/common.py b/python/tvm/driver/tvmc/common.py index c5cb5f29031f2..fbd7bc8976833 100644 --- a/python/tvm/driver/tvmc/common.py +++ b/python/tvm/driver/tvmc/common.py @@ -131,9 +131,19 @@ def tokenize_target(target): a list of parsed tokens extracted from the target string """ + # Regex to tokenize the "--target" value. It is split into five parts + # to match with: + # 1. target and option names e.g. llvm, -mattr=, -mcpu= + # 2. option values, all together, without quotes e.g. -mattr=+foo,+opt + # 3. option values, when single quotes are used e.g. -mattr='+foo, +opt' + # 4. option values, when double quotes are used e.g. -mattr="+foo ,+opt" + # 5. commas that separate different targets e.g. "my-target, llvm" target_pattern = ( r"(\-{0,2}[\w\-]+\=?" 
- r"(?:[\w\+\-\.]+(?:,[\w\+\-\.])*|[\'][\w\+\-,\s\.]+[\']|[\"][\w\+\-,\s\.]+[\"])*|,)" + r"(?:[\w\+\-\.]+(?:,[\w\+\-\.])*" + r"|[\'][\w\+\-,\s\.]+[\']" + r"|[\"][\w\+\-,\s\.]+[\"])*" + r"|,)" ) return re.findall(target_pattern, target) diff --git a/tests/python/driver/tvmc/test_tvmc_common.py b/tests/python/driver/tvmc/test_tvmc_common.py index 23ea4f46b2ff2..474649d8b1b3a 100644 --- a/tests/python/driver/tvmc/test_tvmc_common.py +++ b/tests/python/driver/tvmc/test_tvmc_common.py @@ -275,22 +275,16 @@ def test_parse_multiple_target_with_opts(): assert "llvm" == targets[1]["name"] -def test_parse_multiple_separators_on_target(): - targets = tvmc.common.parse_target("foo -option1=+v1.0x,+value,+bar") - - assert len(targets) == 1 - assert "+v1.0x,+value,+bar" == targets[0]["opts"]["option1"] +def test_parse_quotes_and_separators_on_options(): + targets_no_quote = tvmc.common.parse_target("foo -option1=+v1.0x,+value,+bar") + targets_single_quote = tvmc.common.parse_target("foo -option1='+v1.0x,+value'") + targets_double_quote = tvmc.common.parse_target('foo -option1="+v1.0x,+value"') + assert len(targets_no_quote) == 1 + assert "+v1.0x,+value,+bar" == targets_no_quote[0]["opts"]["option1"] -def test_parse_single_quoted_multiple_separators_on_target(): - targets = tvmc.common.parse_target("foo -option1='+v1.0x,+value'") - - assert len(targets) == 1 - assert "+v1.0x,+value" == targets[0]["opts"]["option1"] + assert len(targets_single_quote) == 1 + assert "+v1.0x,+value" == targets_single_quote[0]["opts"]["option1"] - -def test_parse_double_quoted_multiple_separators_on_target(): - targets = tvmc.common.parse_target('foo -option1="+v1.0x,+value"') - - assert len(targets) == 1 - assert "+v1.0x,+value" == targets[0]["opts"]["option1"] + assert len(targets_double_quote) == 1 + assert "+v1.0x,+value" == targets_double_quote[0]["opts"]["option1"] From 5bd78b398f3658bc66bec66aea78e6aa5faf872f Mon Sep 17 00:00:00 2001 From: Leandro Nunes Date: Mon, 15 Mar 2021 17:09:05 +0000 Subject: [PATCH 21/84] [TVMC] Fix to check whether a path passed to --target is strictly a file (#7663) * When we use file with --target, the validation in place was only checking whether it was a valid path. For the case in which the path is a directory, it causes a crash when tvmc then tries to open the path. 
* This fix moved the check to be strictly for files, not only a valid path --- python/tvm/driver/tvmc/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/driver/tvmc/common.py b/python/tvm/driver/tvmc/common.py index fbd7bc8976833..864c3a9bddb41 100644 --- a/python/tvm/driver/tvmc/common.py +++ b/python/tvm/driver/tvmc/common.py @@ -280,7 +280,7 @@ def target_from_cli(target): """ extra_targets = [] - if os.path.exists(target): + if os.path.isfile(target): with open(target) as target_file: logger.debug("target input is a path: %s", target) target = "".join(target_file.readlines()) From 67f83973bb8e586046df745c2b70fc0a85917504 Mon Sep 17 00:00:00 2001 From: Akira Maruoka Date: Tue, 16 Mar 2021 02:33:15 +0900 Subject: [PATCH 22/84] Fixed strided_slice size (#7659) Co-authored-by: Akira Maruoka --- python/tvm/relay/frontend/tflite.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py index 1b593ad8dea32..d6f704703cae8 100644 --- a/python/tvm/relay/frontend/tflite.py +++ b/python/tvm/relay/frontend/tflite.py @@ -3093,7 +3093,7 @@ def convert_detection_postprocess(self, op): valid_count = ret[0] # keep only the top 'max_detections' rows ret = _op.strided_slice( - ret[1], [0, 0, 0], [batch_size, custom_options["max_detections"], anchor_boxes] + ret[1], [0, 0, 0], [batch_size, custom_options["max_detections"], 6] ) # the output needs some reshaping to match tflite ret = _op.split(ret, 6, axis=2) From 068fed94cf3468e3df510ac8a9aed635ed746804 Mon Sep 17 00:00:00 2001 From: Nicola Lancellotti Date: Mon, 15 Mar 2021 19:51:26 +0000 Subject: [PATCH 23/84] Remove pytest dependency in arm_compute_lib.py (#7556) * Add OpAttrContext class which allows to temporarily change an attribute of an operator Change-Id: I19b809a105ea8769e56bd89e028e090959a08728 * Replace TempOpAttr with OpAttrContext in arm_compute_lib.py Change-Id: I1c42dd6a29e765b06ce28192397016efeea2e82a --- .../tvm/relay/op/contrib/arm_compute_lib.py | 39 +++++++++++++++++-- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/python/tvm/relay/op/contrib/arm_compute_lib.py b/python/tvm/relay/op/contrib/arm_compute_lib.py index 139f25fef4fd8..fabb639845b6b 100644 --- a/python/tvm/relay/op/contrib/arm_compute_lib.py +++ b/python/tvm/relay/op/contrib/arm_compute_lib.py @@ -18,11 +18,11 @@ """Arm Compute Library supported operators.""" import tvm +from tvm import relay from tvm._ffi import register_func from tvm.relay.expr import const from tvm.relay import transform from tvm.relay.build_module import bind_params_by_name -from tvm.relay.testing.temp_op_attr import TempOpAttr from ...dataflow_pattern import wildcard, is_op, is_constant, is_expr from .register import register_pattern_table @@ -111,9 +111,9 @@ def convert_conv(attrs, inputs, tinfos, desired_layouts): return convert_conv - with TempOpAttr( + with OpAttrContext( "nn.conv2d", "FTVMConvertOpLayout", convert_layout_conv2d(tvm.relay.nn.conv2d) - ), TempOpAttr( + ), OpAttrContext( "qnn.conv2d", "FTVMConvertOpLayout", convert_layout_conv2d(tvm.relay.qnn.op.conv2d) ): seq = tvm.transform.Sequential( @@ -481,3 +481,36 @@ def qnn_add(expr): return False return True + + +class OpAttrContext(object): + """ Temporarily changes the attr of an op. """ + + def __init__(self, op_name, attr_key, attr_value): + """Saves the required info for RAII pattern usage. + + Parameters + ---------- + op_name : str + The op name. + + attr_key : str + The attribute name. 
+ + attr_value : object + The attribute value. + """ + self.op = relay.op.get(op_name) + self.attr_key = attr_key + self.attr_value = attr_value + + def __enter__(self): + self.older_attr = self.op.get_attr(self.attr_key) + self.op.reset_attr(self.attr_key) + self.op.set_attr(self.attr_key, self.attr_value) + return self + + def __exit__(self, ptype, value, trace): + self.op.reset_attr(self.attr_key) + if self.older_attr: + self.op.set_attr(self.attr_key, self.older_attr) From 7f969864d90ae3f57a9bad4ccf3eacd3c49e44d9 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Mon, 15 Mar 2021 18:16:42 -0700 Subject: [PATCH 24/84] [Relay][Pass] Simplify consecutive transpose/layout_transform (#7656) * [Relay][Pass] Simplify consecutive transpose/layout_transform * lint * fix * support negative * comment --- src/relay/op/make_op.h | 2 + src/relay/transforms/simplify_expr.cc | 94 +++++++++++++++++++ tests/python/relay/test_pass_simplify_expr.py | 58 ++++++++++++ 3 files changed, 154 insertions(+) diff --git a/src/relay/op/make_op.h b/src/relay/op/make_op.h index 79f7e135e29d4..36a5ec1c0e72c 100644 --- a/src/relay/op/make_op.h +++ b/src/relay/op/make_op.h @@ -75,6 +75,8 @@ Expr MakeSqueeze(Expr data, Array axis); Expr MakeStack(Expr data, int axis); +Expr MakeTranspose(Expr data, Array axes); + Expr MakeStridedSlice(Expr data, Array begin, Array end, Array strides, String slice_mode); diff --git a/src/relay/transforms/simplify_expr.cc b/src/relay/transforms/simplify_expr.cc index 74e48dc4bc547..3c8876ceccb5f 100644 --- a/src/relay/transforms/simplify_expr.cc +++ b/src/relay/transforms/simplify_expr.cc @@ -82,6 +82,99 @@ class SimplifyReshape : public SimplifyPattern { DFPattern x_; }; +/*! + * \brief SimplifyTranspose matches the pattern of consecutive transpose op, + * and merges or cancels them. + */ +class SimplifyTranspose : public SimplifyPattern { + public: + SimplifyTranspose() { + x_ = IsWildcard(); + auto trans1 = IsOp("transpose") || IsOp("layout_transform"); + auto trans2 = IsOp("transpose") || IsOp("layout_transform"); + pattern_ = trans1({trans2({x_})}); + } + + Expr callback(const Expr& pre, const Expr& post, + const Map>& node_map) const override { + // Helper function to get the axes from call node attribute + auto get_axes_from_call = [](const Call trans_call, int ndim) { + std::vector attr_axes; + if (auto attr = trans_call->attrs.as()) { + if (attr->axes.defined()) { + for (int i = 0; i < ndim; ++i) { + int64_t axis = attr->axes[i]; + axis += (axis < 0) ? ndim : 0; + attr_axes.push_back(axis); + } + } else { + // Empty axes means reverse + for (int i = ndim - 1; i >= 0; --i) { + attr_axes.push_back(i); + } + } + } else if (auto attr = trans_call->attrs.as()) { + Layout src_layout(attr->src_layout); + Layout dst_layout(attr->dst_layout); + for (int i = 0; i < ndim; ++i) { + attr_axes.push_back(src_layout.IndexOf(dst_layout[i])); + } + } else { + CHECK(false) << "Expected transpose or layout_transform, but got " + << Downcast(trans_call->op)->name; + } + return std::move(attr_axes); + }; + + auto x = node_map[x_][0]; + + // Initialize axes + int ndim = Downcast(pre->checked_type())->shape.size(); + Array axes; + for (int i = 0; i < ndim; ++i) { + axes.push_back(i); + } + + // Collect axes changes from the matched pattern, including two consecutive transposes. 
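+    // Illustrative note (not in the original patch): composing
+    // transpose(axes=[0, 2, 3, 1]) with a following transpose(axes=[1, 2, 3, 0])
+    // yields axes=[2, 3, 1, 0], because output axis i of the pair reads input
+    // axis first_axes[second_axes[i]].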
+ std::vector> interm_axes; + Call trans_call = Downcast(post); + interm_axes.push_back(get_axes_from_call(trans_call, ndim)); + trans_call = Downcast(trans_call->args[0]); + interm_axes.push_back(get_axes_from_call(trans_call, ndim)); + + // Calculate the final axes in reverse order (from root to output) + auto it = interm_axes.rbegin(); + while (it != interm_axes.rend()) { + auto interm = *it; + + Array new_axes; + for (int i = 0; i < ndim; ++i) { + new_axes.push_back(axes[interm[i]]); + } + axes = new_axes; + it++; + } + + // Check if the transpose is still required + bool need_transpose = false; + for (int i = 0; i < ndim; ++i) { + if (axes[i] != i) { + need_transpose = true; + break; + } + } + + if (need_transpose) { + return MakeTranspose(x, axes); + } + return x; + } + + private: + /*! \brief Pattern input */ + DFPattern x_; +}; + /*! * \brief FullArgwhere finds full followed by argwhere and turns it into an Arange op */ @@ -162,6 +255,7 @@ class ExprSimplifier { public: explicit ExprSimplifier(IRModule mod) : mod_(mod) { CreateCallback(SimplifyReshape()); + CreateCallback(SimplifyTranspose()); CreateCallback(FullElementwise()); } template diff --git a/tests/python/relay/test_pass_simplify_expr.py b/tests/python/relay/test_pass_simplify_expr.py index 9531d896b2ed5..897f90b9ee2a2 100644 --- a/tests/python/relay/test_pass_simplify_expr.py +++ b/tests/python/relay/test_pass_simplify_expr.py @@ -60,6 +60,63 @@ def symbolic(): assert tvm.ir.structural_equal(zz, after) +def test_simplify_transpose(): + # Test a series of transpose and layout_transform ops + def before1(): + x = relay.var("x", shape=(1, 3, 224, 224), dtype="float32") # NCHW + y = relay.transpose(x, axes=[0, 2, 3, 1]) # To NHWC + y = relay.layout_transform(y, "NHWC", "HWCN") # To HWCN + y = relay.transpose(y, axes=[3, 0, 1, 2]) # To NHWC + return relay.Function([x], y) + + def expected1(): + x = relay.var("x", shape=(1, 3, 224, 224), dtype="float32") # NCHW + y = relay.transpose(x, axes=[0, 2, 3, 1]) # To NHWC + return relay.Function([x], y) + + # Test that all transpose ops can be cancelled + def before2(): + x = relay.var("x", shape=(1, 3, 224, 224), dtype="float32") # NCHW + y = relay.nn.relu(x) + y = relay.transpose(y, axes=[0, 2, 3, 1]) # To NHWC + y = relay.transpose(y, axes=[1, 2, 3, 0]) # To HWCN + y = relay.transpose(y, axes=[3, 2, 0, 1]) # To NCHW + return relay.Function([x], y) + + def expected2(): + x = relay.var("x", shape=(1, 3, 224, 224), dtype="float32") # NCHW + y = relay.nn.relu(x) + return relay.Function([x], y) + + # Test default axis (reverse) and negative axis + def before3(): + x = relay.var("x", shape=(1, 3, 224, 224), dtype="float32") # NCHW + y = relay.nn.relu(x) + y = relay.transpose(y) # Reverse + y = relay.transpose(y) # Reverse + y = relay.transpose(y, axes=[0, 2, -1, 1]) + y = relay.transpose(y) # Reverse + y = relay.transpose(y) # Reverse + return relay.Function([x], y) + + def expected3(): + x = relay.var("x", shape=(1, 3, 224, 224), dtype="float32") # NCHW + y = relay.nn.relu(x) + y = relay.transpose(y, axes=[0, 2, 3, 1]) + return relay.Function([x], y) + + for before, expected in [ + [before1(), expected1()], + [before2(), expected2()], + [before3(), expected3()], + ]: + after = run_opt_pass(before, transform.SimplifyExpr()) + expected = run_opt_pass(expected, transform.InferType()) + assert tvm.ir.structural_equal(after, expected), "\nafter: {} \nexpected: {}".format( + after, expected + ) + + def test_simplify_full_elementwise(): def validate(shape, value, dtype): def before_left(x, 
elem_op, full): @@ -126,4 +183,5 @@ def after_right(x, elem_op, value): if __name__ == "__main__": test_simplify_reshape() + test_simplify_transpose() test_simplify_full_elementwise() From 348d4e7532f7d5ce68d76e7634a5f11b7dc60ab8 Mon Sep 17 00:00:00 2001 From: Matthew Brookhart Date: Mon, 15 Mar 2021 23:10:44 -0600 Subject: [PATCH 25/84] init the concat tensor with 1s and then slice them away (#7666) --- python/tvm/relay/frontend/onnx.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index 4a0a1ed09b9fa..391eaaab5f64b 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -2629,10 +2629,10 @@ def _outer_body(i, B, C, onnx_out, nms_size_out, out): # Call the second loop, rework outputs into correct form init_count = _op.const(np.array([0]).astype("int64"), dtype="int64") - init_out = _op.const(np.array([]).reshape([0, 3]).astype("int64"), dtype="int64") + init_out = _op.const(np.array([1, 1, 1]).reshape([1, 3]).astype("int64"), dtype="int64") loop_vals = outer_loop(init_count, B, C, onnx_output, nms_size_output, init_out) - - return _expr.TupleGetItem(loop_vals, 5) + loop_out = _expr.TupleGetItem(loop_vals, 5) + return _op.strided_slice(loop_out, [1, 0], shape_of(loop_out), [1, 1]) # compatible operators that do NOT require any conversion. From d288bbc5df3660355adbf97f2f84ecd232e269ff Mon Sep 17 00:00:00 2001 From: Matthew Brookhart Date: Mon, 15 Mar 2021 23:12:16 -0600 Subject: [PATCH 26/84] [TOPI][GPU] Mergepath sort with odd-even block sort (#7611) * Mergepath sort with odd-even block sort * fix lint, add test * respond to review comments * speed up tests by reducing dtype skews * fix bad rebase * change threading to support vulkan * fix lint * only sort if the data is non-empty * fix lint again * fix for vk * move if to higher scope * fix typo Co-authored-by: Masahiro Masuda --- include/tvm/tir/stmt.h | 4 + python/tvm/topi/cuda/sort.py | 604 ++++++++++++++++++++------- src/tir/transforms/storage_access.cc | 4 + tests/python/relay/test_op_level6.py | 9 +- 4 files changed, 457 insertions(+), 164 deletions(-) diff --git a/include/tvm/tir/stmt.h b/include/tvm/tir/stmt.h index ac660bfb7461a..6445bb1fe73f7 100644 --- a/include/tvm/tir/stmt.h +++ b/include/tvm/tir/stmt.h @@ -1312,6 +1312,10 @@ constexpr const char* fragment_shape = "fragment_shape"; */ constexpr const char* fragment_layout = "fragment_layout"; +/*! + * \brief Mark that the kernel is hand threaded and doesn't need syncs inserted + */ +constexpr const char* hand_threaded = "hand_threaded"; /*! 
* \brief Check if attr_key is a pragma key extension
 * \param attr_key The attr key to be compared
diff --git a/python/tvm/topi/cuda/sort.py b/python/tvm/topi/cuda/sort.py
index ca832ef0ef367..5ebd3060a6bbf 100644
--- a/python/tvm/topi/cuda/sort.py
+++ b/python/tvm/topi/cuda/sort.py
@@ -57,6 +57,20 @@ def traverse(op):
    return s


+def _get_threads(ib, nthread_tx, nthread_bx, nthread_by, nthread_bz):
+    tx = te.thread_axis("threadIdx.x")
+    bx = te.thread_axis("blockIdx.x")
+    ib.scope_attr(tx, "thread_extent", nthread_tx)
+    ib.scope_attr(bx, "thread_extent", nthread_bx)
+
+    by = te.thread_axis("blockIdx.y")
+    bz = te.thread_axis("blockIdx.z")
+    ib.scope_attr(by, "thread_extent", nthread_by)
+    ib.scope_attr(bz, "thread_extent", nthread_bz)
+
+    return tx, bx, by, bz
+
+
 def _sort_init(ib, shape, axis, keys_in, keys_out, values_out=None, value_init_func=None):
    """Initialize the output buffers by copying from inputs"""
    axis_mul_before = 1
@@ -78,16 +92,8 @@ def _sort_init(ib, shape, axis, keys_in, keys_out, values_out=None, value_init_f

    # Copy the keys_in to initial output
    with ib.new_scope():
-        tx = te.thread_axis("threadIdx.x")
-        bx = te.thread_axis("blockIdx.x")
-        ib.scope_attr(tx, "thread_extent", nthread_tx)
-        ib.scope_attr(bx, "thread_extent", nthread_bx)
+        tx, bx, by, bz = _get_threads(ib, nthread_tx, nthread_bx, nthread_by, nthread_bz)
        tid = bx * nthread_tx + tx
-
-        by = te.thread_axis("blockIdx.y")
-        bz = te.thread_axis("blockIdx.z")
-        ib.scope_attr(by, "thread_extent", nthread_by)
-        ib.scope_attr(bz, "thread_extent", nthread_bz)
        idx = (by * shape[axis] + tid) * axis_mul_after + bz
        with ib.if_scope(tid < shape[axis]):
            keys_out[idx] = keys_in[idx]
@@ -97,6 +103,100 @@ def _sort_init(ib, shape, axis, keys_in, keys_out, values_out=None, value_init_f
    return axis_mul_before, axis_mul_after


+## TODO(mbrookhart): These are effective optimization hyperparameters.
+## Perhaps we can autotune?
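+## (Reading aid, not in the original patch: block_size is the tile that each
+## shared-memory odd-even sort handles, and thread_work is the number of
+## elements each thread merges per step in the mergepath kernels.)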
+block_size = 128 +thread_work = 4 + + +def _odd_even_sort( + ib, + size, + axis_mul_before, + axis_mul_after, + is_ascend, + keys, + keys_swap, + values=None, + values_swap=None, +): + + nthread_tx = block_size // 2 + nthread_bx = ceil_div(size, block_size) + nthread_by = axis_mul_before + nthread_bz = axis_mul_after + with ib.new_scope(): + ib.scope_attr(tvm.tir.const(0), "hand_threaded", 0) + tx, bx, by, bz = _get_threads(ib, nthread_tx, nthread_bx, nthread_by, nthread_bz) + tid = 2 * tx + start = bx * block_size + + ## Create shared memory as syncable thread scratch space + tmp_keys_swap = ib.allocate( + keys_swap.dtype, + (block_size,), + name="temp_keys_swap", + scope="shared", + ) + if values_swap is not None: + tmp_values_swap = ib.allocate( + values_swap.dtype, + (block_size,), + name="temp_values_swap", + scope="shared", + ) + + ## Create thread local data for swapping + temp_keys = ib.allocate(keys_swap.dtype, (1,), name="temp_keys", scope="local") + if values_swap is not None: + temp_values = ib.allocate(values_swap.dtype, (1,), name="temp_values", scope="local") + + temp_cond1 = ib.allocate(keys_swap.dtype, (1,), name="temp_cond1", scope="local") + temp_cond2 = ib.allocate(keys_swap.dtype, (1,), name="temp_cond2", scope="local") + # Copy data to scratch space + base_idx = by * size * axis_mul_after + bz + with ib.for_range(0, 2) as n: + with ib.if_scope((tid + n + start) < size): + tmp_keys_swap[tid + n] = keys[base_idx + (tid + n + start) * axis_mul_after] + if values_swap is not None: + tmp_values_swap[tid + n] = values[base_idx + (tid + n + start) * axis_mul_after] + + ib.emit(tvm.tir.Call(None, "tir.tvm_storage_sync", tvm.runtime.convert(["shared"]))) + + idxm = tvm.tir.indexmod + # OddEvenTransposeSort + current_sort_num = tvm.tir.min(block_size, size - start) + with ib.for_range(0, current_sort_num) as k: + n = idxm(tid + k, 2) + with ib.if_scope(tid + n < current_sort_num - 1): + temp_cond1[0] = tmp_keys_swap[tid + n] + temp_cond2[0] = tmp_keys_swap[tid + n + 1] + if is_ascend: + cond = temp_cond1[0] > temp_cond2[0] + else: + cond = temp_cond1[0] < temp_cond2[0] + with ib.if_scope(cond): + temp_keys[0] = tmp_keys_swap[tid + n] + tmp_keys_swap[tid + n] = tmp_keys_swap[tid + n + 1] + tmp_keys_swap[tid + n + 1] = temp_keys[0] + if values_swap is not None: + temp_values[0] = tmp_values_swap[tid + n] + tmp_values_swap[tid + n] = tmp_values_swap[tid + n + 1] + tmp_values_swap[tid + n + 1] = temp_values[0] + ib.emit(tvm.tir.Call(None, "tir.tvm_storage_sync", tvm.runtime.convert(["shared"]))) + + ## Copy sorted data to output + with ib.for_range(0, 2) as n: + with ib.if_scope(tid + n + start < size): + keys[base_idx + (tid + n + start) * axis_mul_after] = tmp_keys_swap[tid + n] + keys_swap[base_idx + (tid + n + start) * axis_mul_after] = tmp_keys_swap[tid + n] + if values_swap is not None: + values[base_idx + (tid + n + start) * axis_mul_after] = tmp_values_swap[tid + n] + values_swap[base_idx + (tid + n + start) * axis_mul_after] = tmp_values_swap[ + tid + n + ] + + def _sort_common( ib, size, @@ -110,22 +210,22 @@ def _sort_common( ): """Either sort only values or sort values by keys.""" - ## we are looping over the array doing mergesort from the bottom up. - ## The outer loop runs on the host and launches a cuda kernel for each iteration - ## of the algorithm. - ## The basic idea is that at iteration 0, each thread does sort on 2 elements. - ## On iteration 1, each thread merges 2 sorted arrays of 2 elements, - ## to deal with 4 total elements. 
## On iteration 2, each thread merges 2 sorted arrays of 4 elements,
-    ## to deal with 8 total elements. On iteration 3, each thread deals with 16 elements, etc
-    ## On the final iteration of the algorithm, one thread will merge two sorted lists
-    ## to sort the entire array
+    ## This function performs a multi-level mergesort
+    ## For blocks of length <= block_size, it does odd-even transpose sort
+    ## in GPU shared memory
+    ## For intermediate block sizes (>block_size, < max_threads * thread_work)
+    ## it uses the mergepath algorithm https://arxiv.org/abs/1406.2628
+    ## to merge blocks in parallel
+    ## At some point, the size of the blocks to be merged is too big for max_threads
+    ## and we switch to using a dual-level mergepath where the outer mergepath
+    ## finds the start/end locations of the inner mergepath so that we can split
+    ## the merge into more blocks

    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
+    nthread_by = axis_mul_before * axis_mul_after
+    nthread_bz = 1
    nthread_tx = max_threads
-    nthread_bx = ceil_div(size, max_threads)
-    nthread_by = axis_mul_before
-    nthread_bz = axis_mul_after
+    nthread_bx = ceil_div(size, nthread_tx)

    def compare(a, b):
        """
@@ -137,91 +237,234 @@ def compare(a, b):
            out = b <= a
        return out

-    def bottom_up_merge(source, dest, source_idx, dest_idx, start, middle, end, even):
-        """
-        Merge the two sections of the array assigned to this thread
-        """
-        # pylint: disable=arguments-out-of-order
-        # initialize iterators
+    # Sort the lower levels of the merge using odd-even sort; it's fast for small inputs
+    lower_lim = tvm.tir.generic.cast(
+        tvm.tir.ceil(tvm.tir.log2(tvm.tir.generic.cast(block_size, "float64"))), "int64"
+    )
+
+    _odd_even_sort(
+        ib,
+        size,
+        axis_mul_before * axis_mul_after,
+        1,
+        is_ascend,
+        keys,
+        keys_swap,
+        values,
+        values_swap,
+    )
+
+    upper_lim = tvm.tir.generic.cast(
+        tvm.tir.ceil(tvm.tir.log2(tvm.tir.generic.cast(size, "float64"))), "int64"
+    )
+
+    def get_merge_begin(source, base_idx, aCount, bCount, aStart, bStart, diag, step_count):
+        first = ib.allocate("int64", (1,), name="first", scope="local")
+        mid = ib.allocate("int64", (1,), name="mid", scope="local")
+        last = ib.allocate("int64", (1,), name="last", scope="local")
+        first[0] = tvm.te.max(0, diag - bCount)
+        last[0] = tvm.te.min(diag, aCount)
+        with ib.while_loop(first[0] < last[0]):
+            mid = (first[0] + last[0]) >> 1
+            a = source[base_idx + (aStart + mid)]
+            b = source[base_idx + (bStart + diag - 1 - mid)]
+            with ib.if_scope(compare(a, b)):
+                first[0] = mid + 1
+            with ib.else_scope():
+                last[0] = mid
+        return first[0], last[0]
+
+    def serial_merge(
+        source,
+        dest,
+        source_idx,
+        dest_idx,
+        base_idx,
+        aCount,
+        bCount,
+        aStart,
+        bStart,
+        kStart,
+        diag,
+        step_count,
+        first,
+        last,
+    ):
        i = ib.allocate("int64", (1,), name="i", scope="local")
        j = ib.allocate("int64", (1,), name="j", scope="local")
-        i[0] = start
-        j[0] = middle
-        # set up indexes
-        base_idx = by * size * axis_mul_after + bz
-        # iterate over the output loop
-        with ib.for_range(0, end - start) as k:
-            i_idx = base_idx + i[0] * axis_mul_after
-            j_idx = base_idx + j[0] * axis_mul_after
-            k_idx = base_idx + (k + start) * axis_mul_after
-
-            def swap_values(source, dest, source_idx, dest_idx):
-                def assign_i():
-                    """assign i value to current output"""
-                    dest[k_idx] = source[i_idx]
-                    if values is not None:
-                        dest_idx[k_idx] = source_idx[i_idx]
-                    i[0] += 1
-
-                def assign_j():
-                    """assign j value to current output"""
-                    dest[k_idx] = source[j_idx]
-                    if values is not
None: - dest_idx[k_idx] = source_idx[j_idx] - j[0] += 1 - - ## if both of the iterators are in range - with ib.if_scope(tvm.tir.all(i[0] < middle, j[0] < end)): - # compare them and insert whichever is next into the output - with ib.if_scope(compare(source[i_idx], source[j_idx])): - assign_i() - with ib.else_scope(): - assign_j() - # otherwise, simply copy the remainder of the valid iterator to the output - with ib.else_scope(): - with ib.if_scope(i[0] < middle): - assign_i() - with ib.else_scope(): - assign_j() + i[0] = aStart + first + j[0] = bStart + diag - last + with ib.for_range(0, tvm.te.min(aCount + bCount - diag, step_count)) as count: + i_idx = base_idx + i[0] + j_idx = base_idx + j[0] + k_idx = base_idx + (kStart + diag + count) + + def assign_i(): + """assign i value to current output""" + dest[k_idx] = source[i_idx] + if values is not None: + dest_idx[k_idx] = source_idx[i_idx] + i[0] += 1 - # Switch which input is the source and which is the destination each iteration - with ib.if_scope(even): - swap_values(source, dest, source_idx, dest_idx) + def assign_j(): + """assign j value to current output""" + dest[k_idx] = source[j_idx] + if values is not None: + dest_idx[k_idx] = source_idx[j_idx] + j[0] += 1 + + ## if both of the iterators are in range + with ib.if_scope(tvm.tir.all(i[0] < aStart + aCount, j[0] < bStart + bCount)): + # compare them and insert whichever is next into the output + with ib.if_scope(compare(source[i_idx], source[j_idx])): + assign_i() + with ib.else_scope(): + assign_j() + # otherwise, simply copy the remainder of the valid iterator to the output with ib.else_scope(): - swap_values(dest, source, dest_idx, source_idx) - - def mergesort(source, dest, source_idx, dest_idx, size, width, even): - # calculate the start, mid, and end points of this section - start = width * tid - - with ib.if_scope(start < size): - middle = cast(tvm.te.min(start + tvm.tir.indexdiv(width, 2), size), "int64") - end = cast(tvm.te.min(start + width, size), "int64") - # merge the start->middle and middle->end arrays - bottom_up_merge(source, dest, source_idx, dest_idx, start, middle, end, even) + with ib.if_scope(i[0] < aStart + aCount): + assign_i() + with ib.else_scope(): + assign_j() - lim = tvm.tir.generic.cast( - tvm.tir.ceil(tvm.tir.log2(tvm.tir.generic.cast(size, "float64"))), "int64" - ) - with ib.for_range(0, lim, dtype="int64") as l2_width: - width = 2 << l2_width + with ib.for_range(0, upper_lim - lower_lim, dtype="int64") as l2_width: + width = 2 << (l2_width + lower_lim) # Define and launch the cuda kernel with ib.new_scope(): - tx = te.thread_axis("threadIdx.x") - bx = te.thread_axis("blockIdx.x") - ib.scope_attr(tx, "thread_extent", nthread_tx) - # Reduce the number of blocks as the work per thread grows - ib.scope_attr( - bx, - "thread_extent", - tvm.tir.generic.cast(ceil_div(size, width * max_threads), "int32"), - ) - tid = bx * nthread_tx + tx - - by = te.thread_axis("blockIdx.y") - bz = te.thread_axis("blockIdx.z") - ib.scope_attr(by, "thread_extent", nthread_by) - ib.scope_attr(bz, "thread_extent", nthread_bz) + target = tvm.target.Target.current() + if "vulkan" in str(target): + # Vulkan can't handle dynamic nthread, so we thread slightly differently + # for vulkan. 
We don't do this generally because it causes a 15% perf + # regression on other platforms + ntx = max_threads + nbx = tvm.tir.generic.cast(ceil_div(width, max_threads * thread_work), "int32") + nbz = tvm.tir.generic.cast(ceil_div(size, width), "int32") + tx, bx, by, bz = _get_threads(ib, ntx, nbx, nthread_by, nbz) + else: + ntx = tvm.tir.generic.cast(tvm.te.min(max_threads, width), "int32") + nbx = tvm.tir.generic.cast(ceil_div(width, max_threads * thread_work), "int32") + nbz = tvm.tir.generic.cast(ceil_div(size, width), "int32") + tx, bx, by, bz = _get_threads(ib, ntx, nbx, nthread_by, nbz) + + def mergepath( + source, + dest, + source_idx, + dest_idx, + aCount, + bCount, + aStart, + bStart, + kStart, + step_count, + even, + ): + # pylint: disable=arguments-out-of-order + def merge(source, dest, source_idx, dest_idx): + diag = tx * step_count + first, last = get_merge_begin( + source, + by * size, + aCount, + bCount, + aStart, + bStart, + diag, + step_count, + ) + # iterate over the output loop + serial_merge( + source, + dest, + source_idx, + dest_idx, + by * size, + aCount, + bCount, + aStart, + bStart, + kStart, + diag, + step_count, + first, + last, + ) + + with ib.if_scope(even): + merge(source, dest, source_idx, dest_idx) + with ib.else_scope(): + merge(dest, source, dest_idx, source_idx) + + def mergesort(source, dest, source_idx, dest_idx, size, width, even): + # calculate the start, mid, and end points of this section + start = width * bz + middle = cast(tvm.te.min(start + tvm.tir.indexdiv(width, 2), size), "int64") + end = cast(tvm.te.min(start + width, size), "int64") + with ib.if_scope(start < size): + with ib.if_scope(nbx == 1): + ## merge the start->middle and middle->end arrays + aCount = middle - start + bCount = end - middle + mergepath( + source, + dest, + source_idx, + dest_idx, + aCount, + bCount, + start, + middle, + start, + ceil_div(width, ntx), + even, + ) + with ib.else_scope(): + step_count = max_threads * thread_work + diag = bx * step_count + + def do_merge(first, last): + aStart = start + first + bStart = middle + diag - last + aCount = tvm.te.min(middle - aStart, step_count) + bCount = tvm.te.min(end - bStart, step_count) + mergepath( + source, + dest, + source_idx, + dest_idx, + aCount, + bCount, + aStart, + bStart, + start + diag, + thread_work, + even, + ) + + with ib.if_scope(even): + first, last = get_merge_begin( + source, + by * size, + middle - start, + end - middle, + start, + middle, + diag, + step_count, + ) + do_merge(first, last) + with ib.else_scope(): + first, last = get_merge_begin( + dest, + by * size, + middle - start, + end - middle, + start, + middle, + diag, + step_count, + ) + do_merge(first, last) # Call the kernel mergesort( @@ -233,29 +476,23 @@ def mergesort(source, dest, source_idx, dest_idx, size, width, even): width, tvm.tir.indexmod(l2_width, 2) == 0, ) - + nthread_by = axis_mul_before + nthread_bz = axis_mul_after + nthread_tx = max_threads + nthread_bx = ceil_div(size, nthread_tx) ## if the final sorted data ended up in the swap, copy it to the real output - with ib.if_scope(tvm.tir.indexmod(lim, 2) == 1): + with ib.if_scope( + tvm.tir.all(upper_lim > lower_lim, tvm.tir.indexmod(upper_lim - lower_lim, 2) == 1) + ): with ib.new_scope(): - tx = te.thread_axis("threadIdx.x") - bx = te.thread_axis("blockIdx.x") - ib.scope_attr(tx, "thread_extent", nthread_tx) - ib.scope_attr(bx, "thread_extent", nthread_bx) + tx, bx, by, bz = _get_threads(ib, nthread_tx, nthread_bx, nthread_by, nthread_bz) tid = bx * nthread_tx + tx - - by = 
te.thread_axis("blockIdx.y") - bz = te.thread_axis("blockIdx.z") - ib.scope_attr(by, "thread_extent", nthread_by) - ib.scope_attr(bz, "thread_extent", nthread_bz) - idx = (by * size + tid) * axis_mul_after + bz + idx = (by * axis_mul_after + bz) * size + tid with ib.if_scope(tid < size): - idx = (by * size + tid) * axis_mul_after + bz keys[idx] = keys_swap[idx] if values is not None: values[idx] = values_swap[idx] - return ib.get() - def sort_ir( data, values_out, values_out_swap, axis, is_ascend, indices_out=None, indices_out_swap=None @@ -301,27 +538,30 @@ def sort_ir( assert indices_out_swap is not None indices_out_swap = ib.buffer_ptr(indices_out_swap) - axis_mul_before, axis_mul_after = _sort_init( - ib, - shape, - axis, - data, - values_out, - indices_out, - value_init_func=lambda _, tid: tvm.tir.generic.cast(tid, indices_out.dtype), - ) + with ib.if_scope(shape[axis] > 0): + axis_mul_before, axis_mul_after = _sort_init( + ib, + shape, + axis, + data, + values_out, + indices_out, + value_init_func=lambda _, tid: tvm.tir.generic.cast(tid, indices_out.dtype), + ) + + _sort_common( + ib, + shape[axis], + axis_mul_before, + axis_mul_after, + is_ascend, + values_out, + values_out_swap, + values=indices_out, + values_swap=indices_out_swap, + ) - return _sort_common( - ib, - shape[axis], - axis_mul_before, - axis_mul_after, - is_ascend, - values_out, - values_out_swap, - values=indices_out, - values_swap=indices_out_swap, - ) + return ib.get() def sort_by_key_ir( @@ -376,27 +616,29 @@ def sort_by_key_ir( values_out = ib.buffer_ptr(values_out) values_out_swap = ib.buffer_ptr(values_out_swap) - axis_mul_before, axis_mul_after = _sort_init( - ib, - shape, - axis, - keys_in, - keys_out, - values_out, - value_init_func=lambda idx, _: values_in[idx], - ) - - return _sort_common( - ib, - shape[axis], - axis_mul_before, - axis_mul_after, - is_ascend, - keys_out, - keys_out_swap, - values=values_out, - values_swap=values_out_swap, - ) + with ib.if_scope(shape[axis] > 0): + axis_mul_before, axis_mul_after = _sort_init( + ib, + shape, + axis, + keys_in, + keys_out, + values_out, + value_init_func=lambda idx, _: values_in[idx], + ) + + _sort_common( + ib, + shape[axis], + axis_mul_before, + axis_mul_after, + is_ascend, + keys_out, + keys_out_swap, + values=values_out, + values_swap=values_out_swap, + ) + return ib.get() def sort(data, axis=-1, is_ascend=1): @@ -419,16 +661,29 @@ def sort(data, axis=-1, is_ascend=1): out : tvm.te.Tensor The output of this function. """ + ndim = len(data.shape) + axis = ndim + axis if axis < 0 else axis + if axis != ndim - 1: + # Prepare for sorting along axis -1. + axes = swap(list(range(ndim)), axis) + data = transpose(data, axes) + value_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "value_buf", data_alignment=8) value_buf_swap = tvm.tir.decl_buffer(data.shape, data.dtype, "value_buf_swap", data_alignment=8) + out = te.extern( [data.shape, data.shape], [data], - lambda ins, outs: sort_ir(ins[0], outs[0], outs[1], axis, is_ascend), + lambda ins, outs: sort_ir(ins[0], outs[0], outs[1], -1, is_ascend), out_buffers=[value_buf, value_buf_swap], name="sort_gpu", tag="sort_gpu", )[0] + + if axis != ndim - 1: + axes = swap(list(range(ndim)), axis) + out = transpose(out, axes) + return out @@ -507,10 +762,18 @@ def argsort(data, axis=-1, is_ascend=1, dtype="float32"): out : tvm.te.Tensor The output of this function. """ + ndim = len(data.shape) + axis = ndim + axis if axis < 0 else axis + if axis != ndim - 1: + # Prepare for sorting along axis -1. 
+ axes = swap(list(range(ndim)), axis) + data = transpose(data, axes) + value_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "value_buf", data_alignment=8) value_swap_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "value_swap_buf", data_alignment=8) indices_buf = tvm.tir.decl_buffer(data.shape, dtype, "out_buf", data_alignment=8) indices_swap_buf = tvm.tir.decl_buffer(data.shape, dtype, "out_swap_buf", data_alignment=8) + out = te.extern( [data.shape, data.shape, data.shape, data.shape], [data], @@ -518,7 +781,7 @@ def argsort(data, axis=-1, is_ascend=1, dtype="float32"): ins[0], outs[0], outs[2], - axis, + -1, is_ascend, indices_out=outs[1], indices_out_swap=outs[3], @@ -527,6 +790,11 @@ def argsort(data, axis=-1, is_ascend=1, dtype="float32"): name="argsort_gpu", tag="argsort_gpu", )[1] + + if axis != ndim - 1: + axes = swap(list(range(ndim)), axis) + out = transpose(out, axes) + return out @@ -625,21 +893,30 @@ def topk(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64"): ndim = len(data.shape) axis = axis + ndim if axis < 0 else axis assert 0 <= axis < ndim + dshape = data.shape + if axis != ndim - 1: + axes = swap(list(range(ndim)), axis) + data = transpose(data, axes) + values_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "values_buf", data_alignment=8) values_swap_buf = tvm.tir.decl_buffer( data.shape, data.dtype, "values_swap_buf", data_alignment=8 ) indices_buf = tvm.tir.decl_buffer(data.shape, dtype, "indices_buf", data_alignment=8) indices_swap_buf = tvm.tir.decl_buffer(data.shape, dtype, "indies_swap_buf", data_alignment=8) + if ret_type == "values": output = te.extern( [data.shape, data.shape], [data], - lambda ins, outs: sort_ir(ins[0], outs[0], outs[1], axis, is_ascend), + lambda ins, outs: sort_ir(ins[0], outs[0], outs[1], -1, is_ascend), out_buffers=[values_buf, values_swap_buf], name="topk_gpu", tag="topk_gpu", )[0] + if axis != ndim - 1: + axes = swap(list(range(ndim)), axis) + output = transpose(output, axes) else: output = te.extern( [data.shape, data.shape, data.shape, data.shape], @@ -648,7 +925,7 @@ def topk(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64"): ins[0], outs[0], outs[2], - axis, + -1, is_ascend, indices_out=outs[1], indices_out_swap=outs[3], @@ -657,6 +934,11 @@ def topk(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64"): name="topk_gpu", tag="topk_gpu", )[0:2] + if axis != ndim - 1: + axes = swap(list(range(ndim)), axis) + output[0] = transpose(output[0], axes) + output[1] = transpose(output[1], axes) + if isinstance(k, int) and k < 1: if ret_type == "indices": return output[1] @@ -668,7 +950,7 @@ def topk(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64"): if i == axis: end.append(k if isinstance(k, int) else tvm.te.size_var("dim")) else: - end.append(data.shape[i]) + end.append(dshape[i]) if ret_type == "both": values_out, indices_out = output values_out = strided_slice(values_out, beg, end, strides) diff --git a/src/tir/transforms/storage_access.cc b/src/tir/transforms/storage_access.cc index 38143c14b021d..00002d3587dbe 100644 --- a/src/tir/transforms/storage_access.cc +++ b/src/tir/transforms/storage_access.cc @@ -132,6 +132,10 @@ void StorageAccessVisitor::VisitStmt_(const AttrStmtNode* op) { StmtExprVisitor::VisitStmt_(op); } env_threads_.pop_back(); + } else if (op->attr_key == attr::hand_threaded) { + // skip this pass on blocks that were hand_threaded + // this avoids control flow and read/write conflicts + // between hand-threaded kernels and automatic 
threading } else { StmtExprVisitor::VisitStmt_(op); } diff --git a/tests/python/relay/test_op_level6.py b/tests/python/relay/test_op_level6.py index 0dac69e360258..f4b785f59df84 100644 --- a/tests/python/relay/test_op_level6.py +++ b/tests/python/relay/test_op_level6.py @@ -26,6 +26,7 @@ @tvm.testing.uses_gpu def test_sort(): def verify_sort(shape, axis, is_ascend, is_dyn=False): + if is_dyn: x = relay.var("x", relay.TensorType([relay.Any()] * len(shape), "float32")) else: @@ -87,9 +88,11 @@ def verify_argsort(shape, axis, is_ascend, dtype, is_dyn=False): for dtype in ["int32", "int64", "float32", "float64"]: verify_argsort((2, 3, 4), axis=0, is_ascend=False, dtype=dtype, is_dyn=is_dyn) verify_argsort((1, 4, 6), axis=1, is_ascend=True, dtype=dtype, is_dyn=is_dyn) - verify_argsort((3, 5, 6), axis=-1, is_ascend=False, dtype=dtype, is_dyn=is_dyn) - verify_argsort((3, 2000, 6), axis=1, is_ascend=False, dtype=dtype, is_dyn=is_dyn) - verify_argsort((1, 122640), axis=1, is_ascend=False, dtype=dtype, is_dyn=is_dyn) + dtype = "int32" + verify_argsort((3, 5, 6), axis=-1, is_ascend=False, dtype=dtype, is_dyn=is_dyn) + verify_argsort((3, 6000, 6), axis=1, is_ascend=False, dtype=dtype, is_dyn=is_dyn) + verify_argsort((1000, 1, 1), axis=0, is_ascend=False, dtype=dtype, is_dyn=is_dyn) + verify_argsort((1, 122640), axis=1, is_ascend=False, dtype=dtype, is_dyn=is_dyn) @tvm.testing.uses_gpu From 343b689bf894c72fbda5d8ef90d6b27b5a2613fd Mon Sep 17 00:00:00 2001 From: Chris Hoge Date: Tue, 16 Mar 2021 12:19:38 -0700 Subject: [PATCH 27/84] [docs] Getting Started with TVM: TVMC Tutorial (#7640) * Getting Started with TVM: TVMC Tutorial An update of the TVMC tutorial, follows the introduction and installation sections of the new getting started tutorial * Update tutorials/get_started/tvmc_command_line_driver.py Co-authored-by: Leandro Nunes * Style and formatting fixes Co-authored-by: Leandro Nunes --- .../get_started/tvmc_command_line_driver.py | 552 +++++++++++------- 1 file changed, 357 insertions(+), 195 deletions(-) diff --git a/tutorials/get_started/tvmc_command_line_driver.py b/tutorials/get_started/tvmc_command_line_driver.py index bcdf03e568750..fffbfbf0356fd 100644 --- a/tutorials/get_started/tvmc_command_line_driver.py +++ b/tutorials/get_started/tvmc_command_line_driver.py @@ -15,31 +15,33 @@ # specific language governing permissions and limitations # under the License. """ -Getting Started with TVM command line driver - TVMC -=================================================== +Compiling and Optimizing a Model with TVMC +========================================== **Authors**: `Leandro Nunes `_, -`Matthew Barrett `_ - -This tutorial is an introduction to working with TVMC, the TVM command -line driver. TVMC is a tool that exposes TVM features such as -auto-tuning, compiling, profiling and execution of models, via a -command line interface. - -In this tutorial we are going to use TVMC to compile, run and tune a -ResNet-50 on a x86 CPU. - -We are going to start by downloading ResNet 50 V2. Then, we are going -to use TVMC to compile this model into a TVM module, and use the -compiled module to generate predictions. Finally, we are going to experiment -with the auto-tuning options, that can be used to help the compiler to -improve network performance. - -The final goal is to give an overview of TVMC's capabilities and also -some guidance on where to look for more information. +`Matthew Barrett `_, +`Chris Hoge `_ + +In this section, we will work with TVMC, the TVM command line driver. 
TVMC is a +tool that exposes TVM features such as auto-tuning, compiling, profiling and +execution of models through a command line interface. + +Upon completion of this section, we will have used TVMC to accomplish the +following tasks: + +* Compile a pre-trained ResNet 50 v2 model for the TVM runtime. +* Run a real image through the compiled model, and interpret the output and + model performance. +* Tune the model on a CPU using TVM. +* Re-compile an optimized model using the tuning data collected by TVM. +* Run the image through the optimized model, and compare the output and model + performance. + +The goal of this section is to give you an overview of TVM and TVMC's +capabilities, and set the stage for understanding how TVM works. """ -###################################################################### +################################################################################ # Using TVMC # ---------- # @@ -61,32 +63,35 @@ # # tvmc --help # -# -# As you can see in the help page, the main features are -# accessible via the subcommands ``tune``, ``compile`` and ``run``. -# To read about specific options under a given subcommand, use -# ``tvmc --help``. -# -# In the following sections we will use TVMC to tune, compile and -# run a model. But first, we need a model. +# The main features of TVM available to ``tvmc`` are from subcommands +# ``compile``, and ``run``, and ``tune``. To read about specific options under +# a given subcommand, use ``tvmc --help``. We will cover each of +# these commands in this tutorial, but first we need to download a pre-trained +# model to work with. # -###################################################################### -# Obtaining the model +################################################################################ +# Obtaining the Model # ------------------- # -# We are going to use ResNet-50 V2 as an example to experiment with TVMC. -# The version below is in ONNX format. To download the file, you can use -# the command below: +# For this tutorial, we will be working with ResNet-50 v2. ResNet-50 is a +# convolutional neural network that is 50-layers deep and designed to classify +# images. The model we will be using has been pre-trained on more than a +# million images with 1000 different classifications. The network has an input +# image size of 224x224. If you are interested exploring more of how the +# ResNet-50 model is structured, we recommend downloading `Netron +# `, a freely available ML model viewer. +# +# For this tutorial we will be using the model in ONNX format. # # .. code-block:: bash # # wget https://github.com/onnx/models/raw/master/vision/classification/resnet/model/resnet50-v2-7.onnx # -# -###################################################################### + +################################################################################ # .. note:: Supported model formats # # TVMC supports models created with Keras, ONNX, TensorFlow, TFLite @@ -96,241 +101,398 @@ # -###################################################################### -# Compiling the model -# ------------------- +################################################################################ +# Compiling an ONNX Model to the TVM Runtime +# ------------------------------------------ # -# The next step once we've downloaded ResNet-50, is to compile it, -# To accomplish that, we are going to use ``tvmc compile``. The -# output we get from the compilation process is a TAR package, -# that can be used to run our model on the target device. 
+# Once we've downloaded the ResNet-50 model, the next step is to compile it. To
+# accomplish that, we are going to use ``tvmc compile``. The output we get from
+# the compilation process is a TAR package of the model compiled to a dynamic
+# library for our target platform. We can run that model on our target device
+# using the TVM runtime.
#
# .. code-block:: bash
#
#   tvmc compile \
-#   --target "llvm" \
-#   --output compiled_module.tar \
-#   resnet50-v2-7.onnx
+#   --target "llvm" \
+#   --output resnet50-v2-7-tvm.tar \
+#   resnet50-v2-7.onnx
#
-# Once compilation finishes, the output ``compiled_module.tar`` will be created. This
-# can be directly loaded by your application and run via the TVM runtime APIs.
+# Let's take a look at the files that ``tvmc compile`` creates in the module:
#
+# .. code-block:: bash
+#
+#   mkdir model
+#   tar -xvf resnet50-v2-7-tvm.tar -C model
+#   ls model
+#
+# You will see three files listed.
+#
+# * ``mod.so`` is the model, represented as a C++ library, that can be loaded
+#   by the TVM runtime.
+# * ``mod.json`` is a text representation of the TVM Relay computation graph.
+# * ``mod.params`` is a file containing the parameters for the pre-trained
+#   model.
+#
+# This module can be directly loaded by your application, and the model can be
+# run via the TVM runtime APIs (a minimal sketch of doing so appears in a note
+# at the end of this tutorial).

-######################################################################
+################################################################################
# .. note:: Defining the Correct Target
#
#   Specifying the correct target (option ``--target``) can have a huge
#   impact on the performance of the compiled module, as it can take
#   advantage of hardware features available on the target. For more
#   information, please refer to `Auto-tuning a convolutional network
#   for x86 CPU `_.
+#   We recommend identifying which CPU you are running, along with optional
+#   features, and setting the target appropriately.
#

-######################################################################
-#
-# In the next step, we are going to use the compiled module, providing it
-# with some inputs, to generate some predictions.
-#
-

-######################################################################
-# Input pre-processing
-# --------------------
+################################################################################
+# Running the Model from the Compiled Module with TVMC
+# ----------------------------------------------------
#
-# In order to generate predictions, we will need two things:
+# Now that we've compiled the model to this module, we can use the TVM runtime
+# to make predictions with it. TVMC has the TVM runtime built into it,
+# allowing you to run compiled TVM models. To use TVMC to run the model and
+# make predictions, we need two things:
#
-# - the compiled module, which we just produced;
-# - a valid input to the model
+# - The compiled module, which we just produced.
+# - Valid input to the model to make predictions on.
#
-# Each model is particular when it comes to expected tensor shapes, formats and data
-# types. For this reason, most models require some pre and
-# post processing, to ensure the input(s) is valid and to interpret the output(s).
+# Each model is particular when it comes to expected tensor shapes, formats and
+# data types. For this reason, most models require some pre and
+# post-processing, to ensure the input is valid and to interpret the output.
+# TVMC has adopted NumPy's ``.npz`` format for both input and output data. This
+# is a well-supported NumPy format to serialize multiple arrays into a file.
#
-# In TVMC, we adopted NumPy's ``.npz`` format for both input and output data.
-# This is a well-supported NumPy format to serialize multiple arrays into a file.
-#
-# We will use the usual cat image, similar to other TVM tutorials:
+# As input for this tutorial, we will use the image of a cat, but feel free to
+# substitute an image of your choosing.
#
# .. image:: https://s3.amazonaws.com/model-server/inputs/kitten.jpg
#    :height: 224px
#    :width: 224px
#    :align: center
+
+
+################################################################################
+# Input pre-processing
+# ~~~~~~~~~~~~~~~~~~~~
#
# For our ResNet 50 V2 model, the input is expected to be in ImageNet format.
# Here is an example of a script to pre-process an image for ResNet 50 V2.
#
-from tvm.contrib.download import download_testdata
-from PIL import Image
-import numpy as np
-
-img_url = "https://s3.amazonaws.com/model-server/inputs/kitten.jpg"
-img_path = download_testdata(img_url, "imagenet_cat.png", module="data")
-
-# Resize it to 224x224
-resized_image = Image.open(img_path).resize((224, 224))
-img_data = np.asarray(resized_image).astype("float32")
-
-# ONNX expects NCHW input, so convert the array
-img_data = np.transpose(img_data, (2, 0, 1))
-
-# Normalize according to ImageNet
-imagenet_mean = np.array([0.485, 0.456, 0.406])
-imagenet_stddev = np.array([0.229, 0.224, 0.225])
-norm_img_data = np.zeros(img_data.shape).astype("float32")
-for i in range(img_data.shape[0]):
-    norm_img_data[i, :, :] = (img_data[i, :, :] / 255 - imagenet_mean[i]) / imagenet_stddev[i]
-
-# Add batch dimension
-img_data = np.expand_dims(norm_img_data, axis=0)
-
-# Save to .npz (outputs imagenet_cat.npz)
-np.savez("imagenet_cat", data=img_data)
-
+# .. code-block:: python
+#     :caption: preprocess.py
+#     :name: preprocess.py
+#
+#     #!python ./preprocess.py
+#     from tvm.contrib.download import download_testdata
+#     from PIL import Image
+#     import numpy as np
+#
+#     img_url = "https://s3.amazonaws.com/model-server/inputs/kitten.jpg"
+#     img_path = download_testdata(img_url, "imagenet_cat.png", module="data")
+#
+#     # Resize it to 224x224
+#     resized_image = Image.open(img_path).resize((224, 224))
+#     img_data = np.asarray(resized_image).astype("float32")
+#
+#     # ONNX expects NCHW input, so convert the array
+#     img_data = np.transpose(img_data, (2, 0, 1))
+#
+#     # Normalize according to ImageNet
+#     imagenet_mean = np.array([0.485, 0.456, 0.406])
+#     imagenet_stddev = np.array([0.229, 0.224, 0.225])
+#     norm_img_data = np.zeros(img_data.shape).astype("float32")
+#     for i in range(img_data.shape[0]):
+#         norm_img_data[i, :, :] = (img_data[i, :, :] / 255 - imagenet_mean[i]) / imagenet_stddev[i]
+#
+#     # Add batch dimension
+#     img_data = np.expand_dims(norm_img_data, axis=0)
+#
+#     # Save to .npz (outputs imagenet_cat.npz)
+#     np.savez("imagenet_cat", data=img_data)
+#

-######################################################################
-# Running the compiled module
-# ---------------------------
+################################################################################
+# Running the Compiled Module
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
-# With both the compiled module and input file in hand, we can run it by
-# invoking ``tvmc run``.
+# With both the model and input data in hand, we can now run TVMC to make a
+# prediction:
#
# ..
code-block:: bash # -# tvmc run \ -# --inputs imagenet_cat.npz \ -# --output predictions.npz \ -# compiled_module.tar +# tvmc run \ +# --inputs imagenet_cat.npz \ +# --output predictions.npz \ +# resnet50-v2-7-tvm.tar # -# When running the above command, a new file ``predictions.npz`` should -# be produced. It contains the output tensors. +# Recall that the `.tar` model file includes a C++ library, a description of +# the Relay model, and the parameters for the model. TVMC includes the TVM +# runtime, which can load the model and make predictions against input. When +# running the above command, TVMC outputs a new file, ``predictions.npz``, that +# contains the model output tensors in NumPy format. # # In this example, we are running the model on the same machine that we used -# for compilation. In some cases we might want to run it remotely via -# an RPC Tracker. To read more about these options please check ``tvmc -# run --help``. -# +# for compilation. In some cases we might want to run it remotely via an RPC +# Tracker. To read more about these options please check ``tvmc run --help``. -###################################################################### -# Output post-processing -# ---------------------- +################################################################################ +# Output Post-Processing +# ~~~~~~~~~~~~~~~~~~~~~~ # -# As previously mentioned, each model will have its own particular way -# of providing output tensors. +# As previously mentioned, each model will have its own particular way of +# providing output tensors. # -# In our case, we need to run some post-processing to render the -# outputs from ResNet 50 V2 into a more human-readable form. +# In our case, we need to run some post-processing to render the outputs from +# ResNet 50 V2 into a more human-readable form, using the lookup-table provided +# for the model. # -# The script below shows an example of the post-processing to extract -# labels from the output of our compiled module. +# The script below shows an example of the post-processing to extract labels +# from the output of our compiled module. # -import os.path -import numpy as np - -from scipy.special import softmax - -from tvm.contrib.download import download_testdata - -# Download a list of labels -labels_url = "https://s3.amazonaws.com/onnx-model-zoo/synset.txt" -labels_path = download_testdata(labels_url, "synset.txt", module="data") - -with open(labels_path, "r") as f: - labels = [l.rstrip() for l in f] - -output_file = "predictions.npz" - -# Open the output and read the output tensor -if os.path.exists(output_file): - with np.load(output_file) as data: - scores = softmax(data["output_0"]) - scores = np.squeeze(scores) - ranks = np.argsort(scores)[::-1] - - for rank in ranks[0:5]: - print("class='%s' with probability=%f" % (labels[rank], scores[rank])) - - -######################################################################## -# When running the script, a list of predictions should be printed similar -# the the example below. +# .. 
code-block:: python +# :caption: postprocess.py +# :name: postprocess.py +# +# #!python ./postprocess.py +# import os.path +# import numpy as np +# +# from scipy.special import softmax +# +# from tvm.contrib.download import download_testdata +# +# # Download a list of labels +# labels_url = "https://s3.amazonaws.com/onnx-model-zoo/synset.txt" +# labels_path = download_testdata(labels_url, "synset.txt", module="data") +# +# with open(labels_path, "r") as f: +# labels = [l.rstrip() for l in f] +# +# output_file = "predictions.npz" +# +# # Open the output and read the output tensor +# if os.path.exists(output_file): +# with np.load(output_file) as data: +# scores = softmax(data["output_0"]) +# scores = np.squeeze(scores) +# ranks = np.argsort(scores)[::-1] +# +# for rank in ranks[0:5]: +# print("class='%s' with probability=%f" % (labels[rank], scores[rank])) +# +# Running this script should produce the following output: # # .. code-block:: bash # -# $ python post_processing.py -# class=n02123045 tabby, tabby cat ; probability=446.000000 -# class=n02123159 tiger cat ; probability=675.000000 -# class=n02124075 Egyptian cat ; probability=836.000000 -# class=n02129604 tiger, Panthera tigris ; probability=917.000000 -# class=n04040759 radiator ; probability=213.000000 +# python postprocess.py # +# # class='n02123045 tabby, tabby cat' with probability=0.610553 +# # class='n02123159 tiger cat' with probability=0.367179 +# # class='n02124075 Egyptian cat' with probability=0.019365 +# # class='n02129604 tiger, Panthera tigris' with probability=0.001273 +# # class='n04040759 radiator' with probability=0.000261 +# +# Try replacing the cat image with other images, and see what sort of +# predictions the ResNet model makes. - -###################################################################### -# Tuning the model -# ---------------- +################################################################################ +# Automatically Tuning the ResNet Model +# ------------------------------------- +# +# The previous model was compiled to work on the TVM runtime, but did not +# include any platform specific optimization. In this section, we will show you +# how to build an optimized model using TVMC to target your working platform. # # In some cases, we might not get the expected performance when running -# inferences using our compiled module. In cases like this, we can make use -# of the auto-tuner, to find a better configuration for our model and -# get a boost in performance. -# -# Tuning in TVM refers to the process by which a model is optimized -# to run faster on a given target. This differs from training or -# fine-tuning in that it does not affect the accuracy of the model, -# but only the runtime performance. -# -# As part of the tuning process, TVM will try running many different -# operator implementation variants to see which perform best. The -# results of these runs are stored in a tuning records file, which is +# inferences using our compiled module. In cases like this, we can make use of +# the auto-tuner, to find a better configuration for our model and get a boost +# in performance. Tuning in TVM refers to the process by which a model is +# optimized to run faster on a given target. This differs from training or +# fine-tuning in that it does not affect the accuracy of the model, but only +# the runtime performance. As part of the tuning process, TVM will try running +# many different operator implementation variants to see which perform best. 
+
# The results of these runs are stored in a tuning records file, which is
# ultimately the output of the ``tune`` subcommand.
#
# In the simplest form, tuning requires you to provide three things:
#
-# - the target specification of the device you intend to run this model on;
-# - the path to an output file in which the tuning records will be stored, and finally,
+# - the target specification of the device you intend to run this model on
+# - the path to an output file in which the tuning records will be stored, and
+#   finally
# - a path to the model to be tuned.
#
-#
# The example below demonstrates how that works in practice:
#
# .. code-block:: bash
#
-#   tvmc tune \
+#   tvmc tune \
#   --target "llvm" \
-#   --output autotuner_records.json \
+#   --output resnet50-v2-7-autotuner_records.json \
#   resnet50-v2-7.onnx
#
+# In this example, you will see better results if you indicate a more specific
+# target for the ``--target`` flag. For example, on an Intel i7 processor you
+# could use ``--target llvm -mcpu=skylake``. For this tuning example, we are
+# tuning locally on the CPU using LLVM as the compiler for the specified
+# architecture.
+#
+# TVMC will perform a search against the parameter space for the model, trying
+# out different configurations for operators and choosing the one that runs
+# fastest on your platform. Although this is a guided search based on the CPU
+# and model operations, it can still take several hours to complete the search.
+# The output of this search will be saved to the
+# ``resnet50-v2-7-autotuner_records.json`` file, which will later be used to
+# compile an optimized model.
+#
+# .. note:: Defining the Tuning Search Algorithm
+#
+#   By default this search is guided using an ``XGBoost Grid`` algorithm.
+#   Depending on your model complexity and amount of time available, you might
+#   want to choose a different algorithm. A full list is available by
+#   consulting ``tvmc tune --help``.
+#
+# The output will look something like this for a consumer-level Skylake CPU:
+#
+# .. code-block:: bash
+#
+#   tvmc tune --target "llvm -mcpu=broadwell" --output resnet50-v2-7-autotuner_records.json resnet50-v2-7.onnx
+# # [Task  1/24]  Current/Best:    9.65/  23.16 GFLOPS | Progress: (60/1000) | 130.74 s Done.
+# # [Task  1/24]  Current/Best:    3.56/  23.16 GFLOPS | Progress: (192/1000) | 381.32 s Done.
+# # [Task  2/24]  Current/Best:   13.13/  58.61 GFLOPS | Progress: (960/1000) | 1190.59 s Done.
+# # [Task  3/24]  Current/Best:   31.93/  59.52 GFLOPS | Progress: (800/1000) | 727.85 s Done.
+# # [Task  4/24]  Current/Best:   16.42/  57.80 GFLOPS | Progress: (960/1000) | 559.74 s Done.
+# # [Task  5/24]  Current/Best:   12.42/  57.92 GFLOPS | Progress: (800/1000) | 766.63 s Done.
+# # [Task  6/24]  Current/Best:   20.66/  59.25 GFLOPS | Progress: (1000/1000) | 673.61 s Done.
+# # [Task  7/24]  Current/Best:   15.48/  59.60 GFLOPS | Progress: (1000/1000) | 953.04 s Done.
+# # [Task  8/24]  Current/Best:   31.97/  59.33 GFLOPS | Progress: (972/1000) | 559.57 s Done.
+# # [Task  9/24]  Current/Best:   34.14/  60.09 GFLOPS | Progress: (1000/1000) | 479.32 s Done.
+# # [Task 10/24]  Current/Best:   12.53/  58.97 GFLOPS | Progress: (972/1000) | 642.34 s Done.
+# # [Task 11/24]  Current/Best:   30.94/  58.47 GFLOPS | Progress: (1000/1000) | 648.26 s Done.
+# # [Task 12/24]  Current/Best:   23.66/  58.63 GFLOPS | Progress: (1000/1000) | 851.59 s Done.
+# # [Task 13/24]  Current/Best:   25.44/  59.76 GFLOPS | Progress: (1000/1000) | 534.58 s Done.
+# # [Task 14/24]  Current/Best:   26.83/  58.51 GFLOPS | Progress: (1000/1000) | 491.67 s Done.
+# # [Task 15/24]  Current/Best:   33.64/  58.55 GFLOPS | Progress: (1000/1000) | 529.85 s Done.
# # [Task 16/24]  Current/Best:   14.93/  57.94 GFLOPS | Progress: (1000/1000) | 645.55 s Done.
# # [Task 17/24]  Current/Best:   28.70/  58.19 GFLOPS | Progress: (1000/1000) | 756.88 s Done.
# # [Task 18/24]  Current/Best:   19.01/  60.43 GFLOPS | Progress: (980/1000) | 514.69 s Done.
# # [Task 19/24]  Current/Best:   14.61/  57.30 GFLOPS | Progress: (1000/1000) | 614.44 s Done.
# # [Task 20/24]  Current/Best:   10.47/  57.68 GFLOPS | Progress: (980/1000) | 479.80 s Done.
# # [Task 21/24]  Current/Best:   34.37/  58.28 GFLOPS | Progress: (308/1000) | 225.37 s Done.
# # [Task 22/24]  Current/Best:   15.75/  57.71 GFLOPS | Progress: (1000/1000) | 1024.05 s Done.
# # [Task 23/24]  Current/Best:   23.23/  58.92 GFLOPS | Progress: (1000/1000) | 999.34 s Done.
# # [Task 24/24]  Current/Best:   17.27/  55.25 GFLOPS | Progress: (1000/1000) | 1428.74 s Done.
#
# Tuning sessions can take a long time, so ``tvmc tune`` offers many options to customize your tuning
# process, in terms of number of repetitions (``--repeat`` and ``--number``, for example), the tuning
# algorithm to be used, and so on. Check ``tvmc tune --help`` for more information.
#

################################################################################
# Compiling an Optimized Model with Tuning Data
# ----------------------------------------------
#
# As an output of the tuning process above, we obtained the tuning records
# stored in ``resnet50-v2-7-autotuner_records.json``. This file can be used in
# two ways:
#
# - As input to further tuning (via ``tvmc tune --tuning-records``).
# - As input to the compiler.
#
# The compiler will use the results to generate high performance code for the
# model on your specified target. To do that we can use
# ``tvmc compile --tuning-records``. Check ``tvmc compile --help`` for more
# information.
#
# Now that tuning data for the model has been collected, we can re-compile the
# model using optimized operators to speed up our computations.
#
# .. code-block:: bash
#
#   tvmc compile \
#   --target "llvm" \
#   --tuning-records resnet50-v2-7-autotuner_records.json  \
#   --output resnet50-v2-7-tvm_autotuned.tar \
#   resnet50-v2-7.onnx
#
# Verify that the optimized model runs and produces the same results:
#
# .. code-block:: bash
#
#   tvmc run \
#   --inputs imagenet_cat.npz \
#   --output predictions.npz \
#   resnet50-v2-7-tvm_autotuned.tar
#
#   python postprocess.py
#
# Verifying that the predictions are the same:
#
# .. code-block:: bash
#
#   # class='n02123045 tabby, tabby cat' with probability=0.610550
#   # class='n02123159 tiger cat' with probability=0.367181
#   # class='n02124075 Egyptian cat' with probability=0.019365
#   # class='n02129604 tiger, Panthera tigris' with probability=0.001273
#   # class='n04040759 radiator' with probability=0.000261

################################################################################
# Comparing the Tuned and Untuned Models
# --------------------------------------
#
# TVMC gives you tools for basic performance benchmarking between the models.
# You can specify a number of repetitions, and TVMC will report on the model run
# time (independent of runtime startup). We can get a rough idea of how much
# tuning has improved the model performance. For example, on a test Intel i7
# system, we see that the tuned model runs 47% faster than the untuned model:
#
# .. code-block:: bash
#
-# Tuning sessions can take a long time, so ``tvmc tune`` offers many options to
-# customize your tuning process, in terms of number of repetitions (``--repeat`` and
-# ``--number``, for example), the tuning algorithm to be use, and so on.
-# Check ``tvmc tune --help`` for more information.
+#   tvmc run \
+#   --inputs imagenet_cat.npz \
+#   --output predictions.npz \
+#   --print-time \
+#   --repeat 100 \
+#   resnet50-v2-7-tvm_autotuned.tar
#
-# As an output of the tuning process above, we obtained the tuning records stored
-# in ``autotuner_records.json``. This file can be used in two ways:
+#   # Execution time summary:
+#   # mean (s)   max (s)    min (s)    std (s)
+#   # 0.09219    0.11573    0.08985    0.00315
#
-# - as an input to further tuning (via ``tvmc tune --tuning-records``), or
-# - as an input to the compiler
+#   tvmc run \
+#   --inputs imagenet_cat.npz \
+#   --output predictions.npz \
+#   --print-time \
+#   --repeat 100 \
+#   resnet50-v2-7-tvm.tar
#
-# The compiler will use the results to generate high performance code for the model
-# on your specified target. To do that we can use ``tvmc compile --tuning-records``.
-# Check ``tvmc compile --help`` for more information.
+#   # Execution time summary:
+#   # mean (s)   max (s)    min (s)    std (s)
+#   # 0.19332    0.21997    0.18504    0.00711
#

-######################################################################
+################################################################################
# Final Remarks
# -------------
#
-# In this tutorial, we presented TVMC, a command line driver for TVM.
-# We demonstrated how to compile, run and tune a model, as well
-# as discussed the need for pre and post processing of inputs and outputs.
+# In this tutorial, we presented TVMC, a command line driver for TVM. We
+# demonstrated how to compile, run, and tune a model. We also discussed the
+# need for pre and post-processing of inputs and outputs. After the tuning
+# process, we demonstrated how to compare the performance of the unoptimized
+# and optimized models.
#
# Here we presented a simple example using ResNet 50 V2 locally. However, TVMC
# supports many more features including cross-compilation, remote execution and
# profiling/benchmarking.
#
-# To see what other options are available, please have a look at ``tvmc --help``.
+# To see what other options are available, please have a look at
+# ``tvmc --help``.
#
+# In the next tutorial, `Compiling and Optimizing a Model with the Python
+# AutoScheduler `_, we will cover the same compilation
+# and optimization steps using the Python interface.
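+
+################################################################################
+# .. note:: Loading the Compiled Module from Python
+#
+#   Earlier we noted that the unpacked module can be directly loaded by your
+#   application. The snippet below is a minimal sketch of doing so with the
+#   standard TVM Python runtime APIs. It assumes the archive was unpacked into
+#   the ``model`` directory as shown above, and that the input was saved by
+#   ``preprocess.py`` under the key ``data``; it is illustrative only and not
+#   part of the TVMC command line workflow.
+#
+#   .. code-block:: python
+#
+#       import numpy as np
+#       import tvm
+#       from tvm.contrib import graph_runtime
+#
+#       # Load the compiled C++ library and the JSON graph description.
+#       lib = tvm.runtime.load_module("model/mod.so")
+#       with open("model/mod.json") as f:
+#           graph = f.read()
+#
+#       # Create a graph runtime module on the CPU and load the weights.
+#       module = graph_runtime.create(graph, lib, tvm.cpu())
+#       with open("model/mod.params", "rb") as f:
+#           module.load_params(bytearray(f.read()))
+#
+#       # Feed the pre-processed image, run inference, and fetch the scores.
+#       img_data = np.load("imagenet_cat.npz")["data"]
+#       module.set_input("data", img_data)
+#       module.run()
+#       scores = module.get_output(0).asnumpy()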
From b8b63cd1a40250cdb12c14050b3b1f545eeafa70 Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Tue, 16 Mar 2021 15:29:59 -0700 Subject: [PATCH 28/84] add nvcc support (#7668) --- python/tvm/contrib/cc.py | 16 ++++++++++------ python/tvm/runtime/module.py | 3 +++ src/relay/backend/contrib/codegen_c/codegen_c.h | 2 ++ src/target/source/source_module.cc | 2 +- 4 files changed, 16 insertions(+), 7 deletions(-) diff --git a/python/tvm/contrib/cc.py b/python/tvm/contrib/cc.py index 59a1d11216eef..f48ae395fbcd6 100644 --- a/python/tvm/contrib/cc.py +++ b/python/tvm/contrib/cc.py @@ -192,12 +192,16 @@ def _fcompile(outputs, objects, options=None): def _linux_compile(output, objects, options, compile_cmd="g++", compile_shared=False): cmd = [compile_cmd] - if compile_shared or output.endswith(".so") or output.endswith(".dylib"): - cmd += ["-shared", "-fPIC"] - if sys.platform == "darwin": - cmd += ["-undefined", "dynamic_lookup"] - elif output.endswith(".obj"): - cmd += ["-c"] + if compile_cmd != "nvcc": + if compile_shared or output.endswith(".so") or output.endswith(".dylib"): + cmd += ["-shared", "-fPIC"] + if sys.platform == "darwin": + cmd += ["-undefined", "dynamic_lookup"] + elif output.endswith(".obj"): + cmd += ["-c"] + else: + if compile_shared or output.endswith(".so") or output.endswith(".dylib"): + cmd += ["--shared"] cmd += ["-o", output] if isinstance(objects, str): cmd += [objects] diff --git a/python/tvm/runtime/module.py b/python/tvm/runtime/module.py index 53576a60f32f5..09bef9ecbd6af 100644 --- a/python/tvm/runtime/module.py +++ b/python/tvm/runtime/module.py @@ -339,6 +339,9 @@ def export_library(self, file_name, fcompile=None, addons=None, workspace_dir=No else: assert module.type_key == "c" object_format = "c" + if "cc" in kwargs: + if kwargs["cc"] == "nvcc": + object_format = "cu" has_c_module = True path_obj = os.path.join(workspace_dir, f"lib{index}.{object_format}") module.save(path_obj) diff --git a/src/relay/backend/contrib/codegen_c/codegen_c.h b/src/relay/backend/contrib/codegen_c/codegen_c.h index af835cfca02e8..b81fd14b99c22 100644 --- a/src/relay/backend/contrib/codegen_c/codegen_c.h +++ b/src/relay/backend/contrib/codegen_c/codegen_c.h @@ -343,6 +343,8 @@ class CodegenCBase { std::string dtype; if (runtime::TypeMatch(ttype->dtype, kDLFloat, 32)) { dtype = "float"; + } else if (runtime::TypeMatch(ttype->dtype, kDLFloat, 16)) { + dtype = "half"; } else if (runtime::TypeMatch(ttype->dtype, kDLInt, 32)) { dtype = "int"; } else if (runtime::TypeMatch(ttype->dtype, kDLInt, 64)) { diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc index a7732719a699a..26f1850c0e475 100644 --- a/src/target/source/source_module.cc +++ b/src/target/source/source_module.cc @@ -104,7 +104,7 @@ class CSourceModuleNode : public runtime::ModuleNode { void SaveToFile(const std::string& file_name, const std::string& format) final { std::string fmt = GetFileFormat(file_name, format); std::string meta_file = GetMetaFilePath(file_name); - if (fmt == "c") { + if (fmt == "c" || fmt == "cu") { ICHECK_NE(code_.length(), 0); SaveBinaryToFile(file_name, code_); } else { From e697f03539acce7a6ed78eb7757c207398dd6b72 Mon Sep 17 00:00:00 2001 From: Alexander Pivovarov Date: Tue, 16 Mar 2021 15:30:32 -0700 Subject: [PATCH 29/84] Fix relay.testing.darknet convert_image (#7667) --- python/tvm/relay/testing/darknet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/relay/testing/darknet.py b/python/tvm/relay/testing/darknet.py index 
c0468b7ef6925..e1345043c6bb8 100644
--- a/python/tvm/relay/testing/darknet.py
+++ b/python/tvm/relay/testing/darknet.py
@@ -31,7 +31,7 @@ def convert_image(image):
     """Convert the image with numpy."""
     imagex = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-    imagex = np.array(image)
+    imagex = np.array(imagex)
     imagex = imagex.transpose((2, 0, 1))
     imagex = np.divide(imagex, 255.0)
     imagex = np.flip(imagex, 0)

From 4abbe4902e451cc5a963b8b60a70e548d48ace62 Mon Sep 17 00:00:00 2001
From: Cody Yu 
Date: Tue, 16 Mar 2021 21:21:21 -0700
Subject: [PATCH 30/84] [Torch] Remove unnecessary reshapes for batch_matmul
 (#7675)

* [Torch] Remove unnecessary reshapes for batch_matmul

* lint

* fix

* reorder

* lint
---
 python/tvm/relay/frontend/pytorch.py | 29 +++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py
index c709e2b4e7bd5..fd0a07e35c155 100644
--- a/python/tvm/relay/frontend/pytorch.py
+++ b/python/tvm/relay/frontend/pytorch.py
@@ -1094,8 +1094,7 @@ def instance_norm(self, inputs, input_types):
             data, gamma, beta, axis=1, epsilon=epsilon, center=center, scale=scale
         )

-    @staticmethod
-    def get_dims(data):
+    def get_dims(self, data):
         import torch

         if isinstance(data, _expr.Expr):
@@ -1575,15 +1574,31 @@ def matmul(self, inputs, input_types):

         # When performing a batch matmul, we need to properly handle N-dim shapes.
         if len(a_shape) > 2 or len(b_shape) > 2:
-            # Convert a and b into 3 dimensional tensors.
-            a = _op.reshape(inputs_0, [-1, a_shape[-2], a_shape[-1]])
-            b = _op.reshape(inputs_1, [-1, b_shape[-2], b_shape[-1]])
+            # Convert a into a 3 dimensional tensor.
+            need_reshape_output = False
+            if len(a_shape) != 3:
+                a = _op.reshape(inputs_0, [-1, a_shape[-2], a_shape[-1]])
+                need_reshape_output = True
+            else:
+                a = inputs_0
+
             # Transpose matrix dimensions of b.
-            b = _op.transpose(b, [0, 2, 1])
+            trans_axes = list(range(len(b_shape)))
+            trans_axes[-2], trans_axes[-1] = trans_axes[-1], trans_axes[-2]
+            b = _op.transpose(inputs_1, trans_axes)
+
+            # Convert b into a 3 dimensional tensor. Note that the last two dimensions
+            # are transposed.
+            if len(b_shape) != 3:
+                b = _op.reshape(b, [-1, b_shape[-1], b_shape[-2]])
+
             # Perform a batch matmul.
             output = _op.nn.batch_matmul(a, b)
+
             # Reshape output to original dimensions.
-            return _op.reshape(output, [*a_shape[:-2], a_shape[-2], b_shape[-1]])
+            if need_reshape_output:
+                return _op.reshape(output, [*a_shape[:-2], a_shape[-2], b_shape[-1]])
+            return output

         # Otherwise a simple dense op will get the job done.
if len(b_shape) == 1: From c55608f2a123541f8f38bf71267a20831f6f7678 Mon Sep 17 00:00:00 2001 From: masahi Date: Wed, 17 Mar 2021 22:29:01 +0900 Subject: [PATCH 31/84] [SPIRV] Declare int64 capability by default (#7681) --- src/target/spirv/ir_builder.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/target/spirv/ir_builder.cc b/src/target/spirv/ir_builder.cc index 3a9de4e077dc9..5a1457387ae58 100644 --- a/src/target/spirv/ir_builder.cc +++ b/src/target/spirv/ir_builder.cc @@ -48,6 +48,8 @@ void IRBuilder::InitHeader() { header_.push_back(0U); // shader ib_.Begin(spv::OpCapability).Add(spv::CapabilityShader).Commit(&header_); + // Declare int64 capability by default + ib_.Begin(spv::OpCapability).Add(spv::CapabilityInt64).Commit(&header_); // memory model ib_.Begin(spv::OpMemoryModel) .AddSeq(spv::AddressingModelLogical, spv::MemoryModelGLSL450) From 60ff0c79fa90230d8e1fd4d5497229c5f0688e42 Mon Sep 17 00:00:00 2001 From: zhuochen Date: Thu, 18 Mar 2021 00:39:05 +0800 Subject: [PATCH 32/84] [Runtime] Extend Graph Runtime To Support Cuda Graph Launch (#7616) * add graph runtime cuGraph poc * lint format * add unittest * fix review comments * Update CMakeLists.txt Co-authored-by: Cody Yu * build cuda graph runtime in gpu test * Revert "build cuda graph runtime in gpu test" This reverts commit f286711e4126c696860be3ec3d82400ca8542bd5. * rename cuGraph to CUDA Graph * rename cuda_graph * rename cuda_graph * lint format * Update src/runtime/graph/graph_runtime_factory.cc Co-authored-by: Cody Yu * Update python/tvm/testing.py Co-authored-by: Cody Yu * fix lint error * remove unnecessary warn * add test, fix lint * fix lint W0223 Co-authored-by: Cody Yu --- CMakeLists.txt | 1 + cmake/config.cmake | 3 + cmake/modules/CUDA.cmake | 11 ++ python/tvm/contrib/cuda_graph/__init__.py | 16 +++ .../contrib/cuda_graph/cuda_graph_runtime.py | 134 +++++++++++++++++ python/tvm/contrib/nvcc.py | 12 ++ python/tvm/testing.py | 19 +++ .../cuda_graph/graph_runtime_cuda_graph.cc | 135 ++++++++++++++++++ src/runtime/graph/graph_runtime_factory.cc | 33 +++++ src/runtime/graph/graph_runtime_factory.h | 8 ++ .../unittest/test_runtime_graph_cuda_graph.py | 100 +++++++++++++ .../test_runtime_module_based_interface.py | 30 ++++ 12 files changed, 502 insertions(+) create mode 100644 python/tvm/contrib/cuda_graph/__init__.py create mode 100644 python/tvm/contrib/cuda_graph/cuda_graph_runtime.py create mode 100644 src/runtime/graph/cuda_graph/graph_runtime_cuda_graph.cc create mode 100644 tests/python/unittest/test_runtime_graph_cuda_graph.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 451b6a7ee2c28..16968ce41f709 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -35,6 +35,7 @@ tvm_option(USE_THREADS "Build with thread support" ON) tvm_option(USE_LLVM "Build with LLVM, can be set to specific llvm-config path" OFF) tvm_option(USE_STACKVM_RUNTIME "Include stackvm into the runtime" OFF) tvm_option(USE_GRAPH_RUNTIME "Build with tiny graph runtime" ON) +tvm_option(USE_GRAPH_RUNTIME_CUDA_GRAPH "Build with tiny graph runtime with CUDA Graph for GPUs" OFF) tvm_option(USE_PROFILER "Build profiler for the VM and graph runtime" ON) tvm_option(USE_OPENMP "Build with OpenMP thread pool implementation" OFF) tvm_option(USE_RELAY_DEBUG "Building Relay in debug mode..." OFF) diff --git a/cmake/config.cmake b/cmake/config.cmake index 65859566a6640..60c718c97bc14 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -99,6 +99,9 @@ set(USE_STACKVM_RUNTIME OFF) # Whether enable tiny embedded graph runtime. 
set(USE_GRAPH_RUNTIME ON) +# Whether enable tiny graph runtime with CUDA Graph +set(USE_GRAPH_RUNTIME_CUDA_GRAPH OFF) + # Whether to enable the profiler for the graph runtime and vm set(USE_PROFILER ON) diff --git a/cmake/modules/CUDA.cmake b/cmake/modules/CUDA.cmake index 0ec2f1466bd1f..262a4e6e71239 100644 --- a/cmake/modules/CUDA.cmake +++ b/cmake/modules/CUDA.cmake @@ -65,6 +65,17 @@ if(USE_CUDA) list(APPEND RUNTIME_SRCS ${CONTRIB_THRUST_SRC}) endif(USE_THRUST) + if(USE_GRAPH_RUNTIME_CUDA_GRAPH) + if(NOT USE_GRAPH_RUNTIME) + message(FATAL_ERROR "CUDA Graph is only supported by graph runtime, please set USE_GRAPH_RUNTIME=ON") + endif() + if(CUDAToolkit_VERSION_MAJOR LESS "10") + message(FATAL_ERROR "CUDA Graph requires CUDA 10 or above, got=" ${CUDAToolkit_VERSION}) + endif() + message(STATUS "Build with Graph runtime with CUDA Graph support...") + file(GLOB RUNTIME_CUDA_GRAPH_SRCS src/runtime/graph/cuda_graph/*.cc) + list(APPEND RUNTIME_SRCS ${RUNTIME_CUDA_GRAPH_SRCS}) + endif() else(USE_CUDA) list(APPEND COMPILER_SRCS src/target/opt/build_cuda_off.cc) endif(USE_CUDA) diff --git a/python/tvm/contrib/cuda_graph/__init__.py b/python/tvm/contrib/cuda_graph/__init__.py new file mode 100644 index 0000000000000..13a83393a9124 --- /dev/null +++ b/python/tvm/contrib/cuda_graph/__init__.py @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/python/tvm/contrib/cuda_graph/cuda_graph_runtime.py b/python/tvm/contrib/cuda_graph/cuda_graph_runtime.py new file mode 100644 index 0000000000000..45ec89d37b3d8 --- /dev/null +++ b/python/tvm/contrib/cuda_graph/cuda_graph_runtime.py @@ -0,0 +1,134 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Graph runtime with CUDA Graph""" +import tvm._ffi + +from tvm._ffi.base import string_types +from tvm.contrib import graph_runtime + + +def create(graph_json_str, libmod, ctx): + """Create a runtime executor module given a graph and module. + + Parameters + ---------- + graph_json_str : str + The graph to be deployed in json format output by json graph. 
+
+        The graph can contain operator(tvm_op) that points to the name
+        of PackedFunc in the libmod.
+
+    libmod : tvm.runtime.Module
+        The module of the corresponding function
+
+    ctx : TVMContext
+        The context to deploy the module, only supports CUDA GPU
+
+    Returns
+    -------
+    graph_module : GraphModuleCudaGraph
+        CUDA graph runtime module that can be used to execute the graph.
+
+    Note
+    ----
+    See also :py:class:`tvm.contrib.cuda_graph.cuda_graph_runtime.GraphModuleCudaGraph`
+    for examples to directly construct a GraphModuleCudaGraph from an exported
+    relay compiled library.
+    """
+    assert isinstance(graph_json_str, string_types)
+    try:
+        ctx, num_rpc_ctx, device_type_id = graph_runtime.get_device_ctx(libmod, ctx)
+        if num_rpc_ctx == len(ctx):
+            fcreate = ctx[0]._rpc_sess.get_function("tvm.graph_runtime_cuda_graph.create")
+        else:
+            fcreate = tvm._ffi.get_global_func("tvm.graph_runtime_cuda_graph.create")
+    except ValueError:
+        raise ValueError(
+            "To enable CUDA graph support (experimental), please set "
+            "'(USE_GRAPH_RUNTIME_CUDA_GRAPH ON)' in config.cmake and rebuild TVM"
+        )
+
+    return GraphModuleCudaGraph(fcreate(graph_json_str, libmod, *device_type_id))
+
+
+class GraphModuleCudaGraph(graph_runtime.GraphModule):
+    """CUDA graph runtime module.
+
+    This is a CUDA graph runtime wrapper over the TVM runtime.
+    Runtime interfaces are wrapped with CUDA graph functionalities.
+
+    Parameters
+    ----------
+    module : Module
+        The internal tvm module that holds the actual graph functions.
+    """
+
+    def __init__(self, module):
+        self._start_capture = module["start_capture"]
+        self._end_capture = module["end_capture"]
+        self._run_cuda_graph = module["run_cuda_graph"]
+        self._cuda_graph_captured = False
+        graph_runtime.GraphModule.__init__(self, module)
+
+    def capture_cuda_graph(self):
+        """Capture a CUDA graph for tvm_op graph
+
+        This should be called before run_cuda_graph() to capture and
+        instantiate a CUDA graph instance.
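+
+        A minimal usage sketch, mirroring the unit test added later in this
+        patch (``gmod`` is assumed to be a module created with
+        ``cuda_graph_runtime.create``, and ``x`` names the graph input)::
+
+            gmod.capture_cuda_graph()
+            gmod.set_input(x=input_array)
+            gmod.run_cuda_graph()
+            out = gmod.get_output(0, tvm.nd.empty(shape))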
+        """
+        self._run()  # call cuModuleLoadData before cudaStream API
+        self._start_capture()
+        self._run()
+        self._end_capture()
+        self._cuda_graph_captured = True
+
+    def run_cuda_graph(self):
+        """Run the CUDA graph for tvm_op graph
+
+        Run the captured CUDA graph instance instead of the
+        for-loop kernel launch of default graph runtime
+        """
+        self._run_cuda_graph()
+
+    def run(self, **input_dict):
+        """A run wrapper for graph capture / launch, user can just
+        change default graph runtime to cuda graph runtime, and
+        the first call will capture a cuda graph for future launch
+
+        Parameters
+        ----------
+        input_dict: dict of str to NDArray
+            Dict of input names to input values, fed to the module before running
+        """
+        if input_dict:
+            self.set_input(**input_dict)
+        if not self._cuda_graph_captured:
+            self.capture_cuda_graph()
+        else:
+            self._run_cuda_graph()
+
+    def debug_get_output(self, node, out):
+        """Run graph up to node and get the output to out
+
+        Parameters
+        ----------
+        node : int / str
+            The node index or name
+
+        out : NDArray
+            The output array container
+        """
+        raise NotImplementedError("Please use debugger.debug_runtime as graph_runtime instead.")
diff --git a/python/tvm/contrib/nvcc.py b/python/tvm/contrib/nvcc.py
index 7e49f55e8d327..99844f799d7ab 100644
--- a/python/tvm/contrib/nvcc.py
+++ b/python/tvm/contrib/nvcc.py
@@ -349,6 +349,18 @@ def have_tensorcore(compute_version=None, target=None):
     return False


+def have_cudagraph():
+    """Whether CUDA Graph support is provided"""
+    try:
+        cuda_path = find_cuda_path()
+        cuda_ver = get_cuda_version(cuda_path)
+        if cuda_ver < 10.0:
+            return False
+        return True
+    except RuntimeError:
+        return False
+
+
 def have_bf16(compute_version):
     """Either bf16 support is provided in the compute capability or not

diff --git a/python/tvm/testing.py b/python/tvm/testing.py
index d65ab23677b5b..1cb43b29c521d 100644
--- a/python/tvm/testing.py
+++ b/python/tvm/testing.py
@@ -514,6 +514,25 @@ def requires_cuda(*args):
     return _compose(args, _requires_cuda)


+def requires_cudagraph(*args):
+    """Mark a test as requiring the CUDA Graph Feature
+
+    This also marks the test as requiring cuda
+
+    Parameters
+    ----------
+    f : function
+        Function to mark
+    """
+    _requires_cudagraph = [
+        pytest.mark.skipif(
+            not nvcc.have_cudagraph(), reason="CUDA Graph is not supported in this environment"
+        ),
+        *requires_cuda(),
+    ]
+    return _compose(args, _requires_cudagraph)
+
+
 def requires_opencl(*args):
     """Mark a test as requiring the OpenCL runtime.

diff --git a/src/runtime/graph/cuda_graph/graph_runtime_cuda_graph.cc b/src/runtime/graph/cuda_graph/graph_runtime_cuda_graph.cc
new file mode 100644
index 0000000000000..ee5e50a3b9d4b
--- /dev/null
+++ b/src/runtime/graph/cuda_graph/graph_runtime_cuda_graph.cc
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file graph_runtime_cuda_graph.cc
+ */
+
+#include <tvm/runtime/registry.h>
+
+#include "../../cuda/cuda_common.h"
+#include "../graph_runtime.h"
+
+namespace tvm {
+namespace runtime {
+
+/*!
+ * \brief Graph runtime with CUDA Graph Support.
+ *
+ * This is the extension of GraphRuntime class used for CUDA graph launch
+ * instead of CUDA kernel launch. CUDA graph launch requires CUDA 10.0 or
+ * above, currently there are two ways of constructing CUDA graphs:
+ * (1) Using CUDA stream capture API to capture a series of operations on
+ * CUDA stream, and automatically generates a graph (2) Building a graph
+ * using CUDA graph API manually. This implementation uses stream capture.
+ */
+class GraphRuntimeCudaGraph : public GraphRuntime {
+ public:
+  /*!
+   * \brief Begin CUDA graph capture on stream, the stream enters capture mode.
+   */
+  void StartCapture() {
+    const TVMContext& ctx = data_entry_[entry_id(0, 0)]->ctx;
+
+    TVMStreamCreate(ctx.device_type, ctx.device_id, &capture_stream_);
+    TVMSetStream(ctx.device_type, ctx.device_id, capture_stream_);
+
+    CUDA_CALL(cudaStreamBeginCapture(static_cast<cudaStream_t>(capture_stream_),
+                                     cudaStreamCaptureModeGlobal));
+  }
+
+  /*!
+   * \brief Launch the instantiated graph on stream
+   */
+  void RunCudaGraph() {
+    cudaStream_t cuStream = static_cast<cudaStream_t>(capture_stream_);
+    CUDA_CALL(cudaGraphLaunch(cuda_graph_exec_, cuStream));
+    CUDA_CALL(cudaStreamSynchronize(cuStream));
+  }
+
+  /*!
+   * \brief End CUDA graph capture on stream, a graph will be created and
+   * instantiated.
+   */
+  void EndCapture() {
+    cudaGraph_t graph;
+    CUDA_CALL(cudaStreamEndCapture(static_cast<cudaStream_t>(capture_stream_), &graph));
+
+    cudaGraphNode_t* nodes = NULL;
+    size_t numNodes = 0;
+    CUDA_CALL(cudaGraphGetNodes(graph, nodes, &numNodes));
+    LOG(INFO) << "Num of nodes in the cuda graph created using stream capture API = " << numNodes;
+
+    CUDA_CALL(cudaGraphInstantiate(&cuda_graph_exec_, graph, NULL, NULL, 0));
+  }
+
+  /*!
+   * \brief Get the function based on input.
+   * \param name The function which needs to be invoked.
+   * \param sptr_to_self Packed function pointer.
+   */
+  PackedFunc GetFunction(const std::string& name, const ObjectPtr<Object>& sptr_to_self);
+
+ private:
+  /*! \brief The CUDA stream on which to capture a CUDA graph. */
+  TVMStreamHandle capture_stream_;
+  /*! \brief The captured CUDA graph will be instantiated to this.
*/
+  cudaGraphExec_t cuda_graph_exec_;
+};
+
+PackedFunc GraphRuntimeCudaGraph::GetFunction(const std::string& name,
+                                              const ObjectPtr<Object>& sptr_to_self) {
+  if (name == "run_cuda_graph") {
+    return PackedFunc(
+        [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { this->RunCudaGraph(); });
+  } else if (name == "start_capture") {
+    return PackedFunc(
+        [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { this->StartCapture(); });
+  } else if (name == "end_capture") {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { this->EndCapture(); });
+  } else {
+    return GraphRuntime::GetFunction(name, sptr_to_self);
+  }
+}
+
+Module GraphRuntimeCudaGraphCreate(const std::string& sym_json, const tvm::runtime::Module& m,
+                                   const std::vector<TVMContext>& ctxs,
+                                   PackedFunc lookup_linked_param_func) {
+  auto exec = make_object<GraphRuntimeCudaGraph>();
+  exec->Init(sym_json, m, ctxs, lookup_linked_param_func);
+  return Module(exec);
+}
+
+TVM_REGISTER_GLOBAL("tvm.graph_runtime_cuda_graph.create")
+    .set_body([](TVMArgs args, TVMRetValue* rv) {
+      ICHECK_GE(args.num_args, 4) << "The expected number of arguments for graph_runtime.create is "
+                                     "at least 4, but it has "
+                                  << args.num_args;
+      PackedFunc lookup_linked_param_func;
+      int ctx_start_arg = 2;
+      if (args[2].type_code() == kTVMPackedFuncHandle) {
+        lookup_linked_param_func = args[2];
+        ctx_start_arg++;
+      }
+
+      *rv = GraphRuntimeCudaGraphCreate(args[0], args[1], GetAllContext(args, ctx_start_arg),
+                                        lookup_linked_param_func);
+    });
+}  // namespace runtime
+}  // namespace tvm
diff --git a/src/runtime/graph/graph_runtime_factory.cc b/src/runtime/graph/graph_runtime_factory.cc
index 605d6b0ce892d..1682afa8464a0 100644
--- a/src/runtime/graph/graph_runtime_factory.cc
+++ b/src/runtime/graph/graph_runtime_factory.cc
@@ -72,6 +72,14 @@ PackedFunc GraphRuntimeFactory::GetFunction(
         exec->Import(this->imports_[0]);
         *rv = Module(exec);
       });
+  } else if (name == "cuda_graph_create") {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+      std::vector<TVMContext> contexts;
+      for (int i = 0; i < args.num_args; ++i) {
+        contexts.emplace_back(args[i].operator TVMContext());
+      }
+      *rv = this->CudaGraphRuntimeCreate(contexts);
+    });
   } else {
     return PackedFunc();
   }
@@ -130,6 +138,31 @@ Module GraphRuntimeFactory::DebugRuntimeCreate(const std::vector<TVMContext>& ct
   return mod;
 }

+Module GraphRuntimeFactory::CudaGraphRuntimeCreate(const std::vector<TVMContext>& ctxs) {
+  const PackedFunc* pf = tvm::runtime::Registry::Get("tvm.graph_runtime_cuda_graph.create");
+  ICHECK(pf != nullptr) << "Cannot find function tvm.graph_runtime_cuda_graph.create in registry. "
" + "Did you set(USE_GRAPH_RUNTIME_CUGRAPH=ON)?"; + std::vector unpacked_ctxs; + for (const auto& ctx : ctxs) { + unpacked_ctxs.emplace_back(ctx.device_type); + unpacked_ctxs.emplace_back(ctx.device_id); + } + size_t args_size = unpacked_ctxs.size() + 2; + std::vector values(args_size); + std::vector codes(args_size); + runtime::TVMArgsSetter setter(values.data(), codes.data()); + setter(0, this->graph_json_); + setter(1, this->imports_[0]); + for (size_t i = 0; i < unpacked_ctxs.size(); ++i) { + setter(i + 2, unpacked_ctxs[i]); + } + TVMRetValue rv; + pf->CallPacked(TVMArgs(values.data(), codes.data(), args_size), &rv); + Module mod = rv.operator Module(); + SetParams(const_cast(mod.as()), this->params_); + return mod; +} + Module GraphRuntimeFactoryModuleLoadBinary(void* strm) { dmlc::Stream* stream = static_cast(strm); std::string graph_json; diff --git a/src/runtime/graph/graph_runtime_factory.h b/src/runtime/graph/graph_runtime_factory.h index 98fb27c43ea21..f2f11ee66802a 100644 --- a/src/runtime/graph/graph_runtime_factory.h +++ b/src/runtime/graph/graph_runtime_factory.h @@ -89,6 +89,14 @@ class TVM_DLL GraphRuntimeFactory : public runtime::ModuleNode { */ Module DebugRuntimeCreate(const std::vector& ctxs); + /*! + * \brief Create a specific cuda graph runtime module + * \param ctxs The context of the host and devices where graph nodes will be + * executed on. + * \return created cuda graph runtime module + */ + Module CudaGraphRuntimeCreate(const std::vector& ctx); + /*! * \brief Set params. * \param graph_runtime The graph runtime we want to set the params into. diff --git a/tests/python/unittest/test_runtime_graph_cuda_graph.py b/tests/python/unittest/test_runtime_graph_cuda_graph.py new file mode 100644 index 0000000000000..4a31873cb93cb --- /dev/null +++ b/tests/python/unittest/test_runtime_graph_cuda_graph.py @@ -0,0 +1,100 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+import json +import os +import re +import sys +import time + +import pytest + +import tvm +import tvm.testing +from tvm import te +import numpy as np + +from tvm.contrib import utils, graph_runtime +from tvm.contrib.cuda_graph import cuda_graph_runtime + + +bx = te.thread_axis("blockIdx.x") +tx = te.thread_axis("threadIdx.x") + + +@tvm.testing.requires_cudagraph +def test_graph_simple(): + n = 32 + A = te.placeholder((n,), name="A") + B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B") + s = te.create_schedule(B.op) + xo, xi = s[B].split(B.op.axis[0], factor=8) + s[B].bind(xo, bx) + s[B].bind(xi, tx) + + node0 = {"op": "null", "name": "x", "inputs": []} + node1 = { + "op": "tvm_op", + "name": "add", + "inputs": [[0, 0, 0]], + "attrs": {"func_name": "myadd", "flatten_data": "1", "num_inputs": "1", "num_outputs": "1"}, + } + nodes = [node0, node1] + arg_nodes = [0] + node_row_ptr = [0, 1, 2] + outputs = [[1, 0, 0]] + shape = (n,) + attrs = { + "shape": ["list_shape", [shape, shape]], + "dltype": ["list_str", ["float32", "float32"]], + "storage_id": ["list_int", [0, 1]], + } + graph = { + "nodes": nodes, + "arg_nodes": arg_nodes, + "node_row_ptr": node_row_ptr, + "heads": outputs, + "attrs": attrs, + } + graph = json.dumps(graph) + + def check_verify(): + mlib = tvm.build(s, [A, B], "cuda", name="myadd") + ctx = tvm.gpu(0) + try: + mod = cuda_graph_runtime.create(graph, mlib, ctx) + except ValueError: + return + + for i in range(3): + a = np.random.uniform(size=(n,)).astype(A.dtype) + mod.run(x=a) # The first run captured a CUDA graph + out = mod.get_output(0, tvm.nd.empty((n,))) + np.testing.assert_equal(out.asnumpy(), a + 1) + + # capture / run CUDA graph manually + mod.capture_cuda_graph() + a = np.random.uniform(size=(n,)).astype(A.dtype) + mod.set_input(x=a) + mod.run_cuda_graph() + out = mod.get_output(0, tvm.nd.empty((n,))) + np.testing.assert_equal(out.asnumpy(), a + 1) + + check_verify() + + +if __name__ == "__main__": + test_graph_simple() diff --git a/tests/python/unittest/test_runtime_module_based_interface.py b/tests/python/unittest/test_runtime_module_based_interface.py index a34fe4a062cb6..930011d4fd333 100644 --- a/tests/python/unittest/test_runtime_module_based_interface.py +++ b/tests/python/unittest/test_runtime_module_based_interface.py @@ -20,6 +20,7 @@ import tvm from tvm.contrib import graph_runtime from tvm.contrib.debugger import debug_runtime +from tvm.contrib.cuda_graph import cuda_graph_runtime import tvm.testing @@ -538,6 +539,35 @@ def test_debug_graph_runtime(): tvm.testing.assert_allclose(out, verify(data), atol=1e-5) +@tvm.testing.requires_cudagraph +def test_cuda_graph_runtime(): + mod, params = relay.testing.synthetic.get_workload() + with tvm.transform.PassContext(opt_level=3): + complied_graph_lib = relay.build_module.build(mod, "cuda", params=params) + data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32") + + ctx = tvm.gpu() + try: + gmod = complied_graph_lib["cuda_graph_create"](ctx) + except: + print("Skip because cuda_graph not enabled") + return + set_input = gmod["set_input"] + run = gmod["run"] + get_output = gmod["get_output"] + set_input("data", tvm.nd.array(data)) + run() + out = get_output(0).asnumpy() + tvm.testing.assert_allclose(out, verify(data), atol=1e-5) + + # cuda graph runtime wrapper + cu_gmod = cuda_graph_runtime.GraphModuleCudaGraph(gmod) + cu_gmod.set_input("data", data) + cu_gmod.run() + out = cu_gmod.get_output(0).asnumpy() + tvm.testing.assert_allclose(out, verify(data), atol=1e-5) + + def 
test_multiple_imported_modules(): def make_func(symbol): n = tvm.te.size_var("n") From 5c460ff45a31939e8e9c7e83cf00184053809788 Mon Sep 17 00:00:00 2001 From: Yizhi Liu Date: Wed, 17 Mar 2021 10:59:34 -0700 Subject: [PATCH 33/84] [COMMUNITY] @areusch -> Committer (#7679) --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index f72220d07f16f..eb2af2151acc8 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -60,6 +60,7 @@ We do encourage everyone to work anything they are interested in. - [Thierry Moreau](https://github.com/tmoreau89) (PMC): @tmoreau89 - vta - [Kazutaka Morita](https://github.com/kazum): @kazum - frontends, opencl - [Krzysztof Parzyszek](https://github.com/kparzysz-quic): @kparzysz-quic - hexagon, llvm +- [Andrew Reusch](https://github.com/areusch): @areusch - runtime, µTVM - [Jared Roesch](https://github.com/jroesch) (PMC): @jroesch - relay - [Siju Samuel](https://github.com/siju-samuel): @siju-samuel - frontends - [Siva](https://github.com/srkreddy1238): @srkreddy1238 - frontends, golang From c871784f3eaaab960b6d29ac9b31bb287eb9e588 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 18 Mar 2021 02:45:49 +0800 Subject: [PATCH 34/84] [Frontend,TOPI] Improve dynamism for BatchMatmul and Dense (#7496) * [TOPI] Dense cuda schedule support dynamic dimension * [TOPI] batch_matmul cublas te computation support dynamism * [Frontend] tensorflow frontend: dynamic support for BatchMatmul * [TOPI] nn batch_matmul te computation support dynamism * fix CI * Update python/tvm/topi/nn/batch_matmul.py Co-authored-by: Cody Yu * Update python/tvm/topi/cuda/batch_matmul.py Co-authored-by: Cody Yu * remove concat_dynamic_shape function * update topi dense op integer checking * fix ci * Update python/tvm/relay/frontend/tensorflow.py Co-authored-by: Cody Yu * Update batch_matmul.py * [Frontend] add test for batch_matmul in dynamic shaped case Co-authored-by: Cody Yu --- python/tvm/relay/frontend/tensorflow.py | 54 +++++++++++++++---- python/tvm/topi/cuda/batch_matmul.py | 7 +-- python/tvm/topi/cuda/dense.py | 11 ++-- .../frontend/tensorflow/test_forward.py | 52 +++++++++++++++++- 4 files changed, 104 insertions(+), 20 deletions(-) diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index f56d187b6a632..1946223a50a41 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -44,6 +44,17 @@ __all__ = ["from_tensorflow"] +def check_symbolic_shape(shape): + return not all([isinstance(dim, (int, tvm.tir.IntImm)) for dim in shape]) + + +def list_shape_of(tensor, ndim): + shape_tensor = _op.shape_of(tensor) + return [ + _op.strided_slice(shape_tensor, begin=[i], end=[i + 1], strides=[1]) for i in range(ndim) + ] + + def _get_pad_pair(input1d, kernel1d, stride1d): if input1d % stride1d == 0: pad = max(kernel1d - stride1d, 0) @@ -1022,13 +1033,31 @@ def _impl(inputs, attr, params, mod): input_y = inputs[1] orig_shape_x = _infer_shape(input_x, mod) orig_shape_y = _infer_shape(input_y, mod) + ndim = len(orig_shape_x) + + is_static = not check_symbolic_shape(orig_shape_x) + + if ndim > 3 and not is_static: + shape_of_x = list_shape_of(inputs[0], ndim) + shape_of_y = list_shape_of(inputs[1], ndim) # reshape n-dimensional batch matmul into 3d - if len(orig_shape_x) > 3: + if ndim > 3: outer_dims = [orig_shape_x[i] for i in range(0, len(orig_shape_x) - 2)] - num_outer_elts = np.prod(outer_dims) - new_shape_x = (num_outer_elts, orig_shape_x[-2], orig_shape_x[-1]) - 
new_shape_y = (num_outer_elts, orig_shape_y[-2], orig_shape_y[-1]) + if is_static: + num_outer_elts = np.prod(outer_dims) + new_shape_x = (num_outer_elts, orig_shape_x[-2], orig_shape_x[-1]) + new_shape_y = (num_outer_elts, orig_shape_y[-2], orig_shape_y[-1]) + else: # handle dynamic shape (dyn.reshape op) + # new shape = [prod(shape[:-2]), -2, -1] + new_shape_x = [_op.const(1), shape_of_x[-2], shape_of_x[-1]] + new_shape_y = [_op.const(1), shape_of_y[-2], shape_of_y[-1]] + for i in range(ndim - 2): + new_shape_x[0] *= shape_of_x[i] + new_shape_y[0] *= shape_of_y[i] + new_shape_x = _op.concatenate(_op.Tuple(new_shape_x), axis=0) + new_shape_y = _op.concatenate(_op.Tuple(new_shape_y), axis=0) + input_x = _op.reshape(input_x, newshape=new_shape_x) input_y = _op.reshape(input_y, newshape=new_shape_y) @@ -1039,12 +1068,19 @@ def _impl(inputs, attr, params, mod): ret = get_relay_op("batch_matmul")(input_x, input_y) # reshape result back to n-dimensional - if len(orig_shape_x) > 3: - final_shape = list(orig_shape_x) - final_shape[-2] = orig_shape_x[-1] if adj_x else orig_shape_x[-2] - final_shape[-1] = orig_shape_y[-2] if adj_y else orig_shape_y[-1] - ret = _op.reshape(ret, newshape=final_shape) + if ndim > 3: + if is_static: + final_shape = list(orig_shape_x) + final_shape[-2] = orig_shape_x[-1] if adj_x else orig_shape_x[-2] + final_shape[-1] = orig_shape_y[-2] if adj_y else orig_shape_y[-1] + else: + # calculate the resulting shape = [shape[:-2], 0, 0] + final_shape = list(shape_of_x) + final_shape[-2] = shape_of_x[-1] if adj_x else shape_of_x[-2] + final_shape[-1] = shape_of_y[-2] if adj_y else shape_of_y[-1] + final_shape = _op.concatenate(_op.Tuple(final_shape), axis=0) + ret = _op.reshape(ret, newshape=final_shape) return ret return _impl diff --git a/python/tvm/topi/cuda/batch_matmul.py b/python/tvm/topi/cuda/batch_matmul.py index 006b866d6bade..04e484f526d23 100644 --- a/python/tvm/topi/cuda/batch_matmul.py +++ b/python/tvm/topi/cuda/batch_matmul.py @@ -159,9 +159,10 @@ def batch_matmul_cublas(cfg, x, y, out_shape=None): output : tvm.te.Tensor 3-D with shape [batch, M, N] """ - b, m, k = x.shape - b, n, k = y.shape - cfg.add_flop(b * m * k * n * 2) + b, m, k = get_const_tuple(x.shape) + b, n, k = get_const_tuple(y.shape) + if all([isinstance(s, int) for s in [b, m, n, k]]): + cfg.add_flop(b * m * k * n * 2) return cublas.batch_matmul(x, y, False, True) diff --git a/python/tvm/topi/cuda/dense.py b/python/tvm/topi/cuda/dense.py index ad4882ab09f21..8adc38b84b1ba 100644 --- a/python/tvm/topi/cuda/dense.py +++ b/python/tvm/topi/cuda/dense.py @@ -17,7 +17,7 @@ # pylint: disable=invalid-name, unused-argument """Schedule for dense operator""" import logging -from tvm import te, tir +from tvm import te import tvm.autotvm as autotvm from tvm.autotvm.task.space import SplitEntity from tvm.contrib import cublas @@ -39,14 +39,11 @@ def dense_cublas(cfg, data, weight, bias=None, out_dtype=None): if out_dtype is None: out_dtype = data.dtype assert out_dtype == data.dtype, "Mixed precision not supported." 
- batch, in_dim = data.shape - out_dim, _ = weight.shape + batch, in_dim = get_const_tuple(data.shape) + out_dim, _ = get_const_tuple(weight.shape) matmul = cublas.matmul(data, weight, False, True) - if isinstance(batch, int): + if all(isinstance(d, int) for d in [batch, in_dim, out_dim]): cfg.add_flop(batch * in_dim * out_dim * 2) - elif isinstance(batch, tir.IntImm): - cfg.add_flop(batch.value * in_dim * out_dim * 2) - # if we get a te.Var, we cannot add flop counts if bias is not None: matmul = te.compute( (batch, out_dim), lambda i, j: matmul[i, j] + bias[j], tag=tag.BROADCAST diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index fa27dee37699b..22afe8f88f663 100644 --- a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -210,6 +210,7 @@ def compare_tf_with_tvm( mode="graph_runtime", cuda_layout="NCHW", add_shapes_to_graph_def=True, + targets=None, ): """Generic function to generate and compare tensorflow and TVM output""" @@ -233,13 +234,18 @@ def name_without_num(name): tf_output = run_tf_graph(sess, in_data, in_name, out_name) - for device in ["llvm", "cuda"]: + devices = targets if targets else ["llvm", "cuda"] + + for device in devices: ctx = tvm.context(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) continue if no_gpu and device == "cuda": continue + if "cublas" in device and not tvm.get_global_func("tvm.contrib.cublas.matmul", True): + print("Skip because cublas is not enabled: %s" % device) + continue tvm_output = run_tvm_graph( final_graph_def, @@ -1781,6 +1787,23 @@ def _test_batch_matmul(A_shape, B_shape, dtype, adjoint_a=False, adjoint_b=False compare_tf_with_tvm([A_np, B_np], [A.name, B.name], result.name) +def _test_batch_matmul_dynamic( + A_shape, B_shape, A_np_shape, B_np_shape, dtype, adjoint_a=False, adjoint_b=False +): + with tf.Graph().as_default(): + A = tf.placeholder(shape=A_shape, dtype=dtype, name="A") + B = tf.placeholder(shape=B_shape, dtype=dtype, name="B") + result = tf.matmul(A, B, adjoint_a=adjoint_a, adjoint_b=adjoint_b, name="batchmatmul") + + A_np = np.random.uniform(high=5.0, size=A_np_shape).astype(dtype) + B_np = np.random.uniform(high=5.0, size=B_np_shape).astype(dtype) + # for now, in TOPI, only cublas's implementation support dynamic shape + # TODO add more backends support in TOPI + compare_tf_with_tvm( + [A_np, B_np], [A.name, B.name], result.name, mode="vm", targets=["cuda -libs=cublas"] + ) + + def test_forward_batch_matmul(): """ TF op BatchMatMul, BatchMatMulV2 test""" _test_batch_matmul((3, 5, 4), (3, 4, 5), "int32") @@ -1793,6 +1816,33 @@ def test_forward_batch_matmul(): _test_batch_matmul((2, 3, 4, 2, 3, 4, 5, 6), (2, 3, 4, 2, 3, 4, 5, 6), "float32", False, True) +@tvm.testing.requires_cuda +def test_forward_batch_matmul_dynamic(): + _test_batch_matmul_dynamic((None, 5, 4), (None, 4, 5), (3, 5, 4), (3, 4, 5), "int32") + _test_batch_matmul_dynamic( + (None, 5, 4), (None, 4, 5), (3, 5, 4), (3, 4, 5), "float32", True, True + ) + _test_batch_matmul_dynamic( + (None, 5, 4), (None, 5, 4), (3, 5, 4), (3, 5, 4), "int32", True, False + ) + _test_batch_matmul_dynamic( + (None, 5, 4), (None, 5, 4), (3, 5, 4), (3, 5, 4), "float32", False, True + ) + _test_batch_matmul_dynamic( + (None, 4, 5, 6), (None, 4, 6, 5), (3, 4, 5, 6), (3, 4, 6, 5), "float32" + ) + _test_batch_matmul_dynamic( + (None, None, 5, 6), (None, None, 6, 5), (3, 4, 5, 6), (3, 4, 6, 5), "float32" + ) + 
_test_batch_matmul_dynamic( + (None, None, None, 5, 6), + (None, None, None, 6, 5), + (2, 3, 4, 5, 6), + (2, 3, 4, 6, 5), + "float32", + ) + + ####################################################################### # SparseTensorDenseMatMul # ---------------------------------- From ab86aa69e2337cb2f9ead6105f58fed90d7023e2 Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Wed, 17 Mar 2021 21:54:53 -0700 Subject: [PATCH 35/84] [Relay][QNN] Relax simulated qnn tests to prevent flakiness. (#7684) * Relax simulated qnn tests to prevent flakiness. * Change name of helper to make pytest happy. --- .../relay/test_op_qnn_simulated_dequantize.py | 10 +++++----- .../relay/test_op_qnn_simulated_quantize.py | 17 ++++++++++++----- tests/python/topi/python/test_topi_qnn.py | 8 ++++++-- 3 files changed, 23 insertions(+), 12 deletions(-) diff --git a/tests/python/relay/test_op_qnn_simulated_dequantize.py b/tests/python/relay/test_op_qnn_simulated_dequantize.py index 0cc04e4998eb2..a9333c916561e 100644 --- a/tests/python/relay/test_op_qnn_simulated_dequantize.py +++ b/tests/python/relay/test_op_qnn_simulated_dequantize.py @@ -81,7 +81,7 @@ def verify_simulated_dequantize_simple(dtype): dtype = relay.var("dtype", shape=[]) vm = build_simulated_dequantize(input_data, scale, zp, dtype) sim_dq_out = vm.invoke("main", input_data=data_fp, scale=scale_np, zp=zp_np, dtype=dtype_np) - np.testing.assert_equal(sim_dq_out.asnumpy(), dq_out) + np.testing.assert_allclose(sim_dq_out.asnumpy(), dq_out, rtol=1e-5) def test_simulated_dequantize(): @@ -112,7 +112,7 @@ def test_dynamic_channels(): dtype = relay.var("dtype", shape=[]) vm = build_simulated_dequantize(input_data, scale, zp, dtype, axis=0) sim_dq_out = vm.invoke("main", input_data=data_fp, scale=scale_np, zp=zp_np, dtype=dtype_np) - np.testing.assert_equal(sim_dq_out.asnumpy(), dq_out) + np.testing.assert_allclose(sim_dq_out.asnumpy(), dq_out, rtol=1e-5) # Now get the perchannel quantize output and compare without recompiling. scale_np = np.array([0.5, 0.25]).astype("float32") @@ -128,7 +128,7 @@ def test_dynamic_channels(): ) # Run the simulated quantize without recompiling and confirm results match. sim_dq_out = vm.invoke("main", input_data=data_fp, scale=scale_np, zp=zp_np, dtype=dtype_np) - np.testing.assert_equal(sim_dq_out.asnumpy(), dq_out) + np.testing.assert_allclose(sim_dq_out.asnumpy(), dq_out, rtol=1e-5) def test_dynamic_dtype(): @@ -153,7 +153,7 @@ def test_dynamic_dtype(): dtype = relay.var("dtype", shape=[]) vm = build_simulated_dequantize(input_data, scale, zp, dtype) sim_dq_out = vm.invoke("main", input_data=data_fp, scale=scale_np, zp=zp_np, dtype=dtype_np) - np.testing.assert_equal(sim_dq_out.asnumpy(), dq_out) + np.testing.assert_allclose(sim_dq_out.asnumpy(), dq_out, rtol=1e-5) # Now test int8 to float32 compilation. data = np.random.uniform(low=0, high=255, size=[2, 5]).astype("int8") @@ -168,7 +168,7 @@ def test_dynamic_dtype(): # Run the simulated quantize without recompiling and confirm results match. 
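# (dtype is an ordinary runtime input here, declared as relay.var("dtype", shape=[])
# above, so changing the dtype code between invocations exercises the already
# compiled VM executable rather than triggering a rebuild)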
dtype_np = np.int32(SQNN_DTYPE_TO_CODE["int8"]) sim_dq_out = vm.invoke("main", input_data=data_fp, scale=scale_np, zp=zp_np, dtype=dtype_np) - np.testing.assert_equal(sim_dq_out.asnumpy(), dq_out) + np.testing.assert_allclose(sim_dq_out.asnumpy(), dq_out, rtol=1e-5) if __name__ == "__main__": diff --git a/tests/python/relay/test_op_qnn_simulated_quantize.py b/tests/python/relay/test_op_qnn_simulated_quantize.py index ee4ba209dcb8f..c0fa0648d8790 100644 --- a/tests/python/relay/test_op_qnn_simulated_quantize.py +++ b/tests/python/relay/test_op_qnn_simulated_quantize.py @@ -24,6 +24,13 @@ from tvm.topi.nn.qnn import SQNN_DTYPE_TO_CODE +def allclose_with_rounding(a, b): + # Find number of mismatches in inputs. + mismatch = a != b + # Allow some rounding errors due to GPU fp32 arithmetic. + assert np.sum(mismatch) <= 3 + + def quantize_test_driver(in_dtype, quant_args, axis, out_dtype, in_data): shape = in_data.shape input_data = relay.var("input_data", shape=shape, dtype=in_dtype) @@ -82,7 +89,7 @@ def verify_simulated_quantize_simple(dtype): dtype = relay.var("dtype", shape=[]) vm = build_simulated_quantize(input_data, scale, zp, dtype) sim_q_out = vm.invoke("main", input_data=data, scale=scale_np, zp=zp_np, dtype=dtype_np) - np.testing.assert_equal(sim_q_out.asnumpy(), q_out) + allclose_with_rounding(sim_q_out.asnumpy(), q_out) def test_simulated_quantize(): @@ -113,7 +120,7 @@ def test_dynamic_channels(): dtype = relay.var("dtype", shape=[]) vm = build_simulated_quantize(input_data, scale, zp, dtype, axis=0) sim_q_out = vm.invoke("main", input_data=data, scale=scale_np, zp=zp_np, dtype=dtype_np) - np.testing.assert_equal(sim_q_out.asnumpy(), q_out) + allclose_with_rounding(sim_q_out.asnumpy(), q_out) # Now get the perchannel quantize output and compare without recompiling. scale_np = np.array([0.5, 0.25]).astype("float32") @@ -130,7 +137,7 @@ def test_dynamic_channels(): ) # Run the simulated quantize without recompiling and confirm results match. sim_q_out = vm.invoke("main", input_data=data, scale=scale_np, zp=zp_np, dtype=dtype_np) - np.testing.assert_equal(sim_q_out.asnumpy(), q_out) + allclose_with_rounding(sim_q_out.asnumpy(), q_out) def test_dynamic_dtype(): @@ -155,7 +162,7 @@ def test_dynamic_dtype(): dtype = relay.var("dtype", shape=[]) vm = build_simulated_quantize(input_data, scale, zp, dtype) sim_q_out = vm.invoke("main", input_data=data, scale=scale_np, zp=zp_np, dtype=dtype_np) - np.testing.assert_equal(sim_q_out.asnumpy(), q_out) + allclose_with_rounding(sim_q_out.asnumpy(), q_out) # Now test float32 to int32 compilation. # Get the reference quantize output. @@ -169,7 +176,7 @@ def test_dynamic_dtype(): # Run the simulated quantize without recompiling and confirm results match. dtype_np = np.int32(SQNN_DTYPE_TO_CODE["int32"]) sim_q_out = vm.invoke("main", input_data=data, scale=scale_np, zp=zp_np, dtype=dtype_np) - np.testing.assert_equal(sim_q_out.asnumpy(), q_out) + allclose_with_rounding(sim_q_out.asnumpy(), q_out) if __name__ == "__main__": diff --git a/tests/python/topi/python/test_topi_qnn.py b/tests/python/topi/python/test_topi_qnn.py index a63f34fe08d09..386f77335f1a7 100644 --- a/tests/python/topi/python/test_topi_qnn.py +++ b/tests/python/topi/python/test_topi_qnn.py @@ -72,7 +72,9 @@ def check_device(device, ctx): func(a, d, s, z, q) # Check correctness against the true qnn output. 
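# Exact equality is too strict here: fp32 rounding on the GPU can flip a handful
# of quantized values, so the replacement below tolerates a small number of
# mismatched elements instead of requiring a bit-exact match.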
- tvm.testing.assert_allclose(q.asnumpy(), real_q_out.asnumpy().astype("float32")) + mismatch = q.asnumpy() != real_q_out.asnumpy().astype("float32") + # Allow some rounding errors due to GPU fp32 arithmetic. + assert np.sum(mismatch) <= 3 for target, ctx in tvm.testing.enabled_targets(): check_device(target, ctx) @@ -137,7 +139,9 @@ def check_device(device, ctx): func(a, d, s, z, dq) # Check correctness against the true qnn output. - tvm.testing.assert_allclose(dq.asnumpy(), real_dq_out.asnumpy().astype("float32")) + tvm.testing.assert_allclose( + dq.asnumpy(), real_dq_out.asnumpy().astype("float32"), rtol=1e-5 + ) for target, ctx in tvm.testing.enabled_targets(): check_device(target, ctx) From 4976bb2a2f90ecfdad7306257daa4e5948af74e0 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Wed, 17 Mar 2021 22:08:51 -0700 Subject: [PATCH 36/84] [Relay] Add TopPattern to nn.dropout (#7685) --- src/relay/op/nn/nn.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index 0ea71de367fac..b2404cc1954b2 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -590,6 +590,7 @@ The whole array is rescaled by ``1/(1-p)`` to keep the expected sum of the input .set_num_inputs(1) .add_argument("data", "Tensor", "Input to which dropout will be applied.") .set_support_level(1) + .set_attr("TOpPattern", kOpaque) .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout) .add_type_rel("Dropout", DropoutRel) .set_attr("TOpIsStateful", true); From 38aed59f9fdddcbc9ac98afb8aa11455c81fc9de Mon Sep 17 00:00:00 2001 From: CircleSpin <2keepconnected@gmail.com> Date: Thu, 18 Mar 2021 01:09:55 -0400 Subject: [PATCH 37/84] [TVMC] Allow optional arguments to be passed to importers (#7674) * add support for optional args for frontends tvmc * remove unnecessary comments * Add changes suggested by Matt W. via PR Co-authored-by: Jocelyn --- python/tvm/driver/tvmc/frontends.py | 27 +++++++++++----------- tests/python/driver/tvmc/test_frontends.py | 22 ++++++++++++------ 2 files changed, 29 insertions(+), 20 deletions(-) diff --git a/python/tvm/driver/tvmc/frontends.py b/python/tvm/driver/tvmc/frontends.py index 16e6c8eb966e4..0488223c782f7 100644 --- a/python/tvm/driver/tvmc/frontends.py +++ b/python/tvm/driver/tvmc/frontends.py @@ -54,7 +54,7 @@ def suffixes(): """File suffixes (extensions) used by this frontend""" @abstractmethod - def load(self, path, shape_dict=None): + def load(self, path, shape_dict=None, **kwargs): """Load a model from a given path. 
Parameters @@ -101,7 +101,7 @@ def name(): def suffixes(): return ["h5"] - def load(self, path, shape_dict=None): + def load(self, path, shape_dict=None, **kwargs): # pylint: disable=C0103 tf, keras = import_keras() @@ -130,7 +130,8 @@ def load(self, path, shape_dict=None): input_shapes = {name: x.shape for (name, x) in zip(model.input_names, inputs)} if shape_dict is not None: input_shapes.update(shape_dict) - return relay.frontend.from_keras(model, input_shapes, layout="NHWC") + kwargs.setdefault("layout", "NHWC") + return relay.frontend.from_keras(model, input_shapes, **kwargs) def is_sequential_p(self, model): _, keras = import_keras() @@ -158,14 +159,14 @@ def name(): def suffixes(): return ["onnx"] - def load(self, path, shape_dict=None): + def load(self, path, shape_dict=None, **kwargs): # pylint: disable=C0415 import onnx # pylint: disable=E1101 model = onnx.load(path) - return relay.frontend.from_onnx(model, shape=shape_dict) + return relay.frontend.from_onnx(model, shape=shape_dict, **kwargs) class TensorflowFrontend(Frontend): @@ -179,7 +180,7 @@ def name(): def suffixes(): return ["pb"] - def load(self, path, shape_dict=None): + def load(self, path, shape_dict=None, **kwargs): # pylint: disable=C0415 import tensorflow as tf import tvm.relay.testing.tf as tf_testing @@ -192,7 +193,7 @@ def load(self, path, shape_dict=None): graph_def = tf_testing.ProcessGraphDefParam(graph_def) logger.debug("parse TensorFlow model and convert into Relay computation graph") - return relay.frontend.from_tensorflow(graph_def, shape=shape_dict) + return relay.frontend.from_tensorflow(graph_def, shape=shape_dict, **kwargs) class TFLiteFrontend(Frontend): @@ -206,7 +207,7 @@ def name(): def suffixes(): return ["tflite"] - def load(self, path, shape_dict=None): + def load(self, path, shape_dict=None, **kwargs): # pylint: disable=C0415 import tflite.Model as model @@ -229,7 +230,7 @@ def load(self, path, shape_dict=None): raise TVMCException("input file not tflite version 3") logger.debug("parse TFLite model and convert into Relay computation graph") - mod, params = relay.frontend.from_tflite(tflite_model, shape_dict=shape_dict) + mod, params = relay.frontend.from_tflite(tflite_model, shape_dict=shape_dict, **kwargs) return mod, params @@ -245,7 +246,7 @@ def suffixes(): # Torch Script is a zip file, but can be named pth return ["pth", "zip"] - def load(self, path, shape_dict=None): + def load(self, path, shape_dict=None, **kwargs): # pylint: disable=C0415 import torch @@ -259,7 +260,7 @@ def load(self, path, shape_dict=None): input_shapes = list(shape_dict.items()) logger.debug("parse Torch model and convert into Relay computation graph") - return relay.frontend.from_pytorch(traced_model, input_shapes) + return relay.frontend.from_pytorch(traced_model, input_shapes, **kwargs) ALL_FRONTENDS = [ @@ -339,7 +340,7 @@ def guess_frontend(path): raise TVMCException("failed to infer the model format. Please specify --model-format") -def load_model(path, model_format=None, shape_dict=None): +def load_model(path, model_format=None, shape_dict=None, **kwargs): """Load a model from a supported framework and convert it into an equivalent relay representation. 
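# With **kwargs now forwarded end to end, importer-specific options can be
# passed straight through tvmc's loader. A usage sketch (file names are
# placeholders; the options shown are the ones exercised by the tests below,
# freeze_params for the ONNX importer and layout for Keras):
from tvm.driver import tvmc

mod, params = tvmc.frontends.load_model("model.onnx", freeze_params=True)
mod, params = tvmc.frontends.load_model("model.h5", layout="NCHW")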
@@ -367,6 +368,6 @@ def load_model(path, model_format=None, shape_dict=None): else: frontend = guess_frontend(path) - mod, params = frontend.load(path, shape_dict) + mod, params = frontend.load(path, shape_dict, **kwargs) return mod, params diff --git a/tests/python/driver/tvmc/test_frontends.py b/tests/python/driver/tvmc/test_frontends.py index b41f4c4dff2d6..5a63c5c479332 100644 --- a/tests/python/driver/tvmc/test_frontends.py +++ b/tests/python/driver/tvmc/test_frontends.py @@ -115,26 +115,34 @@ def test_load_model__tflite(tflite_mobilenet_v1_1_quant): assert "_param_1" in params.keys() -def test_load_model__keras(keras_resnet50): +@pytest.mark.parametrize("load_model_kwargs", [{}, {"layout": "NCHW"}]) +def test_load_model__keras(keras_resnet50, load_model_kwargs): # some CI environments wont offer TensorFlow/Keras, so skip in case it is not present pytest.importorskip("tensorflow") - mod, params = tvmc.frontends.load_model(keras_resnet50) + mod, params = tvmc.frontends.load_model(keras_resnet50, **load_model_kwargs) assert type(mod) is IRModule assert type(params) is dict ## check whether one known value is part of the params dict assert "_param_1" in params.keys() +def verify_load_model__onnx(model, **kwargs): + mod, params = tvmc.frontends.load_model(model, **kwargs) + assert type(mod) is IRModule + assert type(params) is dict + return mod, params + + def test_load_model__onnx(onnx_resnet50): # some CI environments wont offer onnx, so skip in case it is not present pytest.importorskip("onnx") - - mod, params = tvmc.frontends.load_model(onnx_resnet50) - assert type(mod) is IRModule - assert type(params) is dict - ## check whether one known value is part of the params dict + mod, params = verify_load_model__onnx(onnx_resnet50) + # check whether one known value is part of the params dict assert "resnetv24_batchnorm0_gamma" in params.keys() + mod, params = verify_load_model__onnx(onnx_resnet50, freeze_params=True) + # check that the parameter dict is empty, implying that they have been folded into constants + assert params == {} def test_load_model__pb(pb_mobilenet_v1_1_quant): From c976a07fa24efe91e8ac4f9d088a14442bb8d161 Mon Sep 17 00:00:00 2001 From: Tristan Konolige Date: Thu, 18 Mar 2021 06:08:08 -0700 Subject: [PATCH 38/84] [RUNTIME] Add libbacktrace for backtraces with line numbers (#7153) * [RUNTIME] Add libbacktrace for backtraces with line numbers Co-authored-by: Robert Kimball --- .gitmodules | 3 + 3rdparty/dmlc-core | 2 +- 3rdparty/libbacktrace | 1 + CMakeLists.txt | 62 ++- .../app/src/main/jni/Application.mk | 4 +- .../app/src/main/jni/tvm_runtime.h | 38 +- .../app/src/main/jni/Application.mk | 2 +- .../app/src/main/jni/tvm_runtime.h | 3 + .../app/src/main/jni/Application.mk | 2 +- .../app/src/main/jni/tvm_runtime.h | 36 +- apps/bundle_deploy/Makefile | 6 +- apps/dso_plugin_module/Makefile | 3 +- apps/extension/Makefile | 3 +- apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj | 6 + apps/ios_rpc/tvmrpc/TVMRuntime.h | 2 +- apps/ios_rpc/tvmrpc/TVMRuntime.mm | 16 +- apps/ios_rpc/tvmrpc/ViewController.mm | 4 +- cmake/config.cmake | 5 + cmake/modules/Libbacktrace.cmake | 45 ++ cmake/modules/VTA.cmake | 3 + golang/Makefile | 2 +- include/tvm/ir/attrs.h | 4 +- include/tvm/ir/diagnostic.h | 9 + include/tvm/ir/error.h | 26 +- include/tvm/ir/type_relation.h | 2 +- include/tvm/relay/analysis.h | 2 +- include/tvm/runtime/container.h | 1 + include/tvm/runtime/data_type.h | 2 +- include/tvm/runtime/logging.h | 438 ++++++++++++++++++ include/tvm/runtime/object.h | 2 +- 
include/tvm/runtime/packed_func.h | 6 +- include/tvm/runtime/vm/bytecode.h | 2 +- include/tvm/support/logging.h | 158 ------- include/tvm/support/with.h | 2 +- licenses/LICENSE.libbacktrace.txt | 29 ++ python/setup.py | 2 +- python/tvm/_ffi/base.py | 4 +- python/tvm/micro/build.py | 2 +- src/auto_scheduler/compute_dag.cc | 2 +- src/auto_scheduler/feature.cc | 2 +- .../search_policy/sketch_policy_rules.cc | 4 +- src/auto_scheduler/transform_step.cc | 2 +- src/ir/error.cc | 3 +- src/parser/parser.cc | 8 +- src/parser/span_check.h | 2 +- src/relay/analysis/annotated_region_set.cc | 5 +- src/relay/analysis/kind_check.cc | 2 +- src/relay/analysis/type_solver.cc | 4 +- src/relay/analysis/well_formed.cc | 2 +- src/relay/backend/vm/compiler.cc | 2 +- src/relay/backend/vm/compiler.h | 2 +- src/relay/backend/vm/inline_primitives.cc | 2 +- src/relay/backend/vm/lambda_lift.cc | 2 +- src/relay/backend/vm/removed_unused_funcs.cc | 2 +- src/relay/op/nn/convolution.h | 2 +- src/relay/op/tensor/transform.cc | 12 +- src/relay/op/tensor/transform.h | 4 +- src/relay/op/type_relations.cc | 2 +- src/relay/qnn/op/concatenate.cc | 14 +- src/relay/transforms/fold_explicit_padding.cc | 2 +- src/relay/transforms/inline.cc | 2 +- src/relay/transforms/memory_alloc.cc | 2 +- src/relay/transforms/partial_eval.cc | 4 +- src/relay/transforms/simplify_expr.cc | 2 +- src/relay/transforms/to_a_normal_form.cc | 2 +- .../transforms/to_basic_block_normal_form.cc | 2 +- src/relay/transforms/type_infer.cc | 2 +- src/runtime/c_runtime_api.cc | 6 +- src/runtime/contrib/cblas/cblas.cc | 2 +- src/runtime/contrib/cblas/mkl.cc | 2 +- src/runtime/contrib/cblas/mkldnn.cc | 2 +- src/runtime/contrib/cublas/cublas.cc | 2 +- src/runtime/contrib/cublas/cublas_utils.h | 2 +- src/runtime/contrib/cudnn/cudnn_utils.h | 2 +- src/runtime/contrib/miopen/miopen_utils.h | 2 +- src/runtime/contrib/mps/mps_utils.h | 2 +- src/runtime/contrib/nnpack/convolution.cc | 2 +- src/runtime/contrib/nnpack/fully_connected.cc | 2 +- src/runtime/contrib/nnpack/nnpack_utils.h | 2 +- .../contrib/random/mt_random_engine.cc | 2 +- src/runtime/contrib/random/random.cc | 2 +- src/runtime/contrib/rocblas/rocblas.cc | 2 +- .../contrib/tensorrt/tensorrt_logger.h | 2 +- .../contrib/vitis_ai/vitis_ai_runtime.cc | 1 + src/runtime/cpu_device_api.cc | 2 +- src/runtime/crt/Makefile | 4 +- src/runtime/crt/graph_runtime/load_json.c | 2 +- src/runtime/file_utils.cc | 2 +- src/runtime/graph/graph_runtime.cc | 2 +- src/runtime/hexagon/hexagon_device_api.cc | 2 +- src/runtime/hexagon/hexagon_module.cc | 2 +- src/runtime/hexagon/hexagon_module.h | 2 +- src/runtime/hexagon/sim/hexagon_device_sim.cc | 2 +- .../hexagon/target/hexagon_dsprpcapi.cc | 2 +- .../hexagon/target/hexagon_dsprpcapi.h | 2 +- src/runtime/hexagon/target/hexagon_stubapi.cc | 2 +- src/runtime/hexagon/target/hexagon_stubapi.h | 2 +- src/runtime/logging.cc | 151 ++++++ src/runtime/metal/metal_common.h | 2 +- src/runtime/micro/micro_session.cc | 2 +- src/runtime/minrpc/minrpc_server.h | 2 +- src/runtime/ndarray.cc | 2 +- src/runtime/object.cc | 2 +- src/runtime/opencl/opencl_common.h | 2 +- src/runtime/registry.cc | 2 +- src/runtime/rocm/rocm_device_api.cc | 2 +- src/runtime/rpc/rpc_device_api.cc | 4 +- src/runtime/rpc/rpc_endpoint.cc | 10 +- src/runtime/rpc/rpc_module.cc | 4 +- src/runtime/rpc/rpc_session.cc | 8 +- src/runtime/runtime_base.h | 6 +- src/runtime/thread_pool.cc | 2 +- src/runtime/threading_backend.cc | 2 +- src/runtime/vm/bytecode.cc | 2 +- src/runtime/vm/vm.cc | 2 +- src/runtime/vulkan/vulkan_common.h | 2 +- 
src/runtime/vulkan/vulkan_shader.h | 2 +- src/support/base64.h | 2 +- src/support/parallel_for.cc | 2 +- src/support/pipe.h | 2 +- src/support/socket.h | 2 +- src/target/llvm/llvm_common.cc | 2 +- src/target/target.cc | 81 ++-- tests/cpp/ir_functor_test.cc | 2 +- tests/cpp/parallel_for_test.cc | 2 +- tests/lint/check_file_type.py | 2 + tests/python/relay/test_ir_parser.py | 8 +- tests/python/relay/test_ir_text_printer.py | 5 - tests/scripts/task_build.sh | 2 +- tutorials/auto_scheduler/tune_network_cuda.py | 2 +- tutorials/auto_scheduler/tune_network_mali.py | 2 +- tutorials/auto_scheduler/tune_network_x86.py | 2 +- web/emcc/tvmjs_support.cc | 8 +- web/emcc/wasm_runtime.cc | 10 +- web/emcc/webgpu_runtime.cc | 25 +- 135 files changed, 1055 insertions(+), 419 deletions(-) create mode 160000 3rdparty/libbacktrace create mode 100644 cmake/modules/Libbacktrace.cmake create mode 100644 include/tvm/runtime/logging.h delete mode 100644 include/tvm/support/logging.h create mode 100644 licenses/LICENSE.libbacktrace.txt mode change 100755 => 100644 src/auto_scheduler/transform_step.cc create mode 100644 src/runtime/logging.cc diff --git a/.gitmodules b/.gitmodules index a1367c97b2f5a..6ef740e331532 100644 --- a/.gitmodules +++ b/.gitmodules @@ -10,3 +10,6 @@ [submodule "3rdparty/vta-hw"] path = 3rdparty/vta-hw url = https://github.com/apache/incubator-tvm-vta +[submodule "3rdparty/libbacktrace"] + path = 3rdparty/libbacktrace + url = https://github.com/tlc-pack/libbacktrace.git diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core index 6c401e242c59a..21cc7de0dc9fd 160000 --- a/3rdparty/dmlc-core +++ b/3rdparty/dmlc-core @@ -1 +1 @@ -Subproject commit 6c401e242c59a1f4c913918246591bb13fd714e7 +Subproject commit 21cc7de0dc9fd6acb796e1be6181fa8e6b6c8f41 diff --git a/3rdparty/libbacktrace b/3rdparty/libbacktrace new file mode 160000 index 0000000000000..08f7c7e69f8ea --- /dev/null +++ b/3rdparty/libbacktrace @@ -0,0 +1 @@ +Subproject commit 08f7c7e69f8ea61a0c4151359bc8023be8e9217b diff --git a/CMakeLists.txt b/CMakeLists.txt index 16968ce41f709..1aa3e68ffd14b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,6 +48,11 @@ tvm_option(USE_TF_TVMDSOOP "Build with TensorFlow TVMDSOOp" OFF) tvm_option(USE_FALLBACK_STL_MAP "Use TVM's POD compatible Map" OFF) tvm_option(USE_ETHOSN "Build with Arm Ethos-N" OFF) tvm_option(INDEX_DEFAULT_I64 "Defaults the index datatype to int64" ON) +set(_LIBBACKTRACE_DEFAULT OFF) +if(CMAKE_SYSTEM_NAME MATCHES "Darwin" OR CMAKE_SYSTEM_NAME MATCHES "Linux") + set(_LIBBACKTRACE_DEFAULT ON) +endif() +tvm_option(USE_LIBBACKTRACE "Build libbacktrace to supply linenumbers on stack traces" ${_LIBBACKTRACE_DEFAULT}) # 3rdparty libraries tvm_option(DLPACK_PATH "Path to DLPACK" "3rdparty/dlpack/include") @@ -138,6 +143,8 @@ if(MSVC) add_compile_options(/wd4146) # 'inline': used more than once add_compile_options(/wd4141) + # unknown pragma + add_compile_options(/wd4068) else(MSVC) set(WARNING_FLAG -Wall) if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug") @@ -389,6 +396,26 @@ set_property(TARGET tvm APPEND PROPERTY LINK_OPTIONS "${TVM_VISIBILITY_FLAG}") add_library(tvm_runtime SHARED $) set_property(TARGET tvm_runtime APPEND PROPERTY LINK_OPTIONS "${TVM_VISIBILITY_FLAG}") +target_compile_definitions(tvm_objs PUBLIC DMLC_USE_LOGGING_LIBRARY=) +target_compile_definitions(tvm_runtime_objs PUBLIC DMLC_USE_LOGGING_LIBRARY=) +target_compile_definitions(tvm PUBLIC DMLC_USE_LOGGING_LIBRARY=) +target_compile_definitions(tvm_runtime PUBLIC DMLC_USE_LOGGING_LIBRARY=) +if(USE_LIBBACKTRACE) + message(STATUS 
"Building with libbacktrace...") + include(cmake/modules/Libbacktrace.cmake) + target_link_libraries(tvm PRIVATE libbacktrace) + target_link_libraries(tvm_runtime PRIVATE libbacktrace) + add_dependencies(tvm_runtime_objs libbacktrace) + # pre 3.12 versions of cmake cannot propagate include directories from imported targets so we set them manually + target_include_directories(tvm PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/libbacktrace/include") + target_include_directories(tvm_objs PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/libbacktrace/include") + target_include_directories(tvm_runtime PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/libbacktrace/include") + target_include_directories(tvm_runtime_objs PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/libbacktrace/include") +else() + target_compile_definitions(tvm_objs PRIVATE TVM_BACKTRACE_DISABLED) + target_compile_definitions(tvm_runtime_objs PRIVATE TVM_BACKTRACE_DISABLED) +endif() + if(USE_MICRO) # NOTE: cmake doesn't track dependencies at the file level across subdirectories. For the # Unix Makefiles generator, need to add these explicit target-level dependency) @@ -403,9 +430,9 @@ endif() if(USE_RELAY_DEBUG) message(STATUS "Building Relay in debug mode...") target_compile_definitions(tvm_objs PRIVATE "USE_RELAY_DEBUG") - target_compile_definitions(tvm_objs PRIVATE "DMLC_LOG_DEBUG") + target_compile_definitions(tvm_objs PRIVATE "TVM_LOG_DEBUG") target_compile_definitions(tvm_runtime_objs PRIVATE "USE_RELAY_DEBUG") - target_compile_definitions(tvm_runtime_objs PRIVATE "DMLC_LOG_DEBUG") + target_compile_definitions(tvm_runtime_objs PRIVATE "TVM_LOG_DEBUG") else() target_compile_definitions(tvm_objs PRIVATE "NDEBUG") target_compile_definitions(tvm_runtime_objs PRIVATE "NDEBUG") @@ -476,6 +503,7 @@ if (HIDE_PRIVATE_SYMBOLS AND NOT ${CMAKE_SYSTEM_NAME} MATCHES "Darwin") # once minimum CMake version is bumped up to 3.13 or above. 
target_link_libraries(tvm PRIVATE ${HIDE_SYMBOLS_LINKER_FLAGS}) target_link_libraries(tvm_runtime PRIVATE ${HIDE_SYMBOLS_LINKER_FLAGS}) + target_compile_definitions(tvm_allvisible PUBLIC DMLC_USE_LOGGING_LIBRARY=) endif() # Tests @@ -544,3 +572,33 @@ if(MSVC) target_compile_definitions(tvm_objs PRIVATE -DTVM_EXPORTS) target_compile_definitions(tvm_runtime_objs PRIVATE -DTVM_EXPORTS) endif() + +set(TVM_IS_DEBUG_BUILD OFF) +if(CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo" OR CMAKE_CXX_FLAGS MATCHES "-g") + set(TVM_IS_DEBUG_BUILD ON) +endif() + +# Change relative paths in backtrace to absolute ones +if(TVM_IS_DEBUG_BUILD) + set(FILE_PREFIX_MAP_FLAG "-ffile-prefix-map=..=${CMAKE_CURRENT_SOURCE_DIR}") + target_compile_options(tvm PRIVATE "${FILE_PREFIX_MAP_FLAG}") + CHECK_CXX_COMPILER_FLAG("${FILE_PREFIX_MAP_FLAG}" FILE_PREFIX_MAP_SUPPORTED) + if(FILE_PREFIX_MAP_SUPPORTED) + target_compile_options(tvm PRIVATE $<$:${FILE_PREFIX_MAP_FLAG}>) + target_compile_options(tvm_objs PRIVATE $<$:${FILE_PREFIX_MAP_FLAG}>) + target_compile_options(tvm_runtime PRIVATE $<$:${FILE_PREFIX_MAP_FLAG}>) + target_compile_options(tvm_runtime_objs PRIVATE $<$:${FILE_PREFIX_MAP_FLAG}>) + endif() +endif() + +# Run dsymutil to generate debugging symbols for backtraces +if(APPLE AND TVM_IS_DEBUG_BUILD) + find_program(DSYMUTIL dsymutil) + mark_as_advanced(DSYMUTIL) + add_custom_command(TARGET tvm + POST_BUILD + COMMAND ${DSYMUTIL} ARGS $ + COMMENT "Running dsymutil" + VERBATIM + ) +endif() diff --git a/apps/android_camera/app/src/main/jni/Application.mk b/apps/android_camera/app/src/main/jni/Application.mk index 63a79458ef946..5c8774889685e 100644 --- a/apps/android_camera/app/src/main/jni/Application.mk +++ b/apps/android_camera/app/src/main/jni/Application.mk @@ -31,7 +31,7 @@ include $(config) APP_ABI ?= all APP_STL := c++_shared -APP_CPPFLAGS += -DDMLC_LOG_STACK_TRACE=0 -DTVM4J_ANDROID=1 -std=c++14 -Oz -frtti +APP_CPPFLAGS += -DTVM4J_ANDROID=1 -std=c++14 -Oz -frtti ifeq ($(USE_OPENCL), 1) APP_CPPFLAGS += -DTVM_OPENCL_RUNTIME=1 endif @@ -43,4 +43,4 @@ endif ifeq ($(USE_SORT), 1) APP_CPPFLAGS += -DUSE_SORT=1 -endif \ No newline at end of file +endif diff --git a/apps/android_camera/app/src/main/jni/tvm_runtime.h b/apps/android_camera/app/src/main/jni/tvm_runtime.h index 5f3db04274a1a..47a3a3de6bba3 100644 --- a/apps/android_camera/app/src/main/jni/tvm_runtime.h +++ b/apps/android_camera/app/src/main/jni/tvm_runtime.h @@ -25,17 +25,13 @@ #include -/* Enable custom logging - this will cause TVM to pass every log message - * through CustomLogMessage instead of LogMessage. By enabling this, we must - * implement dmlc::CustomLogMessage::Log. We use this to pass TVM log - * messages to Android logcat. +#define DMLC_USE_LOGGING_LIBRARY +#define TVM_BACKTRACE_DISABLED 1 +/* Enable custom logging - this will cause TVM to use a custom implementation + * of tvm::runtime::detail::LogMessage. We use this to pass TVM log messages to + * Android logcat. */ -#define DMLC_LOG_CUSTOMIZE 1 - -/* Ensure that fatal errors are passed to the logger before throwing - * in LogMessageFatal - */ -#define DMLC_LOG_BEFORE_THROW 1 +#define TVM_LOG_CUSTOMIZE 1 #include "../src/runtime/c_runtime_api.cc" #include "../src/runtime/cpu_device_api.cc" @@ -72,8 +68,20 @@ #include -void dmlc::CustomLogMessage::Log(const std::string& msg) { - // This is called for every message logged by TVM. - // We pass the message to logcat. 
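// With TVM_LOG_CUSTOMIZE defined, the runtime no longer formats log output
// itself; the embedder must supply the two hooks declared in
// tvm/runtime/logging.h (tvm::runtime::detail::LogFatalImpl and
// LogMessageImpl), which the replacement code below implements on top of
// Android logcat.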
- __android_log_write(ANDROID_LOG_DEBUG, "TVM_RUNTIME", msg.c_str()); -} \ No newline at end of file +namespace tvm { +namespace runtime { +namespace detail { +// Override logging mechanism +void LogFatalImpl(const std::string& file, int lineno, const std::string& message) { + std::string m = file + ":" + std::to_string(lineno) + ": " + message; + __android_log_write(ANDROID_LOG_DEBUG, "TVM_RUNTIME", m.c_str()); + throw InternalError(file, lineno, message); +} +void LogMessageImpl(const std::string& file, int lineno, const std::string& message) { + std::string m = file + ":" + std::to_string(lineno) + ": " + message; + __android_log_write(ANDROID_LOG_DEBUG, "TVM_RUNTIME", m.c_str()); +} + +} // namespace detail +} // namespace runtime +} // namespace tvm diff --git a/apps/android_deploy/app/src/main/jni/Application.mk b/apps/android_deploy/app/src/main/jni/Application.mk index a50a40bf5cd19..42c4f232a5536 100644 --- a/apps/android_deploy/app/src/main/jni/Application.mk +++ b/apps/android_deploy/app/src/main/jni/Application.mk @@ -27,7 +27,7 @@ include $(config) APP_STL := c++_static -APP_CPPFLAGS += -DDMLC_LOG_STACK_TRACE=0 -DTVM4J_ANDROID=1 -std=c++14 -Oz -frtti +APP_CPPFLAGS += -DTVM4J_ANDROID=1 -std=c++14 -Oz -frtti ifeq ($(USE_OPENCL), 1) APP_CPPFLAGS += -DTVM_OPENCL_RUNTIME=1 endif diff --git a/apps/android_deploy/app/src/main/jni/tvm_runtime.h b/apps/android_deploy/app/src/main/jni/tvm_runtime.h index 362d278c38c4d..4412e9c62e9d5 100644 --- a/apps/android_deploy/app/src/main/jni/tvm_runtime.h +++ b/apps/android_deploy/app/src/main/jni/tvm_runtime.h @@ -25,6 +25,9 @@ #include +#define DMLC_USE_LOGGING_LIBRARY +#define TVM_BACKTRACE_DISABLED 1 + #include "../src/runtime/c_runtime_api.cc" #include "../src/runtime/cpu_device_api.cc" #include "../src/runtime/dso_library.cc" diff --git a/apps/android_rpc/app/src/main/jni/Application.mk b/apps/android_rpc/app/src/main/jni/Application.mk index 5f885f1c6f144..088eeed750b89 100644 --- a/apps/android_rpc/app/src/main/jni/Application.mk +++ b/apps/android_rpc/app/src/main/jni/Application.mk @@ -31,7 +31,7 @@ include $(config) APP_ABI ?= armeabi-v7a arm64-v8a x86 x86_64 mips APP_STL := c++_shared -APP_CPPFLAGS += -DDMLC_LOG_STACK_TRACE=0 -DTVM4J_ANDROID=1 -std=c++14 -Oz -frtti +APP_CPPFLAGS += -DTVM4J_ANDROID=1 -std=c++14 -Oz -frtti ifeq ($(USE_OPENCL), 1) APP_CPPFLAGS += -DTVM_OPENCL_RUNTIME=1 endif diff --git a/apps/android_rpc/app/src/main/jni/tvm_runtime.h b/apps/android_rpc/app/src/main/jni/tvm_runtime.h index fb5993066448f..40e6279fb386d 100644 --- a/apps/android_rpc/app/src/main/jni/tvm_runtime.h +++ b/apps/android_rpc/app/src/main/jni/tvm_runtime.h @@ -25,17 +25,13 @@ #include -/* Enable custom logging - this will cause TVM to pass every log message - * through CustomLogMessage instead of LogMessage. By enabling this, we must - * implement dmlc::CustomLogMessage::Log. We use this to pass TVM log - * messages to Android logcat. +#define DMLC_USE_LOGGING_LIBRARY +#define TVM_BACKTRACE_DISABLED 1 +/* Enable custom logging - this will cause TVM to use a custom implementation + * of tvm::runtime::detail::LogMessage. We use this to pass TVM log messages to + * Android logcat. 
*/ -#define DMLC_LOG_CUSTOMIZE 1 - -/* Ensure that fatal errors are passed to the logger before throwing - * in LogMessageFatal - */ -#define DMLC_LOG_BEFORE_THROW 1 +#define TVM_LOG_CUSTOMIZE 1 #include "../src/runtime/c_runtime_api.cc" #include "../src/runtime/cpu_device_api.cc" @@ -81,8 +77,20 @@ #include -void dmlc::CustomLogMessage::Log(const std::string& msg) { - // This is called for every message logged by TVM. - // We pass the message to logcat. - __android_log_write(ANDROID_LOG_DEBUG, "TVM_RUNTIME", msg.c_str()); +namespace tvm { +namespace runtime { +namespace detail { +// Override logging mechanism +void LogFatalImpl(const std::string& file, int lineno, const std::string& message) { + std::string m = file + ":" + std::to_string(lineno) + ": " + message; + __android_log_write(ANDROID_LOG_DEBUG, "TVM_RUNTIME", m.c_str()); + throw InternalError(file, lineno, message); } +void LogMessageImpl(const std::string& file, int lineno, const std::string& message) { + std::string m = file + ":" + std::to_string(lineno) + ": " + message; + __android_log_write(ANDROID_LOG_DEBUG, "TVM_RUNTIME", m.c_str()); +} + +} // namespace detail +} // namespace runtime +} // namespace tvm diff --git a/apps/bundle_deploy/Makefile b/apps/bundle_deploy/Makefile index 38d9d3456d552..8e23a92afa935 100644 --- a/apps/bundle_deploy/Makefile +++ b/apps/bundle_deploy/Makefile @@ -32,12 +32,14 @@ PKG_CXXFLAGS = ${PKG_COMPILE_OPTS} -std=c++14 \ -I${TVM_ROOT}/include \ -I${DMLC_CORE}/include \ -I${TVM_ROOT}/3rdparty/dlpack/include \ - -Icrt_config + -Icrt_config \ + -DDMLC_USE_LOGGING_LIBRARY=\ PKG_CFLAGS = ${PKG_COMPILE_OPTS} \ -I${TVM_ROOT}/include \ -I${DMLC_CORE}/include \ -I${TVM_ROOT}/3rdparty/dlpack/include \ - -Icrt_config + -Icrt_config \ + -DDMLC_USE_LOGGING_LIBRARY=\ PKG_LDFLAGS = -pthread -lm diff --git a/apps/dso_plugin_module/Makefile b/apps/dso_plugin_module/Makefile index c2ce3306870a6..438d9db223a81 100644 --- a/apps/dso_plugin_module/Makefile +++ b/apps/dso_plugin_module/Makefile @@ -19,7 +19,8 @@ TVM_ROOT=$(shell cd ../..; pwd) PKG_CFLAGS = -std=c++14 -O2 -fPIC\ -I${TVM_ROOT}/include\ -I${TVM_ROOT}/3rdparty/dmlc-core/include\ - -I${TVM_ROOT}/3rdparty/dlpack/include + -I${TVM_ROOT}/3rdparty/dlpack/include\ + -DDMLC_USE_LOGGING_LIBRARY=\ PKG_LDFLAGS =-L${TVM_ROOT}/build UNAME_S := $(shell uname -s) diff --git a/apps/extension/Makefile b/apps/extension/Makefile index 91d914aba63b7..6eba941f7c98c 100644 --- a/apps/extension/Makefile +++ b/apps/extension/Makefile @@ -20,7 +20,8 @@ TVM_ROOT=$(shell cd ../..; pwd) PKG_CFLAGS = -std=c++14 -O2 -fPIC\ -I${TVM_ROOT}/include\ -I${TVM_ROOT}/3rdparty/dmlc-core/include\ - -I${TVM_ROOT}/3rdparty/dlpack/include + -I${TVM_ROOT}/3rdparty/dlpack/include\ + -DDMLC_USE_LOGGING_LIBRARY=\ PKG_LDFLAGS =-L${TVM_ROOT}/build diff --git a/apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj b/apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj index b33c892cf0028..28079e710a386 100644 --- a/apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj +++ b/apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj @@ -349,6 +349,8 @@ GCC_PREPROCESSOR_DEFINITIONS = ( "DEBUG=1", "$(inherited)", + "DMLC_USE_LOGGING_LIBRARY=", + "TVM_BACKTRACE_DISABLED=1", ); GCC_WARN_64_TO_32_BIT_CONVERSION = YES; GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; @@ -393,6 +395,10 @@ ENABLE_STRICT_OBJC_MSGSEND = YES; GCC_C_LANGUAGE_STANDARD = gnu99; GCC_NO_COMMON_BLOCKS = YES; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DMLC_USE_LOGGING_LIBRARY=", + "TVM_BACKTRACE_DISABLED=1", + ); GCC_WARN_64_TO_32_BIT_CONVERSION = YES; GCC_WARN_ABOUT_RETURN_TYPE = 
YES_ERROR; GCC_WARN_UNDECLARED_SELECTOR = YES; diff --git a/apps/ios_rpc/tvmrpc/TVMRuntime.h b/apps/ios_rpc/tvmrpc/TVMRuntime.h index f6a6dc64c53a7..0d172fc3eaa11 100644 --- a/apps/ios_rpc/tvmrpc/TVMRuntime.h +++ b/apps/ios_rpc/tvmrpc/TVMRuntime.h @@ -22,7 +22,7 @@ */ #import // Customize logging mechanism, redirect to NSLOG -#define DMLC_LOG_CUSTOMIZE 1 +#define TVM_LOG_CUSTOMIZE 1 #define TVM_METAL_RUNTIME 1 #include diff --git a/apps/ios_rpc/tvmrpc/TVMRuntime.mm b/apps/ios_rpc/tvmrpc/TVMRuntime.mm index fbe4850e1b571..87cb6f9b4c691 100644 --- a/apps/ios_rpc/tvmrpc/TVMRuntime.mm +++ b/apps/ios_rpc/tvmrpc/TVMRuntime.mm @@ -53,9 +53,19 @@ // CoreML #include "../../../src/runtime/contrib/coreml/coreml_runtime.mm" -namespace dmlc { +namespace tvm { +namespace runtime { +namespace detail { // Override logging mechanism -void CustomLogMessage::Log(const std::string& msg) { NSLog(@"%s", msg.c_str()); } +void LogFatalImpl(const std::string& file, int lineno, const std::string& message) { + throw tvm::runtime::InternalError(file, lineno, message); +} + +void LogMessageImpl(const std::string& file, int lineno, const std::string& message) { + NSLog(@"%s:%d: %s", file.c_str(), lineno, message.c_str()); +} +} +} } // namespace dmlc namespace tvm { @@ -69,7 +79,7 @@ size_t Send(const void* data, size_t size) final { ssize_t nbytes = [stream_ write:reinterpret_cast(data) maxLength:size]; if (nbytes < 0) { NSLog(@"%@", [stream_ streamError].localizedDescription); - throw dmlc::Error("Stream error"); + throw tvm::Error("Stream error"); } return nbytes; } diff --git a/apps/ios_rpc/tvmrpc/ViewController.mm b/apps/ios_rpc/tvmrpc/ViewController.mm index 910c650aedc1a..879ed2334a843 100644 --- a/apps/ios_rpc/tvmrpc/ViewController.mm +++ b/apps/ios_rpc/tvmrpc/ViewController.mm @@ -100,7 +100,7 @@ - (void)onReadAvailable { if (flag == 2) { [self onShutdownReceived]; } - } catch (const dmlc::Error& e) { + } catch (const tvm::Error& e) { [self close]; } } @@ -123,7 +123,7 @@ - (void)onWriteAvailable { if (flag == 2) { [self onShutdownReceived]; } - } catch (const dmlc::Error& e) { + } catch (const tvm::Error& e) { [self close]; } } diff --git a/cmake/config.cmake b/cmake/config.cmake index 60c718c97bc14..8c090dce741eb 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -275,3 +275,8 @@ set(USE_TARGET_ONNX OFF) # Whether enable BNNS runtime set(USE_BNNS OFF) + +# Whether to use libbacktrace +# Libbacktrace provides line and column information on stack traces from errors. It is only +# supported on linux and macOS. +# set(USE_LIBBACKTRACE OFF) diff --git a/cmake/modules/Libbacktrace.cmake b/cmake/modules/Libbacktrace.cmake new file mode 100644 index 0000000000000..7428553588093 --- /dev/null +++ b/cmake/modules/Libbacktrace.cmake @@ -0,0 +1,45 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +include(ExternalProject) + +ExternalProject_Add(project_libbacktrace + PREFIX libbacktrace + SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/../../3rdparty/libbacktrace + BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/libbacktrace + CONFIGURE_COMMAND "${CMAKE_CURRENT_LIST_DIR}/../../3rdparty/libbacktrace/configure" + "--prefix=${CMAKE_CURRENT_BINARY_DIR}/libbacktrace" --with-pic + INSTALL_DIR "${CMAKE_CURRENT_BINARY_DIR}/libbacktrace" + BUILD_COMMAND make + INSTALL_COMMAND make install + BUILD_BYPRODUCTS "${CMAKE_CURRENT_BINARY_DIR}/libbacktrace/lib/libbacktrace.a" + "${CMAKE_CURRENT_BINARY_DIR}/libbacktrace/include/backtrace.h" + ) + +# Custom step to rebuild libbacktrace if any of the source files change +file(GLOB LIBBACKTRACE_SRCS "${CMAKE_CURRENT_LIST_DIR}/../../3rdparty/libbacktrace/*.c") +ExternalProject_Add_Step(project_libbacktrace checkout + DEPENDERS configure + DEPENDEES download + DEPENDS ${LIBBACKTRACE_SRCS} +) + +add_library(libbacktrace STATIC IMPORTED) +add_dependencies(libbacktrace project_libbacktrace) +set_property(TARGET libbacktrace + PROPERTY IMPORTED_LOCATION ${CMAKE_CURRENT_BINARY_DIR}/libbacktrace/lib/libbacktrace.a) +# create include directory so cmake doesn't complain +file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/libbacktrace/include) diff --git a/cmake/modules/VTA.cmake b/cmake/modules/VTA.cmake index 115216680fff0..58b58d231d830 100644 --- a/cmake/modules/VTA.cmake +++ b/cmake/modules/VTA.cmake @@ -60,6 +60,7 @@ elseif(PYTHON) # Target lib: vta_fsim add_library(vta_fsim SHARED ${FSIM_RUNTIME_SRCS}) target_include_directories(vta_fsim SYSTEM PUBLIC ${VTA_HW_PATH}/include) + target_compile_definitions(vta_fsim PUBLIC DMLC_USE_LOGGING_LIBRARY=) foreach(__def ${VTA_DEFINITIONS}) string(SUBSTRING ${__def} 3 -1 __strip_def) target_compile_definitions(vta_fsim PUBLIC ${__strip_def}) @@ -81,6 +82,7 @@ elseif(PYTHON) # Target lib: vta_tsim add_library(vta_tsim SHARED ${TSIM_RUNTIME_SRCS}) target_include_directories(vta_tsim SYSTEM PUBLIC ${VTA_HW_PATH}/include) + target_compile_definitions(vta_tsim PUBLIC DMLC_USE_LOGGING_LIBRARY=) foreach(__def ${VTA_DEFINITIONS}) string(SUBSTRING ${__def} 3 -1 __strip_def) target_compile_definitions(vta_tsim PUBLIC ${__strip_def}) @@ -107,6 +109,7 @@ elseif(PYTHON) add_library(vta SHARED ${FPGA_RUNTIME_SRCS}) target_include_directories(vta PUBLIC vta/runtime) target_include_directories(vta PUBLIC ${VTA_HW_PATH}/include) + target_compile_definitions(vta PUBLIC DMLC_USE_LOGGING_LIBRARY=) foreach(__def ${VTA_DEFINITIONS}) string(SUBSTRING ${__def} 3 -1 __strip_def) target_compile_definitions(vta PUBLIC ${__strip_def}) diff --git a/golang/Makefile b/golang/Makefile index 6fd77996e119d..137e2a488e29d 100644 --- a/golang/Makefile +++ b/golang/Makefile @@ -25,7 +25,7 @@ NATIVE_SRC = tvm_runtime_pack.cc GOPATH=$(CURDIR)/gopath GOPATHDIR=${GOPATH}/src/${TARGET}/ CGO_CPPFLAGS="-I. -I${TVM_BASE}/ -I${TVM_BASE}/3rdparty/dmlc-core/include -I${TVM_BASE}/include -I${TVM_BASE}/3rdparty/dlpack/include/" -CGO_CXXFLAGS="-std=c++14" +CGO_CXXFLAGS="-std=c++14 -DDMLC_USE_LOGGING_LIBRARY=\" CGO_CFLAGS="-I${TVM_BASE}" CGO_LDFLAGS="-ldl -lm" diff --git a/include/tvm/ir/attrs.h b/include/tvm/ir/attrs.h index f05ab04c3305b..da7bc12619bd4 100644 --- a/include/tvm/ir/attrs.h +++ b/include/tvm/ir/attrs.h @@ -92,12 +92,12 @@ inline DataType NullValue() { } /*! \brief Error thrown during attribute checking. 
*/ -struct AttrError : public dmlc::Error { +struct AttrError : public Error { /*! * \brief constructor * \param msg error message */ - explicit AttrError(std::string msg) : dmlc::Error("AttributeError:" + msg) {} + explicit AttrError(std::string msg) : Error("AttributeError:" + msg) {} }; /*! diff --git a/include/tvm/ir/diagnostic.h b/include/tvm/ir/diagnostic.h index 2053a295a3b8b..41130a5be0aa1 100644 --- a/include/tvm/ir/diagnostic.h +++ b/include/tvm/ir/diagnostic.h @@ -37,6 +37,15 @@ namespace tvm { using tvm::parser::SourceMap; using tvm::runtime::TypedPackedFunc; +/*! \brief The diagnostic level, controls the printing of the message. */ +enum class DiagnosticLevel : int { + kBug = 10, + kError = 20, + kWarning = 30, + kNote = 40, + kHelp = 50, +}; + class DiagnosticBuilder; /*! \brief A compiler diagnostic. */ diff --git a/include/tvm/ir/error.h b/include/tvm/ir/error.h index ac7b96a3bd597..6ff61781ac44f 100644 --- a/include/tvm/ir/error.h +++ b/include/tvm/ir/error.h @@ -36,11 +36,11 @@ namespace tvm { /*! * \brief A wrapper around std::stringstream to build error. * - * Can be consumed by Error to construct an error. + * Can be consumed by CompileError to construct an error. * * \code * - * void ReportError(const Error& err); + * void ReportError(const CompileError& err); * * void Test(int number) { * // Use error reporter to construct an error. @@ -59,13 +59,13 @@ struct ErrorBuilder { private: std::stringstream stream_; - friend class Error; + friend class CompileError; }; /*! * \brief Custom Error class to be thrown during compilation. */ -class Error : public dmlc::Error { +class CompileError : public Error { public: /*! \brief Location of the error */ Span span; @@ -73,20 +73,20 @@ class Error : public dmlc::Error { * \brief construct error from message. * \param msg The message */ - explicit Error(const std::string& msg) : dmlc::Error(msg), span(nullptr) {} + explicit CompileError(const std::string& msg) : Error(msg), span(nullptr) {} /*! * \brief construct error from error builder. * \param err The error builder */ - Error(const ErrorBuilder& err) : dmlc::Error(err.stream_.str()), span(nullptr) {} // NOLINT(*) + CompileError(const ErrorBuilder& err) : Error(err.stream_.str()), span(nullptr) {} // NOLINT(*) /*! * \brief copy constructor. * \param other The other ereor. */ - Error(const Error& other) : dmlc::Error(other.what()), span(other.span) {} // NOLINT(*) + CompileError(const CompileError& other) : Error(other.what()), span(other.span) {} // NOLINT(*) /*! * \brief default constructor. */ - Error() : dmlc::Error(""), span(nullptr) {} + CompileError() : Error(""), span(nullptr) {} }; /*! @@ -115,13 +115,13 @@ class ErrorReporter { ErrorReporter() : errors_(), node_to_error_() {} /*! - * \brief Report a tvm::Error. + * \brief Report a CompileError. * * This API is useful for reporting spanned errors. * * \param err The error to report. */ - void Report(const Error& err) { + void Report(const CompileError& err) { if (!err.span.defined()) { throw err; } @@ -143,7 +143,7 @@ class ErrorReporter { */ void ReportAt(const GlobalVar& global, const ObjectRef& node, std::stringstream& err) { std::string err_msg = err.str(); - this->ReportAt(global, node, Error(err_msg)); + this->ReportAt(global, node, CompileError(err_msg)); } /*! @@ -158,7 +158,7 @@ class ErrorReporter { * \param node The expression or type to report the error at. * \param err The error to report. 
 */
-  void ReportAt(const GlobalVar& global, const ObjectRef& node, const Error& err);
+  void ReportAt(const GlobalVar& global, const ObjectRef& node, const CompileError& err);
 
   /*!
    * \brief Render all reported errors and exit the program.
@@ -176,7 +176,7 @@ class ErrorReporter {
   inline bool AnyErrors() { return errors_.size() != 0; }
 
  private:
-  std::vector<Error> errors_;
+  std::vector<CompileError> errors_;
   std::unordered_map, ObjectPtrHash, ObjectPtrEqual> node_to_error_;
   std::unordered_map node_to_gv_;
 };
diff --git a/include/tvm/ir/type_relation.h b/include/tvm/ir/type_relation.h
index 462588006c9ba..dd6861750a100 100644
--- a/include/tvm/ir/type_relation.h
+++ b/include/tvm/ir/type_relation.h
@@ -29,7 +29,7 @@
 #include 
 #include 
 #include 
-#include <tvm/support/logging.h>
+#include <tvm/runtime/logging.h>
 
 namespace tvm {
 
diff --git a/include/tvm/relay/analysis.h b/include/tvm/relay/analysis.h
index 5dd8370387319..f88b049940996 100644
--- a/include/tvm/relay/analysis.h
+++ b/include/tvm/relay/analysis.h
@@ -29,7 +29,7 @@
 #include 
 #include 
 #include 
-#include <tvm/support/logging.h>
+#include <tvm/runtime/logging.h>
 #include 
 #include 
 
diff --git a/include/tvm/runtime/container.h b/include/tvm/runtime/container.h
index 336fef21ab88f..362582f4dab9a 100644
--- a/include/tvm/runtime/container.h
+++ b/include/tvm/runtime/container.h
@@ -30,6 +30,7 @@
 #include 
 #include 
+#include <tvm/runtime/logging.h>
 #include 
 #include 
 
diff --git a/include/tvm/runtime/data_type.h b/include/tvm/runtime/data_type.h
index 7d914ce6bff91..b4fdcbff58b48 100644
--- a/include/tvm/runtime/data_type.h
+++ b/include/tvm/runtime/data_type.h
@@ -25,7 +25,7 @@
 #define TVM_RUNTIME_DATA_TYPE_H_
 
 #include 
-#include <tvm/support/logging.h>
+#include <tvm/runtime/logging.h>
 #include 
 #include 
 
diff --git a/include/tvm/runtime/logging.h b/include/tvm/runtime/logging.h
new file mode 100644
index 0000000000000..952a5ffec6376
--- /dev/null
+++ b/include/tvm/runtime/logging.h
@@ -0,0 +1,438 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file tvm/runtime/logging.h
+ * \brief Logging utilities.
+ *
+ * We define our own CHECK and LOG macros to replace those from dmlc-core.
+ * These macros are then injected into dmlc-core via the
+ * DMLC_USE_LOGGING_LIBRARY define. dmlc-core will #include this file wherever
+ * it needs logging.
+ */
+#ifndef TVM_RUNTIME_LOGGING_H_
+#define TVM_RUNTIME_LOGGING_H_
+
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "tvm/runtime/c_runtime_api.h"
+
+// A technique for overloading a macro name based on the number of parameters
+// passed to it. This is used to define the other macros below.
+#define GET_MACRO(_1, _2, _3, _4, _5, NAME, ...) NAME
+
+/*!
+ * \brief COND_X calls COND_X_N, where N is the number of parameters passed to COND_X.
+ * X can be any of CHECK_GE, CHECK_EQ, CHECK, or LOG. COND_X (but not COND_X_N)
+ * is what is supposed to be used outside this file.
+ * The first parameter of COND_X (and therefore, COND_X_N), which we call 'quit_on_assert',
+ * is a boolean. The rest of the parameters of COND_X are the same as the parameters of X.
+ * quit_on_assert determines the overall behavior of COND_X. If it's true, COND_X
+ * quits the program on assertion failure. If it's false, then it moves on and somehow reports
+ * the assertion failure back to the macro caller in an appropriate manner (e.g., 'return false'
+ * in a function, or 'continue' or 'break' in a loop).
+ * The default behavior when quit_on_assertion is false is to 'return false'. If this is not
+ * desirable, the macro caller can pass one final parameter to COND_X to tell COND_X what
+ * to do when quit_on_assertion is false and the assertion fails.
+ *
+ * Rationale: These macros were designed to implement functions that have two behaviors
+ * in a concise way. Those behaviors are quitting on assertion failures, or trying to
+ * move on from assertion failures. Note that these macros hide lots of control flow in them,
+ * and therefore make the logic of the whole code slightly harder to understand. However,
+ * in pieces of code that use these macros frequently, they will significantly shorten the
+ * amount of code needed to be read, and we won't need to clutter the main logic of the
+ * function with repetitive control flow structure. The first problem
+ * mentioned will be improved over time as the developer gets used to the macro.
+ *
+ * Here is an example of how to use it:
+ * \code
+ * bool f(..., bool quit_on_assertion) {
+ *   int a = 0, b = 0;
+ *   ...
+ *   a = ...
+ *   b = ...
+ *   // if quit_on_assertion is true, if a==b, continue, otherwise quit.
+ *   // if quit_on_assertion is false, if a==b, continue, otherwise 'return false' (default
+ *   // behaviour)
+ *   COND_CHECK_EQ(quit_on_assertion, a, b) << "some error message when quitting"
+ *   ...
+ *   for (int i = 0; i < N; i++) {
+ *     a = ...
+ *     b = ...
+ *     // if quit_on_assertion is true, if a==b, continue, otherwise quit.
+ *     // if quit_on_assertion is false, if a==b, continue, otherwise 'break' (non-default
+ *     // behaviour, therefore, has to be explicitly specified)
+ *     COND_CHECK_EQ(quit_on_assertion, a, b, break) << "some error message when quitting"
+ *   }
+ * }
+ * \endcode
+ */
+#define COND_CHECK_GE(...) \
+  GET_MACRO(__VA_ARGS__, COND_CHECK_GE_5, COND_CHECK_GE_4, COND_CHECK_GE_3)(__VA_ARGS__)
+#define COND_CHECK_EQ(...) \
+  GET_MACRO(__VA_ARGS__, COND_CHECK_EQ_5, COND_CHECK_EQ_4, COND_CHECK_EQ_3)(__VA_ARGS__)
+#define COND_CHECK(...) \
+  GET_MACRO(__VA_ARGS__, COND_CHECK_5, COND_CHECK_4, COND_CHECK_3, COND_CHECK_2)(__VA_ARGS__)
+#define COND_LOG(...) \
+  GET_MACRO(__VA_ARGS__, COND_LOG_5, COND_LOG_4, COND_LOG_3, COND_LOG_2)(__VA_ARGS__)
+
+// Not supposed to be used by users directly.
+#define COND_CHECK_OP(quit_on_assert, x, y, what, op) \
+  if (!quit_on_assert) {                              \
+    if (!((x)op(y))) what;                            \
+  } else /* NOLINT(*) */                              \
+    CHECK_##op(x, y)
+
+#define COND_CHECK_EQ_4(quit_on_assert, x, y, what) COND_CHECK_OP(quit_on_assert, x, y, what, ==)
+#define COND_CHECK_GE_4(quit_on_assert, x, y, what) COND_CHECK_OP(quit_on_assert, x, y, what, >=)
+
+#define COND_CHECK_3(quit_on_assert, x, what) \
+  if (!quit_on_assert) {                      \
+    if (!(x)) what;                           \
+  } else /* NOLINT(*) */                      \
+    CHECK(x)
+
+#define COND_LOG_3(quit_on_assert, x, what) \
+  if (!quit_on_assert) {                    \
+    what;                                   \
+  } else /* NOLINT(*) */                    \
+    LOG(x)
+
+#define COND_CHECK_EQ_3(quit_on_assert, x, y) COND_CHECK_EQ_4(quit_on_assert, x, y, return false)
+#define COND_CHECK_GE_3(quit_on_assert, x, y) COND_CHECK_GE_4(quit_on_assert, x, y, return false)
+#define COND_CHECK_2(quit_on_assert, x) COND_CHECK_3(quit_on_assert, x, return false)
+#define COND_LOG_2(quit_on_assert, x) COND_LOG_3(quit_on_assert, x, return false)
+
+#ifdef _MSC_VER
+#define TVM_THROW_EXCEPTION noexcept(false) __declspec(noreturn)
+#else
+#define TVM_THROW_EXCEPTION noexcept(false)
+#endif
+
+namespace tvm {
+namespace runtime {
+
+/*! \brief Generate a backtrace when called.
+ * \return A multiline string of the backtrace. There will be either one or two lines per frame.
+ */
+std::string Backtrace();
+
+/*! \brief Base error type for TVM. Wraps a string message. */
+class Error : public ::dmlc::Error {  // for backwards compatibility
+ public:
+  /*! \brief Construct an error.
+   * \param s The message to be displayed with the error.
+   */
+  explicit Error(const std::string& s) : ::dmlc::Error(s) {}
+};
+
+/*! \brief Error type for errors from CHECK, ICHECK, and LOG(FATAL). This error
+ * contains a backtrace of where it occurred.
+ */
+class InternalError : public Error {
+ public:
+  /*! \brief Construct an error. Not recommended to use directly. Instead use LOG(FATAL).
+   *
+   * \param file The file where the error occurred.
+   * \param lineno The line number where the error occurred.
+   * \param message The error message to display.
+   * \param time The time at which the error occurred. This should be in local time.
+   * \param backtrace Backtrace from when the error occurred.
+   */
+  InternalError(std::string file, int lineno, std::string message,
+                std::time_t time = std::time(nullptr), std::string backtrace = Backtrace())
+      : Error(""),
+        file_(file),
+        lineno_(lineno),
+        message_(message),
+        time_(time),
+        backtrace_(backtrace) {
+    std::ostringstream s;
+    // XXX: Do not change this format, otherwise all error handling in python will break (because
+    // it parses the message to reconstruct the error type).
+    // TODO(tkonolige): Convert errors to Objects, so we can avoid the mess of formatting/parsing
+    // error messages correctly.
+    s << "[" << std::put_time(std::localtime(&time), "%H:%M:%S") << "] " << file << ":" << lineno
+      << ": " << message << std::endl;
+    if (backtrace.size() > 0) {
+      s << backtrace << std::endl;
+    }
+    full_message_ = s.str();
+  }
+  /*! \return The file in which the error occurred. */
+  const std::string& file() const { return file_; }
+  /*! \return The message associated with this error. */
+  const std::string& message() const { return message_; }
+  /*! \return The formatted error message, including file, line number, backtrace, and message. */
+  const std::string& full_message() const { return full_message_; }
+  /*! \return The backtrace from where this error occurred. */
+  const std::string& backtrace() const { return backtrace_; }
+  const std::time_t& time() const { return time_; }
+  /*! \return The line number at which this error occurred. */
+  int lineno() const { return lineno_; }
+  virtual const char* what() const noexcept { return full_message_.c_str(); }
+
+ private:
+  std::string file_;
+  int lineno_;
+  std::string message_;
+  std::time_t time_;
+  std::string backtrace_;
+  std::string full_message_;  // holds the full error string
+};
+
+namespace detail {
+#ifndef TVM_LOG_CUSTOMIZE
+
+/*! \brief Class to accumulate an error message and throw it. Do not use
+ * directly, instead use LOG(FATAL).
+ */
+class LogFatal {
+ public:
+  LogFatal(const std::string& file, int lineno) : file_(file), lineno_(lineno) {}
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4722)
+#endif
+  ~LogFatal() noexcept(false) { throw InternalError(file_, lineno_, stream_.str()); }
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+  std::ostringstream& stream() { return stream_; }
+
+ private:
+  std::ostringstream stream_;
+  std::string file_;
+  int lineno_;
+};
+
+/*! \brief Class to accumulate a log message. Do not use directly, instead use
+ * LOG(INFO), LOG(WARNING), LOG(ERROR).
+ */
+class LogMessage {
+ public:
+  LogMessage(const std::string& file, int lineno) {
+    std::time_t t = std::time(nullptr);
+    stream_ << "[" << std::put_time(std::localtime(&t), "%H:%M:%S") << "] " << file << ":" << lineno
+            << ": ";
+  }
+  ~LogMessage() { std::cerr << stream_.str() << std::endl; }
+  std::ostringstream& stream() { return stream_; }
+
+ private:
+  std::ostringstream stream_;
+};
+#else
+// Custom implementations of LogFatal and LogMessage that allow the user to
+// override handling of the message. The user must implement LogFatalImpl and LogMessageImpl.
+void LogFatalImpl(const std::string& file, int lineno, const std::string& message);
+class LogFatal {
+ public:
+  LogFatal(const std::string& file, int lineno) : file_(file), lineno_(lineno) {}
+  ~LogFatal() TVM_THROW_EXCEPTION { LogFatalImpl(file_, lineno_, stream_.str()); }
+  std::ostringstream& stream() { return stream_; }
+
+ private:
+  std::ostringstream stream_;
+  std::string file_;
+  int lineno_;
+};
+
+void LogMessageImpl(const std::string& file, int lineno, const std::string& message);
+class LogMessage {
+ public:
+  LogMessage(const std::string& file, int lineno) : file_(file), lineno_(lineno) {}
+  ~LogMessage() { LogMessageImpl(file_, lineno_, stream_.str()); }
+  std::ostringstream& stream() { return stream_; }
+
+ private:
+  std::string file_;
+  int lineno_;
+  std::ostringstream stream_;
+};
+#endif
+
+// Below is from dmlc-core
+// This class is used to explicitly ignore values in the conditional
+// logging macros. This avoids compiler warnings like "value computed
+// is not used" and "statement has no effect".
+class LogMessageVoidify {
+ public:
+  LogMessageVoidify() {}
+  // This has to be an operator with a precedence lower than << but
+  // higher than "?:". See its usage.
+  void operator&(std::ostream&) {}
+};
+
+// Also from dmlc-core
+inline bool DebugLoggingEnabled() {
+  static int state = 0;
+  if (state == 0) {
+    if (auto var = std::getenv("TVM_LOG_DEBUG")) {
+      if (std::string(var) == "1") {
+        state = 1;
+      } else {
+        state = -1;
+      }
+    } else {
+      // by default hide debug logging.
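+      // (set the environment variable TVM_LOG_DEBUG=1 to enable it)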
+      state = -1;
+    }
+  }
+  return state == 1;
+}
+
+constexpr const char* kTVM_INTERNAL_ERROR_MESSAGE =
+    "---------------------------------------------------------------\n"
+    "An internal invariant was violated during the execution of TVM.\n"
+    "Please read TVM's error reporting guidelines.\n"
+    "More details can be found here: https://discuss.tvm.ai/t/error-reporting/7793.\n"
+    "---------------------------------------------------------------\n";
+
+// Inline _Pragma in macros does not work reliably on old versions of MSVC and
+// GCC. We wrap all comparisons in a function so that we can use #pragma to
+// silence bad comparison warnings.
+#define TVM_CHECK_FUNC(name, op)                                    \
+  template <typename A, typename B>                                 \
+  DMLC_ALWAYS_INLINE bool LogCheck##name(const A& a, const B& b) {  \
+    return a op b;                                                  \
+  }
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wsign-compare"
+TVM_CHECK_FUNC(_LT, <)
+TVM_CHECK_FUNC(_GT, >)
+TVM_CHECK_FUNC(_LE, <=)
+TVM_CHECK_FUNC(_GE, >=)
+TVM_CHECK_FUNC(_EQ, ==)
+TVM_CHECK_FUNC(_NE, !=)
+#pragma GCC diagnostic pop
+}  // namespace detail
+
+#define LOG(level) LOG_##level
+#define LOG_FATAL ::tvm::runtime::detail::LogFatal(__FILE__, __LINE__).stream()
+#define LOG_INFO ::tvm::runtime::detail::LogMessage(__FILE__, __LINE__).stream()
+#define LOG_ERROR (::tvm::runtime::detail::LogMessage(__FILE__, __LINE__).stream() << "error: ")
+#define LOG_WARNING (::tvm::runtime::detail::LogMessage(__FILE__, __LINE__).stream() << "warning: ")
+
+#define TVM_CHECK_BINARY_OP(name, op, x, y)                      \
+  if (!::tvm::runtime::detail::LogCheck##name(x, y))             \
+  ::tvm::runtime::detail::LogFatal(__FILE__, __LINE__).stream()  \
+      << "Check failed: " << #x " " #op " " #y << ": "
+
+#define CHECK(x)                                                 \
+  if (!(x))                                                      \
+  ::tvm::runtime::detail::LogFatal(__FILE__, __LINE__).stream()  \
+      << "Check failed: " #x << " == false: "
+
+#define CHECK_LT(x, y) TVM_CHECK_BINARY_OP(_LT, <, x, y)
+#define CHECK_GT(x, y) TVM_CHECK_BINARY_OP(_GT, >, x, y)
+#define CHECK_LE(x, y) TVM_CHECK_BINARY_OP(_LE, <=, x, y)
+#define CHECK_GE(x, y) TVM_CHECK_BINARY_OP(_GE, >=, x, y)
+#define CHECK_EQ(x, y) TVM_CHECK_BINARY_OP(_EQ, ==, x, y)
+#define CHECK_NE(x, y) TVM_CHECK_BINARY_OP(_NE, !=, x, y)
+#define CHECK_NOTNULL(x)                                                           \
+  ((x) == nullptr ? ::tvm::runtime::detail::LogFatal(__FILE__, __LINE__).stream()  \
+                        << "Check not null: " #x << ' ',                           \
+   (x) : (x))  // NOLINT(*)
+
+#define LOG_IF(severity, condition) \
+  !(condition) ? (void)0 : ::tvm::runtime::detail::LogMessageVoidify() & LOG(severity)
+
+#if TVM_LOG_DEBUG
+
+#define LOG_DFATAL LOG_FATAL
+#define DFATAL FATAL
+#define DLOG(severity) LOG_IF(severity, ::tvm::runtime::detail::DebugLoggingEnabled())
+#define DLOG_IF(severity, condition) \
+  LOG_IF(severity, ::tvm::runtime::detail::DebugLoggingEnabled() && (condition))
+
+#else
+
+#define LOG_DFATAL LOG_ERROR
+#define DFATAL ERROR
+#define DLOG(severity) true ? (void)0 : ::tvm::runtime::detail::LogMessageVoidify() & LOG(severity)
+#define DLOG_IF(severity, condition) \
+  (true || !(condition)) ? (void)0 : ::tvm::runtime::detail::LogMessageVoidify() & LOG(severity)
+
+#endif
+
+#if TVM_LOG_DEBUG
+#define DCHECK(x) CHECK(x)
+#define DCHECK_LT(x, y) CHECK((x) < (y))
+#define DCHECK_GT(x, y) CHECK((x) > (y))
+#define DCHECK_LE(x, y) CHECK((x) <= (y))
+#define DCHECK_GE(x, y) CHECK((x) >= (y))
+#define DCHECK_EQ(x, y) CHECK((x) == (y))
+#define DCHECK_NE(x, y) CHECK((x) != (y))
+#else
+// When debug logging is disabled, compile the DCHECKs out: the `while (false)`
+// keeps the arguments syntactically checked but never evaluated.
+#define DCHECK(x) \
+  while (false) CHECK(x)
+#define DCHECK_LT(x, y) \
+  while (false) CHECK((x) < (y))
+#define DCHECK_GT(x, y) \
+  while (false) CHECK((x) > (y))
+#define DCHECK_LE(x, y) \
+  while (false) CHECK((x) <= (y))
+#define DCHECK_GE(x, y) \
+  while (false) CHECK((x) >= (y))
+#define DCHECK_EQ(x, y) \
+  while (false) CHECK((x) == (y))
+#define DCHECK_NE(x, y) \
+  while (false) CHECK((x) != (y))
+#endif
+
+#define TVM_ICHECK_INDENT "  "
+
+#define ICHECK_BINARY_OP(name, op, x, y)                                   \
+  if (!::tvm::runtime::detail::LogCheck##name(x, y))                       \
+  ::tvm::runtime::detail::LogFatal(__FILE__, __LINE__).stream()            \
+      << ::tvm::runtime::detail::kTVM_INTERNAL_ERROR_MESSAGE << std::endl  \
+      << TVM_ICHECK_INDENT << "Check failed: " << #x " " #op " " #y << ": "
+
+#define ICHECK(x)                                                                  \
+  if (!(x))                                                                        \
+  ::tvm::runtime::detail::LogFatal(__FILE__, __LINE__).stream()                    \
+      << ::tvm::runtime::detail::kTVM_INTERNAL_ERROR_MESSAGE << TVM_ICHECK_INDENT  \
+      << "Check failed: " #x << " == false: "
+
+#define ICHECK_LT(x, y) ICHECK_BINARY_OP(_LT, <, x, y)
+#define ICHECK_GT(x, y) ICHECK_BINARY_OP(_GT, >, x, y)
+#define ICHECK_LE(x, y) ICHECK_BINARY_OP(_LE, <=, x, y)
+#define ICHECK_GE(x, y) ICHECK_BINARY_OP(_GE, >=, x, y)
+#define ICHECK_EQ(x, y) ICHECK_BINARY_OP(_EQ, ==, x, y)
+#define ICHECK_NE(x, y) ICHECK_BINARY_OP(_NE, !=, x, y)
+#define ICHECK_NOTNULL(x)                                                          \
+  ((x) == nullptr ? ::tvm::runtime::detail::LogFatal(__FILE__, __LINE__).stream()  \
+                        << ::tvm::runtime::detail::kTVM_INTERNAL_ERROR_MESSAGE     \
+                        << TVM_ICHECK_INDENT << "Check not null: " #x << ' ',      \
+   (x) : (x))  // NOLINT(*)
+
+}  // namespace runtime
+// Re-export error types
+using runtime::Error;
+using runtime::InternalError;
+}  // namespace tvm
+#endif  // TVM_RUNTIME_LOGGING_H_
diff --git a/include/tvm/runtime/object.h b/include/tvm/runtime/object.h
index 47788394126ed..048fc1d5af544 100644
--- a/include/tvm/runtime/object.h
+++ b/include/tvm/runtime/object.h
@@ -24,7 +24,7 @@
 #define TVM_RUNTIME_OBJECT_H_
 
 #include 
-#include 
+#include <tvm/runtime/logging.h>
 #include 
 #include 
diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h
index 751a435c734aa..7113863a6fb35 100644
--- a/include/tvm/runtime/packed_func.h
+++ b/include/tvm/runtime/packed_func.h
@@ -24,10 +24,10 @@
 #ifndef TVM_RUNTIME_PACKED_FUNC_H_
 #define TVM_RUNTIME_PACKED_FUNC_H_
 
-#include 
 #include 
 #include 
 #include 
+#include <tvm/runtime/logging.h>
 #include 
 #include 
 #include 
@@ -1086,7 +1086,7 @@ struct PackedFuncValueConverter {
       Function(::tvm::runtime::TVMArgs(args, type_code, num_args), &rv);  \
       rv.MoveToCHost(out_value, out_type_code);                           \
       return 0;                                                           \
-    } catch (const ::std::runtime_error& _except_) {                      \
+    } catch (const ::std::exception& _except_) {                          \
       TVMAPISetLastError(_except_.what());                                \
       return -1;                                                          \
     }                                                                     \
@@ -1140,7 +1140,7 @@ struct PackedFuncValueConverter {
           f, ::tvm::runtime::TVMArgs(args, type_code, num_args), &rv);    \
       rv.MoveToCHost(out_value, out_type_code);                           \
       return 0;                                                           \
-    } catch (const ::std::runtime_error& _except_) {                      \
+    } catch (const ::std::exception& _except_) {                          \
       TVMAPISetLastError(_except_.what());                                \
       return -1;                                                          \
     }                                                                     \
diff --git a/include/tvm/runtime/vm/bytecode.h b/include/tvm/runtime/vm/bytecode.h
index
e858c4458054c..72a557fa93b1e 100644 --- a/include/tvm/runtime/vm/bytecode.h +++ b/include/tvm/runtime/vm/bytecode.h @@ -25,7 +25,7 @@ #define TVM_RUNTIME_VM_BYTECODE_H_ #include -#include +#include #include #include diff --git a/include/tvm/support/logging.h b/include/tvm/support/logging.h deleted file mode 100644 index ced1902a1bd1d..0000000000000 --- a/include/tvm/support/logging.h +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file tvm/support/logging.h - * \brief logging utilities on top of dmlc-core - */ -#ifndef TVM_SUPPORT_LOGGING_H_ -#define TVM_SUPPORT_LOGGING_H_ - -#include - -// a technique that enables overriding macro names on the number of parameters. This is used -// to define other macros below -#define GET_MACRO(_1, _2, _3, _4, _5, NAME, ...) NAME - -/*! - * \brief COND_X calls COND_X_N where N is the number of parameters passed to COND_X - * X can be any of CHECK_GE, CHECK_EQ, CHECK, or LOG (defined dmlc-core/include/dmlc/logging.h.) - * COND_X (but not COND_X_N) are supposed to be used outside this file. - * The first parameter of COND_X (and therefore, COND_X_N), which we call 'quit_on_assert', - * is a boolean. The rest of the parameters of COND_X is the same as the parameters of X. - * quit_on_assert determines the overall behaviour of COND_X. If it's true COND_X - * quits the program on assertion failure. If it's false, then it moves on and somehow reports - * the assertion failure back to the macro caller in an appropriate manner (e.g, 'return false' - * in a function, or 'continue' or 'break' in a loop) - * The default behavior when quit_on_assertion is false, is to 'return false'. If this is not - * desirable, the macro caller can pass one more last parameter to COND_X to tell COND_X what - * to do when when quit_on_assertion is false and the assertion fails. - * - * Rationale: These macros were designed to implement functions that have two behaviours - * in a concise way. Those behaviours are quitting on assertion failures, or trying to - * move on from assertion failures. Note that these macros hide lots of control flow in them, - * and therefore, makes the logic of the whole code slightly harder to understand. However, - * in pieces of code that use these macros frequently, it will significantly shorten the - * amount of code needed to be read, and we won't need to clutter the main logic of the - * function by repetitive control flow structure. The first problem - * mentioned will be improved over time as the developer gets used to the macro. - * - * Here is an example of how to use it - * \code - * bool f(..., bool quit_on_assertion) { - * int a = 0, b = 0; - * ... - * a = ... - * b = ... - * // if quit_on_assertion is true, if a==b, continue, otherwise quit. 
- * // if quit_on_assertion is false, if a==b, continue, otherwise 'return false' (default - * behaviour) COND_CHECK_EQ(quit_on_assertion, a, b) << "some error message when quiting" - * ... - * for (int i = 0; i < N; i++) { - * a = ... - * b = ... - * // if quit_on_assertion is true, if a==b, continue, otherwise quit. - * // if quit_on_assertion is false, if a==b, continue, otherwise 'break' (non-default - * // behaviour, therefore, has to be explicitly specified) - * COND_CHECK_EQ(quit_on_assertion, a, b, break) << "some error message when quiting" - * } - * } - * \endcode - */ -#define COND_CHECK_GE(...) \ - GET_MACRO(__VA_ARGS__, COND_CHECK_GE_5, COND_CHECK_GE_4, COND_CHECK_GE_3)(__VA_ARGS__) -#define COND_CHECK_EQ(...) \ - GET_MACRO(__VA_ARGS__, COND_CHECK_EQ_5, COND_CHECK_EQ_4, COND_CHECK_EQ_3)(__VA_ARGS__) -#define COND_CHECK(...) \ - GET_MACRO(__VA_ARGS__, COND_CHECK_5, COND_CHECK_4, COND_CHECK_3, COND_CHECK_2)(__VA_ARGS__) -#define COND_LOG(...) \ - GET_MACRO(__VA_ARGS__, COND_LOG_5, COND_LOG_4, COND_LOG_3, COND_LOG_2)(__VA_ARGS__) - -// Not supposed to be used by users directly. -#define COND_CHECK_OP(quit_on_assert, x, y, what, op) \ - if (!quit_on_assert) { \ - if (!((x)op(y))) what; \ - } else /* NOLINT(*) */ \ - CHECK_##op(x, y) - -#define COND_CHECK_EQ_4(quit_on_assert, x, y, what) COND_CHECK_OP(quit_on_assert, x, y, what, ==) -#define COND_CHECK_GE_4(quit_on_assert, x, y, what) COND_CHECK_OP(quit_on_assert, x, y, what, >=) - -#define COND_CHECK_3(quit_on_assert, x, what) \ - if (!quit_on_assert) { \ - if (!(x)) what; \ - } else /* NOLINT(*) */ \ - CHECK(x) - -#define COND_LOG_3(quit_on_assert, x, what) \ - if (!quit_on_assert) { \ - what; \ - } else /* NOLINT(*) */ \ - LOG(x) - -#define COND_CHECK_EQ_3(quit_on_assert, x, y) COND_CHECK_EQ_4(quit_on_assert, x, y, return false) -#define COND_CHECK_GE_3(quit_on_assert, x, y) COND_CHECK_GE_4(quit_on_assert, x, y, return false) -#define COND_CHECK_2(quit_on_assert, x) COND_CHECK_3(quit_on_assert, x, return false) -#define COND_LOG_2(quit_on_assert, x) COND_LOG_3(quit_on_assert, x, return false) - -namespace tvm { - -constexpr const char* kTVM_INTERNAL_ERROR_MESSAGE = - "\n---------------------------------------------------------------\n" - "An internal invariant was violated during the execution of TVM.\n" - "Please read TVM's error reporting guidelines.\n" - "More details can be found here: https://discuss.tvm.ai/t/error-reporting/7793.\n" - "---------------------------------------------------------------\n"; - -#define ICHECK_INDENT " " - -#define ICHECK_BINARY_OP(name, op, x, y) \ - if (dmlc::LogCheckError _check_err = dmlc::LogCheck##name(x, y)) \ - dmlc::LogMessageFatal(__FILE__, __LINE__).stream() \ - << tvm::kTVM_INTERNAL_ERROR_MESSAGE << std::endl \ - << ICHECK_INDENT << "Check failed: " << #x " " #op " " #y << *(_check_err.str) << ": " - -#define ICHECK(x) \ - if (!(x)) \ - dmlc::LogMessageFatal(__FILE__, __LINE__).stream() \ - << tvm::kTVM_INTERNAL_ERROR_MESSAGE << ICHECK_INDENT << "Check failed: " #x << " == false: " - -#define ICHECK_LT(x, y) ICHECK_BINARY_OP(_LT, <, x, y) -#define ICHECK_GT(x, y) ICHECK_BINARY_OP(_GT, >, x, y) -#define ICHECK_LE(x, y) ICHECK_BINARY_OP(_LE, <=, x, y) -#define ICHECK_GE(x, y) ICHECK_BINARY_OP(_GE, >=, x, y) -#define ICHECK_EQ(x, y) ICHECK_BINARY_OP(_EQ, ==, x, y) -#define ICHECK_NE(x, y) ICHECK_BINARY_OP(_NE, !=, x, y) -#define ICHECK_NOTNULL(x) \ - ((x) == nullptr ? 
dmlc::LogMessageFatal(__FILE__, __LINE__).stream() \ - << tvm::kTVM_INTERNAL_ERROR_MESSAGE << ICHECK_INDENT \ - << "Check not null: " #x << ' ', \ - (x) : (x)) // NOLINT(*) - -/*! \brief The diagnostic level, controls the printing of the message. */ -enum class DiagnosticLevel : int { - kBug = 10, - kError = 20, - kWarning = 30, - kNote = 40, - kHelp = 50, -}; - -} // namespace tvm -#endif // TVM_SUPPORT_LOGGING_H_ diff --git a/include/tvm/support/with.h b/include/tvm/support/with.h index 90c82c4f3a06b..d4547a304e8f2 100644 --- a/include/tvm/support/with.h +++ b/include/tvm/support/with.h @@ -25,7 +25,7 @@ #ifndef TVM_SUPPORT_WITH_H_ #define TVM_SUPPORT_WITH_H_ -#include +#include #include diff --git a/licenses/LICENSE.libbacktrace.txt b/licenses/LICENSE.libbacktrace.txt new file mode 100644 index 0000000000000..097d2774e5dfb --- /dev/null +++ b/licenses/LICENSE.libbacktrace.txt @@ -0,0 +1,29 @@ +# Copyright (C) 2012-2016 Free Software Foundation, Inc. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: + +# (1) Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. + +# (2) Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. + +# (3) The name of the author may not be used to +# endorse or promote products derived from this software without +# specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, +# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
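
For reference, here is a minimal standalone sketch (not part of this patch series) of how the COND_X macros retained in the new <tvm/runtime/logging.h> interact with the exception-based checks introduced above. It assumes the header is on the include path; the function AllEqual and its error message are made up for illustration.

#include <tvm/runtime/logging.h>

#include <iostream>
#include <vector>

// One function with the two behaviors described in the macro documentation.
bool AllEqual(const std::vector<int>& xs, int expected, bool quit_on_assert) {
  for (int x : xs) {
    // quit_on_assert == false: a mismatch makes AllEqual return false (default 'what').
    // quit_on_assert == true:  the macro expands to CHECK_EQ, which throws.
    COND_CHECK_EQ(quit_on_assert, x, expected) << "unexpected element value";
  }
  return true;
}

int main() {
  std::vector<int> xs = {1, 1, 2};
  std::cout << AllEqual(xs, 1, false) << std::endl;  // prints 0, no exception
  try {
    AllEqual(xs, 1, true);
  } catch (const tvm::runtime::InternalError& e) {
    std::cerr << e.message() << std::endl;  // "Check failed: x expected: ..."
  }
  return 0;
}

In the tolerant mode the failed check simply reports back to the caller; in the strict mode the underlying CHECK_EQ throws tvm::runtime::InternalError, which carries the formatted message and, when libbacktrace is available, a stack trace.
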
diff --git a/python/setup.py b/python/setup.py
index e02369e977778..b47e5b14f6a7a 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -94,7 +94,7 @@ def config_cython():
         subdir = "_cy2"
     ret = []
     path = "tvm/_ffi/_cython"
-    extra_compile_args = ["-std=c++14"]
+    extra_compile_args = ["-std=c++14", "-DDMLC_USE_LOGGING_LIBRARY=<tvm/runtime/logging.h>"]
     if os.name == "nt":
         library_dirs = ["tvm", "../build/Release", "../build"]
         libraries = ["tvm"]
diff --git a/python/tvm/_ffi/base.py b/python/tvm/_ffi/base.py
index 397090618ade7..0496195fd73fa 100644
--- a/python/tvm/_ffi/base.py
+++ b/python/tvm/_ffi/base.py
@@ -253,7 +253,9 @@ def c2pyerror(err_msg):
     message = []
     for line in arr:
         if trace_mode:
-            if line.startswith("  "):
+            if line.startswith("        "):
+                stack_trace[-1] += "\n" + line
+            elif line.startswith("  "):
                 stack_trace.append(line)
             else:
                 trace_mode = False
diff --git a/python/tvm/micro/build.py b/python/tvm/micro/build.py
index 3837d423f8bd9..d95f14f0349e7 100644
--- a/python/tvm/micro/build.py
+++ b/python/tvm/micro/build.py
@@ -118,7 +118,7 @@ def get_runtime_libs() -> str:
 
 RUNTIME_SRC_REGEX = re.compile(r"^.*\.cc?$", re.IGNORECASE)
 
-_COMMON_CFLAGS = ["-Wall", "-Werror"]
+_COMMON_CFLAGS = ["-Wall", "-Werror", "-DDMLC_USE_LOGGING_LIBRARY=<tvm/runtime/logging.h>"]
 
 
 def _build_default_compiler_options(standalone_crt_dir: typing.Optional[str] = None) -> str:
diff --git a/src/auto_scheduler/compute_dag.cc b/src/auto_scheduler/compute_dag.cc
index 4e7fb05660a4d..abbcba234848d 100644
--- a/src/auto_scheduler/compute_dag.cc
+++ b/src/auto_scheduler/compute_dag.cc
@@ -1367,7 +1367,7 @@ Array<State> ComputeDAG::InferBound(const Array<State>& states) const {
   support::parallel_for(0, states.size(), [this, &states, &out_states](int i) {
     try {
       out_states.Set(i, (states[i].defined()) ? this->InferBound(states[i]) : states[i]);
-    } catch (dmlc::Error& e) {
+    } catch (Error& e) {
       LOG(WARNING) << "InferBound fails on the state:\n"
                    << states[i] << "\n"
                    << "with: " << e.what() << std::endl;
diff --git a/src/auto_scheduler/feature.cc b/src/auto_scheduler/feature.cc
index d93218c0208c1..b3c62f01c7c8d 100755
--- a/src/auto_scheduler/feature.cc
+++ b/src/auto_scheduler/feature.cc
@@ -1328,7 +1328,7 @@ void GetPerStoreFeaturesWorkerFunc(const SearchTask& task, const State& state, i
     const auto& prim_func = (*it).second.as<PrimFuncNode>();
     GetPerStoreFeature(prim_func->body, task->hardware_params->cache_line_bytes, max_n_bufs,
                        feature);
-  } catch (dmlc::Error& e) {
+  } catch (Error& e) {
     (*error_ct)++;
   }
 }
diff --git a/src/auto_scheduler/search_policy/sketch_policy_rules.cc b/src/auto_scheduler/search_policy/sketch_policy_rules.cc
index 110be6bd6f68c..8eaf803214563 100644
--- a/src/auto_scheduler/search_policy/sketch_policy_rules.cc
+++ b/src/auto_scheduler/search_policy/sketch_policy_rules.cc
@@ -1106,7 +1106,7 @@ PopulationGenerationRule::ResultKind MutateComputeLocation::Apply(SketchPolicyNo
   }
   try {
     StepApplyToState(tmp_s->transform_steps.back(), &tmp_s, policy->search_task->compute_dag);
-  } catch (dmlc::Error& e) {
+  } catch (Error& e) {
     return ResultKind::kInvalid;
   }
 }
@@ -1228,7 +1228,7 @@ PopulationGenerationRule::ResultKind MutateParallel::Apply(SketchPolicyNode* pol
   tmp_s.CopyOnWrite()->transform_steps.push_back(step);
   try {
     StepApplyToState(tmp_s->transform_steps.back(), &tmp_s, policy->search_task->compute_dag);
-  } catch (dmlc::Error& e) {
+  } catch (Error& e) {
     return ResultKind::kInvalid;
   }
 }
diff --git a/src/auto_scheduler/transform_step.cc b/src/auto_scheduler/transform_step.cc
old mode 100755
new mode 100644
index 5ba3eee070981..b67d5cdd7bd93
---
a/src/auto_scheduler/transform_step.cc +++ b/src/auto_scheduler/transform_step.cc @@ -26,8 +26,8 @@ #include #include #include +#include #include -#include #include #include diff --git a/src/ir/error.cc b/src/ir/error.cc index 5d3978dda4ffe..0089f55a4da85 100644 --- a/src/ir/error.cc +++ b/src/ir/error.cc @@ -132,7 +132,8 @@ void ErrorReporter::RenderErrors(const IRModule& module, bool use_color) { LOG(FATAL) << annotated_prog.str() << std::endl; } -void ErrorReporter::ReportAt(const GlobalVar& global, const ObjectRef& node, const Error& err) { +void ErrorReporter::ReportAt(const GlobalVar& global, const ObjectRef& node, + const CompileError& err) { size_t index_to_insert = this->errors_.size(); this->errors_.push_back(err); auto it = this->node_to_error_.find(node); diff --git a/src/parser/parser.cc b/src/parser/parser.cc index 3061735eff7ce..c7d8e025848a3 100644 --- a/src/parser/parser.cc +++ b/src/parser/parser.cc @@ -28,9 +28,9 @@ #include #include #include +#include #include #include -#include #include @@ -172,8 +172,8 @@ class ScopeStack { void PopStack() { this->scope_stack.pop_back(); } }; -struct DuplicateKeyError : public dmlc::Error { - explicit DuplicateKeyError(const std::string& msg) : dmlc::Error(msg) {} +struct DuplicateKeyError : public Error { + explicit DuplicateKeyError(const std::string& msg) : Error(msg) {} }; /*! \brief A table of interning strings as global function and type names. */ @@ -1492,7 +1492,7 @@ class Parser { DLOG(INFO) << "op_name=" << op_name << " span=" << span; try { return Op::Get(op_name); - } catch (const dmlc::Error& e) { + } catch (const Error& e) { // we can relax this, but probably need to relax checks or return non-null here. this->diag_ctx.EmitFatal(Diagnostic::Error(span) << "operator `" << op_name diff --git a/src/parser/span_check.h b/src/parser/span_check.h index 9a887474fe670..ab71d30a54f52 100644 --- a/src/parser/span_check.h +++ b/src/parser/span_check.h @@ -30,8 +30,8 @@ #include #include #include +#include #include -#include #include #include diff --git a/src/relay/analysis/annotated_region_set.cc b/src/relay/analysis/annotated_region_set.cc index 04a18c4b7351a..85a9c51a2fa8b 100644 --- a/src/relay/analysis/annotated_region_set.cc +++ b/src/relay/analysis/annotated_region_set.cc @@ -157,8 +157,9 @@ class AnnotatedRegionSet::Creator : protected MixedModeVisitor { // Check if the argument already belongs to a region auto region = region_set_->GetRegion(call->args[0]); if (!region.defined()) { - throw Error(ErrorBuilder() << "Cannot find the corresponding region for end annotation:\n" - << AsText(GetRef(call), false)); + throw CompileError(ErrorBuilder() + << "Cannot find the corresponding region for end annotation:\n" + << AsText(GetRef(call), false)); } else { // If the argument is belonged to a region, it must have the same target. // Otherwise we should see a region_begin op. 
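
For reference, a minimal sketch (not part of this patch series) of the error-type split the relay hunks above apply: CompileError, built with ErrorBuilder (both from tvm/ir/error.h), is thrown for user-facing diagnostics about malformed input programs, while ICHECK guards internal invariants and throws InternalError with the kTVM_INTERNAL_ERROR_MESSAGE banner. The function CheckAxis below is made up for illustration; its bounds check mirrors the concatenate relation edited later in this patch.

#include <tvm/ir/error.h>
#include <tvm/runtime/logging.h>

namespace tvm {

void CheckAxis(int ndim, int axis) {
  // User-facing: malformed input programs raise CompileError so the
  // diagnostic machinery can render them with source spans.
  if (!(-ndim <= axis && axis < ndim)) {
    throw CompileError(ErrorBuilder() << "axis " << axis << " is out of range for ndim = " << ndim);
  }
  // Internal invariant: a failure here indicates a bug in TVM itself and
  // produces the internal-error banner plus a backtrace.
  ICHECK_GE(ndim, 0) << "negative rank should have been rejected earlier";
}

}  // namespace tvm

This mirrors the convention the patch establishes: dmlc::Error and std::runtime_error catches become tvm::Error (or CompileError where a user diagnostic is meant), and std::exception is caught only at the C API boundary.
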
diff --git a/src/relay/analysis/kind_check.cc b/src/relay/analysis/kind_check.cc index c7c5a0a9f0832..65b8516cb16c7 100644 --- a/src/relay/analysis/kind_check.cc +++ b/src/relay/analysis/kind_check.cc @@ -139,7 +139,7 @@ struct KindChecker : TypeFunctor { << "Expected " << data->type_vars.size() << "arguments for " << tc << "; got " << op->args.size()); } - } catch (const dmlc::Error& err) { + } catch (const Error& err) { // TODO(@jroesch): can probably relax to just emit EmitFatal(Diagnostic::Error(op->span) << "the type variable : `" << var->name_hint << "` is undefined"); diff --git a/src/relay/analysis/type_solver.cc b/src/relay/analysis/type_solver.cc index cc1ada677c653..22e2e9a710405 100644 --- a/src/relay/analysis/type_solver.cc +++ b/src/relay/analysis/type_solver.cc @@ -617,10 +617,10 @@ bool TypeSolver::Solve() { } rnode->resolved = resolved; - } catch (const Error& err) { + } catch (const CompileError& err) { this->diag_ctx_.Emit(Diagnostic::Error(rnode->span) << err.what()); rnode->resolved = false; - } catch (const dmlc::Error& e) { + } catch (const Error& e) { ICHECK(false) << e.what(); } diff --git a/src/relay/analysis/well_formed.cc b/src/relay/analysis/well_formed.cc index 856c5dc7aac1d..acc1a9adc9f46 100644 --- a/src/relay/analysis/well_formed.cc +++ b/src/relay/analysis/well_formed.cc @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc index 251a55f10b725..9d3ffc558aaeb 100644 --- a/src/relay/backend/vm/compiler.cc +++ b/src/relay/backend/vm/compiler.cc @@ -33,8 +33,8 @@ #include #include #include +#include #include -#include #include #include diff --git a/src/relay/backend/vm/compiler.h b/src/relay/backend/vm/compiler.h index 615a8181b3875..9c813a4f561c8 100644 --- a/src/relay/backend/vm/compiler.h +++ b/src/relay/backend/vm/compiler.h @@ -29,8 +29,8 @@ #include #include #include +#include #include -#include #include #include diff --git a/src/relay/backend/vm/inline_primitives.cc b/src/relay/backend/vm/inline_primitives.cc index eb848eb7a8285..05fb2a1206208 100644 --- a/src/relay/backend/vm/inline_primitives.cc +++ b/src/relay/backend/vm/inline_primitives.cc @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/relay/backend/vm/lambda_lift.cc b/src/relay/backend/vm/lambda_lift.cc index cc530a10188ea..c768a2c300ec1 100644 --- a/src/relay/backend/vm/lambda_lift.cc +++ b/src/relay/backend/vm/lambda_lift.cc @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/relay/backend/vm/removed_unused_funcs.cc b/src/relay/backend/vm/removed_unused_funcs.cc index cdf898fca756e..5e9b1b7978f97 100644 --- a/src/relay/backend/vm/removed_unused_funcs.cc +++ b/src/relay/backend/vm/removed_unused_funcs.cc @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/relay/op/nn/convolution.h b/src/relay/op/nn/convolution.h index 2a49a2e251f8a..379fa3fa71d3c 100644 --- a/src/relay/op/nn/convolution.h +++ b/src/relay/op/nn/convolution.h @@ -25,7 +25,7 @@ #define TVM_RELAY_OP_NN_CONVOLUTION_H_ #include -#include +#include #include #include diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index e3929bf8b77ee..b65068bd05066 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -312,7 +312,7 @@ bool StackRel(const Array& types, int num_inputs, const Attrs& attrs, if (first->shape[j].as() || 
e->shape[j].as() || reporter->AssertEQ(first->shape[j], e->shape[j])) continue; - throw Error( + throw CompileError( "relay.stack requires all tensors have the same shape " "on non-stacking axes"); } @@ -483,7 +483,7 @@ Array> TransposeInferCorrectLayout(const Attrs& attrs, } try { return Array>({{Layout(in_layout_str)}, {Layout(out_layout_str)}}); - } catch (const dmlc::Error& e) { + } catch (const tvm::Error& e) { // If the layout string is invalid for any reason, give up. return Array>({{Layout::Undef()}, {Layout::Undef()}}); } @@ -1691,8 +1691,8 @@ bool MeshgridRel(const Array& types, int num_inputs, const Attrs& raw_attr const MeshgridAttrs* attrs = raw_attrs.as(); const auto* tensor_tuple = types[0].as(); if (tensor_tuple == nullptr) { - throw Error( - ErrorBuilder() << "meshgrid requires a tuple of tensors as the first argument, found " + throw CompileError(ErrorBuilder() + << "meshgrid requires a tuple of tensors as the first argument, found " << PrettyPrint(types[0])); } else if (types[0].as() != nullptr) { return false; @@ -1714,14 +1714,14 @@ bool MeshgridRel(const Array& types, int num_inputs, const Attrs& raw_attr int e_ndim = static_cast(e->shape.size()); const DataType& e_dtype = e->dtype; if (e_dtype != dtype) { - throw Error("relay.meshgrid requires all tensors have the same dtype"); + throw CompileError("relay.meshgrid requires all tensors have the same dtype"); } if (e_ndim == 0) { grid_shape.emplace_back(1); } else if (e_ndim == 1) { grid_shape.emplace_back(e->shape[0]); } else { - throw Error("relay.meshgrid requires all tensors be either scalars or 1-D vectors."); + throw CompileError("relay.meshgrid requires all tensors be either scalars or 1-D vectors."); } } diff --git a/src/relay/op/tensor/transform.h b/src/relay/op/tensor/transform.h index dbf8537e0dad9..3c670bcaaa51b 100644 --- a/src/relay/op/tensor/transform.h +++ b/src/relay/op/tensor/transform.h @@ -78,8 +78,8 @@ bool ConcatenateRel(const Array& types, int num_inputs, const Attrs& attrs // Sanity check: axis int axis = param->axis; if (!(-ndim <= axis && axis < ndim)) { - throw Error(ErrorBuilder() << "concatenate only accepts `axis` in [-ndim, ndim)" - << ", but got axis = " << axis << ", and ndim = " << ndim); + throw CompileError(ErrorBuilder() << "concatenate only accepts `axis` in [-ndim, ndim)" + << ", but got axis = " << axis << ", and ndim = " << ndim); } axis = axis < 0 ? 
ndim + axis : axis; diff --git a/src/relay/op/type_relations.cc b/src/relay/op/type_relations.cc index 7b30aea2eb57b..6e30ad9624c42 100644 --- a/src/relay/op/type_relations.cc +++ b/src/relay/op/type_relations.cc @@ -85,7 +85,7 @@ TensorType ConcreteBroadcast(const TensorType& t1, const TensorType& t2, DataTyp } else if (EqualCheck(s1, s2)) { oshape.push_back(s1); } else { - throw Error(ErrorBuilder() << "Incompatible broadcast type " << t1 << " and " << t2); + throw CompileError(ErrorBuilder() << "Incompatible broadcast type " << t1 << " and " << t2); } } diff --git a/src/relay/qnn/op/concatenate.cc b/src/relay/qnn/op/concatenate.cc index 59a519d664362..eb0f83836a54a 100644 --- a/src/relay/qnn/op/concatenate.cc +++ b/src/relay/qnn/op/concatenate.cc @@ -51,9 +51,10 @@ bool QnnConcatenateRel(const Array& types, int num_inputs, const Attrs& at if (types[1].as()) { return false; } else { - throw Error(ErrorBuilder() - << "qnn concatenate requires a tuple of scales as the second argument, found " - << PrettyPrint(types[1])); + throw CompileError( + ErrorBuilder() + << "qnn concatenate requires a tuple of scales as the second argument, found " + << PrettyPrint(types[1])); } } for (const auto& input_scale : input_scales_tuple->fields) { @@ -68,9 +69,10 @@ bool QnnConcatenateRel(const Array& types, int num_inputs, const Attrs& at if (types[2].as()) { return false; } else { - throw Error(ErrorBuilder() - << "qnn concatenate requires a tuple of zero_points as the third argument, found " - << PrettyPrint(types[2])); + throw CompileError( + ErrorBuilder() + << "qnn concatenate requires a tuple of zero_points as the third argument, found " + << PrettyPrint(types[2])); } } for (const auto& input_zero_point : input_zero_points_tuple->fields) { diff --git a/src/relay/transforms/fold_explicit_padding.cc b/src/relay/transforms/fold_explicit_padding.cc index bab8b814df052..d959e5b75e40d 100644 --- a/src/relay/transforms/fold_explicit_padding.cc +++ b/src/relay/transforms/fold_explicit_padding.cc @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include "../op/tensor/transform.h" #include "pattern_utils.h" diff --git a/src/relay/transforms/inline.cc b/src/relay/transforms/inline.cc index dae34674de77d..6e6505b28dc67 100644 --- a/src/relay/transforms/inline.cc +++ b/src/relay/transforms/inline.cc @@ -36,7 +36,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/relay/transforms/memory_alloc.cc b/src/relay/transforms/memory_alloc.cc index b8c87909a0255..f75b7ba1fc750 100644 --- a/src/relay/transforms/memory_alloc.cc +++ b/src/relay/transforms/memory_alloc.cc @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/relay/transforms/partial_eval.cc b/src/relay/transforms/partial_eval.cc index fa080a7ff22c6..3a87aa8ed4988 100644 --- a/src/relay/transforms/partial_eval.cc +++ b/src/relay/transforms/partial_eval.cc @@ -861,8 +861,8 @@ class PartialEvaluator : public ExprFunctor return VisitFunc(GetRef(op), ll); } - struct ReflectError : dmlc::Error { - ReflectError() : dmlc::Error("static value not found") {} + struct ReflectError : Error { + ReflectError() : Error("static value not found") {} }; Expr Reflect(const PStatic& st) { diff --git a/src/relay/transforms/simplify_expr.cc b/src/relay/transforms/simplify_expr.cc index 3c8876ceccb5f..b4f4cc16e9df4 100644 --- a/src/relay/transforms/simplify_expr.cc +++ b/src/relay/transforms/simplify_expr.cc @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include 
"../op/tensor/transform.h" #include "pattern_utils.h" diff --git a/src/relay/transforms/to_a_normal_form.cc b/src/relay/transforms/to_a_normal_form.cc index 05844477cc5b4..91e8d90c1232f 100644 --- a/src/relay/transforms/to_a_normal_form.cc +++ b/src/relay/transforms/to_a_normal_form.cc @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include "../../support/arena.h" #include "../analysis/dependency_graph.h" diff --git a/src/relay/transforms/to_basic_block_normal_form.cc b/src/relay/transforms/to_basic_block_normal_form.cc index 1aab367cf22ae..79157bba19180 100644 --- a/src/relay/transforms/to_basic_block_normal_form.cc +++ b/src/relay/transforms/to_basic_block_normal_form.cc @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include "../../support/arena.h" #include "../analysis/dependency_graph.h" diff --git a/src/relay/transforms/type_infer.cc b/src/relay/transforms/type_infer.cc index b4ccd16598656..4c60137924260 100644 --- a/src/relay/transforms/type_infer.cc +++ b/src/relay/transforms/type_infer.cc @@ -166,7 +166,7 @@ class TypeInferencer : private ExprFunctor, bool assign_rhs = true) { try { return solver_.Unify(t1, t2, span, assign_lhs, assign_rhs); - } catch (const dmlc::Error& e) { + } catch (const Error& e) { this->EmitFatal(Diagnostic::Error(span) << "Error unifying `" << t1 << "` and `" << t2 << "`: " << e.what()); return Type(); diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc index 7fd27cba61367..150d7f215da5a 100644 --- a/src/runtime/c_runtime_api.cc +++ b/src/runtime/c_runtime_api.cc @@ -384,7 +384,7 @@ typedef dmlc::ThreadLocalStore TVMAPIRuntimeStore; const char* TVMGetLastError() { return TVMAPIRuntimeStore::Get()->last_error.c_str(); } -int TVMAPIHandleException(const std::runtime_error& e) { +int TVMAPIHandleException(const std::exception& e) { TVMAPISetLastError(NormalizeError(e.what()).c_str()); return -1; } @@ -518,7 +518,7 @@ int TVMFuncCreateFromCFunc(TVMPackedCFunc func, void* resource_handle, TVMPacked int ret = func(const_cast(args.values), const_cast(args.type_codes), args.num_args, rv, resource_handle); if (ret != 0) { - throw dmlc::Error(TVMGetLastError() + ::dmlc::StackTrace()); + throw tvm::Error(TVMGetLastError() + tvm::runtime::Backtrace()); } }); } else { @@ -529,7 +529,7 @@ int TVMFuncCreateFromCFunc(TVMPackedCFunc func, void* resource_handle, TVMPacked int ret = func(const_cast(args.values), const_cast(args.type_codes), args.num_args, rv, rpack.get()); if (ret != 0) { - throw dmlc::Error(TVMGetLastError() + ::dmlc::StackTrace()); + throw tvm::Error(TVMGetLastError() + tvm::runtime::Backtrace()); } }); } diff --git a/src/runtime/contrib/cblas/cblas.cc b/src/runtime/contrib/cblas/cblas.cc index 16496e06aae34..fbac6222488d2 100644 --- a/src/runtime/contrib/cblas/cblas.cc +++ b/src/runtime/contrib/cblas/cblas.cc @@ -21,8 +21,8 @@ * \file Use external cblas library call. */ #include +#include #include -#include extern "C" { #include diff --git a/src/runtime/contrib/cblas/mkl.cc b/src/runtime/contrib/cblas/mkl.cc index 273aa45367dd0..4323878db2761 100644 --- a/src/runtime/contrib/cblas/mkl.cc +++ b/src/runtime/contrib/cblas/mkl.cc @@ -21,8 +21,8 @@ * \file Use external mkl library call. 
*/ #include +#include #include -#include extern "C" { #include diff --git a/src/runtime/contrib/cblas/mkldnn.cc b/src/runtime/contrib/cblas/mkldnn.cc index 1c3fa023dcc78..31abd317c6a47 100644 --- a/src/runtime/contrib/cblas/mkldnn.cc +++ b/src/runtime/contrib/cblas/mkldnn.cc @@ -21,8 +21,8 @@ * \file Use external cblas library call. */ #include +#include #include -#include extern "C" { #include diff --git a/src/runtime/contrib/cublas/cublas.cc b/src/runtime/contrib/cublas/cublas.cc index b12992f571596..9af1602cf3c09 100644 --- a/src/runtime/contrib/cublas/cublas.cc +++ b/src/runtime/contrib/cublas/cublas.cc @@ -21,8 +21,8 @@ * \file Use external cblas library call. */ #include +#include #include -#include #include "../cblas/gemm_common.h" #include "cublas_utils.h" diff --git a/src/runtime/contrib/cublas/cublas_utils.h b/src/runtime/contrib/cublas/cublas_utils.h index 32c3b03ddbb0a..3edb8300be886 100644 --- a/src/runtime/contrib/cublas/cublas_utils.h +++ b/src/runtime/contrib/cublas/cublas_utils.h @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #if CUDART_VERSION >= 10010 diff --git a/src/runtime/contrib/cudnn/cudnn_utils.h b/src/runtime/contrib/cudnn/cudnn_utils.h index 528298b75187c..9b8e9fb33f985 100644 --- a/src/runtime/contrib/cudnn/cudnn_utils.h +++ b/src/runtime/contrib/cudnn/cudnn_utils.h @@ -26,7 +26,7 @@ #include #include -#include +#include #include "../../cuda/cuda_common.h" diff --git a/src/runtime/contrib/miopen/miopen_utils.h b/src/runtime/contrib/miopen/miopen_utils.h index 9982f0914f6bc..e5a769a974f02 100644 --- a/src/runtime/contrib/miopen/miopen_utils.h +++ b/src/runtime/contrib/miopen/miopen_utils.h @@ -26,7 +26,7 @@ #include #include -#include +#include #include diff --git a/src/runtime/contrib/mps/mps_utils.h b/src/runtime/contrib/mps/mps_utils.h index d1c49732318aa..c2b7e3c7aa99d 100644 --- a/src/runtime/contrib/mps/mps_utils.h +++ b/src/runtime/contrib/mps/mps_utils.h @@ -28,8 +28,8 @@ #include #include #include +#include #include -#include #include diff --git a/src/runtime/contrib/nnpack/convolution.cc b/src/runtime/contrib/nnpack/convolution.cc index b3ea6c891d432..0d63594959026 100644 --- a/src/runtime/contrib/nnpack/convolution.cc +++ b/src/runtime/contrib/nnpack/convolution.cc @@ -23,8 +23,8 @@ #include #include #include +#include #include -#include #include "nnpack_utils.h" diff --git a/src/runtime/contrib/nnpack/fully_connected.cc b/src/runtime/contrib/nnpack/fully_connected.cc index 8b72eb38e08c3..28570026ada3e 100644 --- a/src/runtime/contrib/nnpack/fully_connected.cc +++ b/src/runtime/contrib/nnpack/fully_connected.cc @@ -22,8 +22,8 @@ */ #include #include +#include #include -#include #include "nnpack_utils.h" diff --git a/src/runtime/contrib/nnpack/nnpack_utils.h b/src/runtime/contrib/nnpack/nnpack_utils.h index 231309baaa8eb..4396ea0bcde69 100644 --- a/src/runtime/contrib/nnpack/nnpack_utils.h +++ b/src/runtime/contrib/nnpack/nnpack_utils.h @@ -25,8 +25,8 @@ #include #include #include +#include #include -#include namespace tvm { namespace contrib { diff --git a/src/runtime/contrib/random/mt_random_engine.cc b/src/runtime/contrib/random/mt_random_engine.cc index 49bc056dcafb7..699f6bbcf3762 100644 --- a/src/runtime/contrib/random/mt_random_engine.cc +++ b/src/runtime/contrib/random/mt_random_engine.cc @@ -22,8 +22,8 @@ * \brief mt19937 random engine */ #include +#include #include -#include #include #include diff --git a/src/runtime/contrib/random/random.cc b/src/runtime/contrib/random/random.cc index edcd20883369e..2d111bc322ab7 
100644 --- a/src/runtime/contrib/random/random.cc +++ b/src/runtime/contrib/random/random.cc @@ -22,8 +22,8 @@ */ #include #include +#include #include -#include #include diff --git a/src/runtime/contrib/rocblas/rocblas.cc b/src/runtime/contrib/rocblas/rocblas.cc index dca1ebc6ed832..d977b1a211b07 100644 --- a/src/runtime/contrib/rocblas/rocblas.cc +++ b/src/runtime/contrib/rocblas/rocblas.cc @@ -23,8 +23,8 @@ #include "rocblas.h" #include +#include #include -#include namespace tvm { namespace contrib { diff --git a/src/runtime/contrib/tensorrt/tensorrt_logger.h b/src/runtime/contrib/tensorrt/tensorrt_logger.h index 087cb010189c9..eb0164210dbbe 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_logger.h +++ b/src/runtime/contrib/tensorrt/tensorrt_logger.h @@ -25,7 +25,7 @@ #ifndef TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_LOGGER_H_ #define TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_LOGGER_H_ -#include +#include #include "NvInfer.h" #include "tensorrt_utils.h" diff --git a/src/runtime/contrib/vitis_ai/vitis_ai_runtime.cc b/src/runtime/contrib/vitis_ai/vitis_ai_runtime.cc index 37dc767d31af0..0e5e2ce4c4fa7 100755 --- a/src/runtime/contrib/vitis_ai/vitis_ai_runtime.cc +++ b/src/runtime/contrib/vitis_ai/vitis_ai_runtime.cc @@ -25,6 +25,7 @@ #include +#include #include #include #include diff --git a/src/runtime/cpu_device_api.cc b/src/runtime/cpu_device_api.cc index b745be33b4563..133bb01d7d137 100644 --- a/src/runtime/cpu_device_api.cc +++ b/src/runtime/cpu_device_api.cc @@ -22,8 +22,8 @@ */ #include #include +#include #include -#include #include #include diff --git a/src/runtime/crt/Makefile b/src/runtime/crt/Makefile index 0f3e3096e3197..d707d0c63b810 100644 --- a/src/runtime/crt/Makefile +++ b/src/runtime/crt/Makefile @@ -45,8 +45,8 @@ QUIET ?= @ CRT_PREFIX = $(wildcard src/crt) INCLUDES ?= -isystem include -iquote $(dir ${CRT_CONFIG}) -CFLAGS += ${INCLUDES} -Werror -g $(EXTRA_CFLAGS) -CXXFLAGS += ${INCLUDES} -std=c++11 -Werror -g $(EXTRA_CXXFLAGS) +CFLAGS += ${INCLUDES} -Werror -g $(EXTRA_CFLAGS) -DDMLC_USE_LOGGING_LIBRARY=\ +CXXFLAGS += ${INCLUDES} -std=c++11 -Werror -g $(EXTRA_CXXFLAGS) -DDMLC_USE_LOGGING_LIBRARY=\ LDFLAGS += -Werror -g $(EXTRA_LDFLAGS) ${BUILD_DIR}/%.o: src/%.c $(CRT_CONFIG) diff --git a/src/runtime/crt/graph_runtime/load_json.c b/src/runtime/crt/graph_runtime/load_json.c index 6de49a3f97890..3d1fb601a355c 100644 --- a/src/runtime/crt/graph_runtime/load_json.c +++ b/src/runtime/crt/graph_runtime/load_json.c @@ -173,7 +173,7 @@ char JSONReader_PeekNextNonSpace(JSONReader* reader) { * \param out_str the output string. NULL to merely consume input and discard it. * \param out_str_size Number of bytes available to write starting from out_str. Includes * terminating \0. 
- * \throw dmlc::Error when next token is not string + * \throw tvm::Error when next token is not string */ int JSONReader_ReadString(JSONReader* reader, char* out_str, size_t out_str_size) { int status = 0; diff --git a/src/runtime/file_utils.cc b/src/runtime/file_utils.cc index 92c398b559d2c..32dd1d8020c97 100644 --- a/src/runtime/file_utils.cc +++ b/src/runtime/file_utils.cc @@ -24,9 +24,9 @@ #include #include +#include #include #include -#include #include #include diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index 6c51e711aef13..7e98acb6fb3ee 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -491,7 +491,7 @@ PackedFunc GraphRuntime::GetFunction(const std::string& name, } else if (name == "share_params") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { const auto& module = args[0].operator Module(); - ICHECK_EQ(module.operator->()->type_key(), "GraphRuntime"); + ICHECK_EQ(module.operator->()->type_key(), std::string("GraphRuntime")); const auto& param_blob = args[1].operator std::string(); dmlc::MemoryStringStream strm(const_cast(¶m_blob)); this->ShareParams(dynamic_cast(*module.operator->()), &strm); diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc index 70cebf5afa449..a01c9def5d5de 100644 --- a/src/runtime/hexagon/hexagon_device_api.cc +++ b/src/runtime/hexagon/hexagon_device_api.cc @@ -18,8 +18,8 @@ */ #include +#include #include -#include #include #include diff --git a/src/runtime/hexagon/hexagon_module.cc b/src/runtime/hexagon/hexagon_module.cc index 994e24b99084a..f6a57ff55355b 100644 --- a/src/runtime/hexagon/hexagon_module.cc +++ b/src/runtime/hexagon/hexagon_module.cc @@ -22,8 +22,8 @@ #ifdef __ANDROID__ #include #endif +#include #include -#include #include #include diff --git a/src/runtime/hexagon/hexagon_module.h b/src/runtime/hexagon/hexagon_module.h index e558997b7a4c8..02ed7d2541c25 100644 --- a/src/runtime/hexagon/hexagon_module.h +++ b/src/runtime/hexagon/hexagon_module.h @@ -20,8 +20,8 @@ #ifndef TVM_RUNTIME_HEXAGON_HEXAGON_MODULE_H_ #define TVM_RUNTIME_HEXAGON_HEXAGON_MODULE_H_ +#include #include -#include #include #include diff --git a/src/runtime/hexagon/sim/hexagon_device_sim.cc b/src/runtime/hexagon/sim/hexagon_device_sim.cc index 6cc7dcf3209f6..1d3f0fd1006f0 100644 --- a/src/runtime/hexagon/sim/hexagon_device_sim.cc +++ b/src/runtime/hexagon/sim/hexagon_device_sim.cc @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/runtime/hexagon/target/hexagon_dsprpcapi.cc b/src/runtime/hexagon/target/hexagon_dsprpcapi.cc index d494db82e2c72..a089684c4188b 100644 --- a/src/runtime/hexagon/target/hexagon_dsprpcapi.cc +++ b/src/runtime/hexagon/target/hexagon_dsprpcapi.cc @@ -22,7 +22,7 @@ #include #include -#include +#include #include "hexagon_target_log.h" diff --git a/src/runtime/hexagon/target/hexagon_dsprpcapi.h b/src/runtime/hexagon/target/hexagon_dsprpcapi.h index c0e40805ecbf2..e4711e3da5843 100644 --- a/src/runtime/hexagon/target/hexagon_dsprpcapi.h +++ b/src/runtime/hexagon/target/hexagon_dsprpcapi.h @@ -22,7 +22,7 @@ #ifdef __ANDROID__ #include -#include +#include #include "remote.h" #include "remote64.h" diff --git a/src/runtime/hexagon/target/hexagon_stubapi.cc b/src/runtime/hexagon/target/hexagon_stubapi.cc index 5428ae7c1cfff..1fb7d942e9686 100644 --- a/src/runtime/hexagon/target/hexagon_stubapi.cc +++ b/src/runtime/hexagon/target/hexagon_stubapi.cc @@ -23,7 
+23,7 @@
 #include 
 #include 
 #include 
-#include 
+#include <tvm/runtime/logging.h>
 
 #include "hexagon_target_log.h"
diff --git a/src/runtime/hexagon/target/hexagon_stubapi.h b/src/runtime/hexagon/target/hexagon_stubapi.h
index cc5b7b7413cab..fba22b10247c9 100644
--- a/src/runtime/hexagon/target/hexagon_stubapi.h
+++ b/src/runtime/hexagon/target/hexagon_stubapi.h
@@ -24,7 +24,7 @@
 #include 
 #include 
 #include 
-#include 
+#include <tvm/runtime/logging.h>
 
 #include 
diff --git a/src/runtime/logging.cc b/src/runtime/logging.cc
new file mode 100644
index 0000000000000..8a44ec04532c1
--- /dev/null
+++ b/src/runtime/logging.cc
@@ -0,0 +1,151 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifdef TVM_BACKTRACE_DISABLED
+#include <string>
+
+// TODO(bkimball,tkonolige) This inline function is to work around a linking error I am having when
+// using MSVC. If the function definition is in logging.cc then the linker can't find it no matter
+// what kind of attributes (dllexport) I decorate it with. This is temporary and will be addressed
+// when we get backtrace working on Windows.
+namespace tvm {
+namespace runtime {
+__declspec(dllexport) std::string Backtrace() { return ""; }
+}  // namespace runtime
+}  // namespace tvm
+#else
+
+#include <backtrace.h>
+#include <cxxabi.h>
+#include <tvm/runtime/logging.h>
+
+#include <iomanip>
+#include <iostream>
+#include <memory>
+#include <mutex>
+#include <sstream>
+
+namespace tvm {
+namespace runtime {
+namespace {
+
+struct BacktraceInfo {
+  std::vector<std::string> lines;
+  size_t max_size;
+  std::string error_message;
+};
+
+void BacktraceCreateErrorCallback(void* data, const char* msg, int errnum) {
+  std::cerr << "Could not initialize backtrace state: " << msg << std::endl;
+}
+
+backtrace_state* BacktraceCreate() {
+  return backtrace_create_state(nullptr, 1, BacktraceCreateErrorCallback, nullptr);
+}
+
+static backtrace_state* _bt_state = BacktraceCreate();
+
+std::string DemangleName(std::string name) {
+  int status = 0;
+  size_t length = name.size();
+  std::unique_ptr<char, void (*)(void*)> demangled_name = {
+      abi::__cxa_demangle(name.c_str(), nullptr, &length, &status), &std::free};
+  if (demangled_name && status == 0 && length > 0) {
+    return demangled_name.get();
+  } else {
+    return name;
+  }
+}
+
+void BacktraceErrorCallback(void* data, const char* msg, int errnum) {
+  // do nothing
+}
+
+void BacktraceSyminfoCallback(void* data, uintptr_t pc, const char* symname, uintptr_t symval,
+                              uintptr_t symsize) {
+  auto str = reinterpret_cast<std::string*>(data);
+
+  if (symname != nullptr) {
+    std::string tmp(symname, symsize);
+    *str = DemangleName(tmp.c_str());
+  } else {
+    std::ostringstream s;
+    s << "0x" << std::setfill('0') << std::setw(sizeof(uintptr_t) * 2) << std::hex << pc;
+    *str = s.str();
+  }
+}
+
+int BacktraceFullCallback(void* data, uintptr_t pc, const char* filename, int lineno,
+                          const char* symbol) {
+  auto stack_trace = reinterpret_cast<BacktraceInfo*>(data);
+  std::stringstream s;
+
+  std::unique_ptr<std::string> symbol_str = std::make_unique<std::string>("");
+  if (symbol != nullptr) {
+    *symbol_str = DemangleName(symbol);
+  } else {
+    // see if syminfo gives anything
+    backtrace_syminfo(_bt_state, pc, BacktraceSyminfoCallback, BacktraceErrorCallback,
+                      symbol_str.get());
+  }
+  s << *symbol_str;
+
+  if (filename != nullptr) {
+    s << std::endl << "        at " << filename;
+    if (lineno != 0) {
+      s << ":" << lineno;
+    }
+  }
+  // Skip tvm::backtrace and tvm::LogFatal::~LogFatal at the beginning of the trace as they don't
+  // add anything useful to the backtrace.
+  if (!(stack_trace->lines.size() == 0 &&
+        (symbol_str->find("tvm::runtime::Backtrace", 0) == 0 ||
+         symbol_str->find("tvm::runtime::detail::LogFatal", 0) == 0))) {
+    stack_trace->lines.push_back(s.str());
+  }
+  // TVMFuncCall denotes the API boundary so we stop there. Exceptions should be caught there.
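+  // (returning a non-zero value from a backtrace_full callback tells libbacktrace to stop
+  // walking the stack)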
+  if (*symbol_str == "TVMFuncCall" || stack_trace->lines.size() >= stack_trace->max_size) {
+    return 1;
+  }
+  return 0;
+}
+}  // namespace
+
+std::string Backtrace() {
+  BacktraceInfo bt;
+  bt.max_size = 100;
+  if (_bt_state == nullptr) {
+    return "";
+  }
+  // libbacktrace eats memory if run on multiple threads at the same time, so we guard against it
+  static std::mutex m;
+  std::lock_guard<std::mutex> lock(m);
+  backtrace_full(_bt_state, 0, BacktraceFullCallback, BacktraceErrorCallback, &bt);
+
+  std::ostringstream s;
+  s << "Stack trace:\n";
+  for (size_t i = 0; i < bt.lines.size(); i++) {
+    s << "  " << i << ": " << bt.lines[i] << "\n";
+  }
+
+  return s.str();
+}
+}  // namespace runtime
+}  // namespace tvm
+#endif
diff --git a/src/runtime/metal/metal_common.h b/src/runtime/metal/metal_common.h
index bd07dbfde9d09..b5d06192396b2 100644
--- a/src/runtime/metal/metal_common.h
+++ b/src/runtime/metal/metal_common.h
@@ -32,8 +32,8 @@
 #import 
 #include 
 #include 
+#include <tvm/runtime/logging.h>
 #include 
-#include 
 #include 
 #include 
diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc
index 6c0d0c4c40fe5..cd916d46971d4 100644
--- a/src/runtime/micro/micro_session.cc
+++ b/src/runtime/micro/micro_session.cc
@@ -25,8 +25,8 @@
 
 #include 
 #include 
+#include <tvm/runtime/logging.h>
 #include 
-#include 
 #include 
 #include 
diff --git a/src/runtime/minrpc/minrpc_server.h b/src/runtime/minrpc/minrpc_server.h
index d5c61eccfd6db..3b9772f2fb607 100644
--- a/src/runtime/minrpc/minrpc_server.h
+++ b/src/runtime/minrpc/minrpc_server.h
@@ -46,7 +46,7 @@
 #endif
 
 #if TVM_MINRPC_ENABLE_LOGGING
-#include 
+#include <tvm/runtime/logging.h>
 #endif
 
 namespace tvm {
diff --git a/src/runtime/ndarray.cc b/src/runtime/ndarray.cc
index d3ddbf8c0229f..d46f0868a2ead 100644
--- a/src/runtime/ndarray.cc
+++ b/src/runtime/ndarray.cc
@@ -23,9 +23,9 @@
 */
 #include 
 #include 
+#include <tvm/runtime/logging.h>
 #include 
 #include 
-#include 
 
 #include "runtime_base.h"
diff --git a/src/runtime/object.cc b/src/runtime/object.cc
index ad68c70698ea5..c9a9669671e62 100644
--- a/src/runtime/object.cc
+++ b/src/runtime/object.cc
@@ -20,9 +20,9 @@
 * \file src/runtime/object.cc
 * \brief Object type management system.
 */
+#include <tvm/runtime/logging.h>
 #include 
 #include 
-#include 
 
 #include 
 #include 
diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h
index 2e7f05f910203..3fca368c758b1 100644
--- a/src/runtime/opencl/opencl_common.h
+++ b/src/runtime/opencl/opencl_common.h
@@ -26,8 +26,8 @@
 
 #include 
 #include 
+#include <tvm/runtime/logging.h>
 #include 
-#include 
 
 /* There are many OpenCL platforms that do not yet support OpenCL 2.0,
 * hence we use 1.2 APIs, some of which are now deprecated. In order
diff --git a/src/runtime/registry.cc b/src/runtime/registry.cc
index a65235090bfdc..bb5a794a030bd 100644
--- a/src/runtime/registry.cc
+++ b/src/runtime/registry.cc
@@ -22,8 +22,8 @@
 * \brief The global registry of packed function.
*/ #include +#include #include -#include #include #include diff --git a/src/runtime/rocm/rocm_device_api.cc b/src/runtime/rocm/rocm_device_api.cc index 5f24ce0eec484..5d03374a4571e 100644 --- a/src/runtime/rocm/rocm_device_api.cc +++ b/src/runtime/rocm/rocm_device_api.cc @@ -25,9 +25,9 @@ #include #include #include +#include #include #include -#include #include "rocm_common.h" diff --git a/src/runtime/rpc/rpc_device_api.cc b/src/runtime/rpc/rpc_device_api.cc index 06737f99a4de1..cdeeb368f5a27 100644 --- a/src/runtime/rpc/rpc_device_api.cc +++ b/src/runtime/rpc/rpc_device_api.cc @@ -21,8 +21,8 @@ * \file rpc_device_api.cc */ #include +#include #include -#include #include @@ -72,7 +72,7 @@ class RPCDeviceAPI final : public DeviceAPI { auto remote_ctx = RemoveRPCSessionMask(ctx); try { GetSess(ctx)->GetDeviceAPI(remote_ctx)->FreeDataSpace(remote_ctx, space->data); - } catch (const dmlc::Error& e) { + } catch (const Error& e) { // fault tolerance to remote close. } delete space; diff --git a/src/runtime/rpc/rpc_endpoint.cc b/src/runtime/rpc/rpc_endpoint.cc index 8716355fd68f2..5e2bba88921e8 100644 --- a/src/runtime/rpc/rpc_endpoint.cc +++ b/src/runtime/rpc/rpc_endpoint.cc @@ -526,7 +526,7 @@ class RPCEndpoint::EventHandler : public dmlc::Stream { try { fconstructor->CallPacked(constructor_args, &con_ret); - } catch (const dmlc::Error& e) { + } catch (const Error& e) { LOG(FATAL) << "Server[" << name_ << "]:" << " Error caught from session constructor " << constructor_name << ":\n" << e.what(); @@ -540,7 +540,7 @@ class RPCEndpoint::EventHandler : public dmlc::Stream { ICHECK_EQ(tkey, "rpc") << "Constructor " << constructor_name << " to return an RPCModule"; serving_session_ = RPCModuleGetSession(mod); this->ReturnVoid(); - } catch (const std::runtime_error& e) { + } catch (const std::exception& e) { this->ReturnException(e.what()); } @@ -562,7 +562,7 @@ class RPCEndpoint::EventHandler : public dmlc::Stream { } this->SwitchToState(kRecvPacketNumBytes); }); - } catch (const std::runtime_error& e) { + } catch (const std::exception& e) { this->ReturnException(e.what()); this->SwitchToState(kRecvPacketNumBytes); } @@ -581,7 +581,7 @@ class RPCEndpoint::EventHandler : public dmlc::Stream { setter(0, rv); this->ReturnPackedSeq(TVMArgs(&ret_value, &ret_tcode, 1)); - } catch (const std::runtime_error& e) { + } catch (const std::exception& e) { this->ReturnException(e.what()); } this->SwitchToState(kRecvPacketNumBytes); @@ -719,7 +719,7 @@ void RPCEndpoint::Shutdown() { writer_.bytes_available()); if (n == 0) break; } - } catch (const dmlc::Error& e) { + } catch (const Error& e) { } channel_.reset(nullptr); } diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc index 34691415c1a47..46e1be7945204 100644 --- a/src/runtime/rpc/rpc_module.cc +++ b/src/runtime/rpc/rpc_module.cc @@ -130,7 +130,7 @@ class RPCWrappedFunc : public Object { ~RPCWrappedFunc() { try { sess_->FreeHandle(handle_, kTVMPackedFuncHandle); - } catch (const dmlc::Error& e) { + } catch (const Error& e) { // fault tolerance to remote close } } @@ -165,7 +165,7 @@ class RPCModuleNode final : public ModuleNode { if (module_handle_ != nullptr) { try { sess_->FreeHandle(module_handle_, kTVMModuleHandle); - } catch (const dmlc::Error& e) { + } catch (const Error& e) { // fault tolerance to remote close } module_handle_ = nullptr; diff --git a/src/runtime/rpc/rpc_session.cc b/src/runtime/rpc/rpc_session.cc index 0ac5b8dc74ef6..2b75018099d5f 100644 --- a/src/runtime/rpc/rpc_session.cc +++ b/src/runtime/rpc/rpc_session.cc @@ 
-46,7 +46,7 @@ void RPCSession::AsyncCallFunc(PackedFuncHandle func, const TVMValue* arg_values try { this->CallFunc(func, arg_values, arg_type_codes, num_args, [&callback](TVMArgs args) { callback(RPCCode::kReturn, args); }); - } catch (const std::runtime_error& e) { + } catch (const std::exception& e) { this->SendException(callback, e.what()); } } @@ -60,7 +60,7 @@ void RPCSession::AsyncCopyToRemote(void* local_from_bytes, DLTensor* remote_to, try { this->CopyToRemote(local_from_bytes, remote_to, nbytes); callback(RPCCode::kReturn, TVMArgs(&value, &tcode, 1)); - } catch (const std::runtime_error& e) { + } catch (const std::exception& e) { this->SendException(callback, e.what()); } } @@ -74,7 +74,7 @@ void RPCSession::AsyncCopyFromRemote(DLTensor* remote_from, void* local_to_bytes try { this->CopyFromRemote(remote_from, local_to_bytes, nbytes); callback(RPCCode::kReturn, TVMArgs(&value, &tcode, 1)); - } catch (const std::runtime_error& e) { + } catch (const std::exception& e) { this->SendException(callback, e.what()); } } @@ -88,7 +88,7 @@ void RPCSession::AsyncStreamWait(TVMContext ctx, TVMStreamHandle stream, try { this->GetDeviceAPI(ctx)->StreamSync(ctx, stream); callback(RPCCode::kReturn, TVMArgs(&value, &tcode, 1)); - } catch (const std::runtime_error& e) { + } catch (const std::exception& e) { this->SendException(callback, e.what()); } } diff --git a/src/runtime/runtime_base.h b/src/runtime/runtime_base.h index 21601df1ad395..7abb32935a2bb 100644 --- a/src/runtime/runtime_base.h +++ b/src/runtime/runtime_base.h @@ -34,7 +34,7 @@ and finishes with API_END() or API_END_HANDLE_ERROR */ #define API_END() \ } \ - catch (std::runtime_error & _except_) { \ + catch (std::exception & _except_) { \ return TVMAPIHandleException(_except_); \ } \ return 0; // NOLINT(*) @@ -45,7 +45,7 @@ */ #define API_END_HANDLE_ERROR(Finalize) \ } \ - catch (std::runtime_error & _except_) { \ + catch (std::exception & _except_) { \ Finalize; \ return TVMAPIHandleException(_except_); \ } \ @@ -56,6 +56,6 @@ * \param e the exception * \return the return value of API after exception is handled */ -int TVMAPIHandleException(const std::runtime_error& e); +int TVMAPIHandleException(const std::exception& e); #endif // TVM_RUNTIME_RUNTIME_BASE_H_ diff --git a/src/runtime/thread_pool.cc b/src/runtime/thread_pool.cc index 5f5a811c2d30e..cab04ec0db4ac 100644 --- a/src/runtime/thread_pool.cc +++ b/src/runtime/thread_pool.cc @@ -24,10 +24,10 @@ #include #include #include +#include #include #include #include -#include #if TVM_THREADPOOL_USE_OPENMP #include #endif diff --git a/src/runtime/threading_backend.cc b/src/runtime/threading_backend.cc index 2527f47990860..7f9cfaa8730c2 100644 --- a/src/runtime/threading_backend.cc +++ b/src/runtime/threading_backend.cc @@ -21,8 +21,8 @@ * \file threading_backend.cc * \brief Native threading backend */ +#include #include -#include #include #include diff --git a/src/runtime/vm/bytecode.cc b/src/runtime/vm/bytecode.cc index f82d708468f71..09b928fa1e392 100644 --- a/src/runtime/vm/bytecode.cc +++ b/src/runtime/vm/bytecode.cc @@ -22,8 +22,8 @@ * \brief The bytecode for Relay virtual machine. 
*/ +#include #include -#include #include diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc index 6d121aa677335..4683398b01d45 100644 --- a/src/runtime/vm/vm.cc +++ b/src/runtime/vm/vm.cc @@ -24,10 +24,10 @@ #include #include +#include #include #include #include -#include #include #include diff --git a/src/runtime/vulkan/vulkan_common.h b/src/runtime/vulkan/vulkan_common.h index 9cd1f257f0911..3083ba6f9ce4d 100644 --- a/src/runtime/vulkan/vulkan_common.h +++ b/src/runtime/vulkan/vulkan_common.h @@ -22,8 +22,8 @@ #include #include +#include #include -#include #include #include diff --git a/src/runtime/vulkan/vulkan_shader.h b/src/runtime/vulkan/vulkan_shader.h index c9fbb13e938d5..513e3bccc36e6 100644 --- a/src/runtime/vulkan/vulkan_shader.h +++ b/src/runtime/vulkan/vulkan_shader.h @@ -22,8 +22,8 @@ #include #include +#include #include -#include #include diff --git a/src/support/base64.h b/src/support/base64.h index 901922db8edc3..3aac9920a0754 100644 --- a/src/support/base64.h +++ b/src/support/base64.h @@ -26,7 +26,7 @@ #ifndef TVM_SUPPORT_BASE64_H_ #define TVM_SUPPORT_BASE64_H_ -#include +#include #include #include diff --git a/src/support/parallel_for.cc b/src/support/parallel_for.cc index f4756c29adebd..4ced0df6ddf3d 100644 --- a/src/support/parallel_for.cc +++ b/src/support/parallel_for.cc @@ -21,7 +21,7 @@ * \file parallel_for.cc * \brief An implementation to run loop in parallel. */ -#include +#include #include #include diff --git a/src/support/pipe.h b/src/support/pipe.h index 3c1356ba174c8..a2803638e1f32 100644 --- a/src/support/pipe.h +++ b/src/support/pipe.h @@ -25,7 +25,7 @@ #define TVM_SUPPORT_PIPE_H_ #include -#include +#include #ifdef _WIN32 #include diff --git a/src/support/socket.h b/src/support/socket.h index 16fba6b58e3d9..11060ae8aae15 100644 --- a/src/support/socket.h +++ b/src/support/socket.h @@ -49,7 +49,7 @@ using ssize_t = int; #include #include #endif -#include +#include #include #include diff --git a/src/target/llvm/llvm_common.cc b/src/target/llvm/llvm_common.cc index 35bfc8dc2e5b2..61dd7024ff055 100644 --- a/src/target/llvm/llvm_common.cc +++ b/src/target/llvm/llvm_common.cc @@ -24,7 +24,7 @@ #include "llvm_common.h" -#include +#include #include #include diff --git a/src/target/target.cc b/src/target/target.cc index b5ca4c38bbb92..55ef5f1a4e24f 100644 --- a/src/target/target.cc +++ b/src/target/target.cc @@ -79,7 +79,7 @@ static const TObj* ObjTypeCheck(const ObjectRef& obj, const std::string& expecte std::ostringstream os; os << ": Expects type \"" << expected_type << "\", but gets \"" << obj->GetTypeKey() << "\" for object: " << obj; - throw dmlc::Error(os.str()); + throw Error(os.str()); } return ptr; } @@ -87,7 +87,7 @@ static const TObj* ObjTypeCheck(const ObjectRef& obj, const std::string& expecte static TargetKind GetTargetKind(const String& name) { Optional kind = TargetKind::Get(name); if (!kind.defined()) { - throw dmlc::Error(": Target kind \"" + name + "\" is not defined"); + throw Error(": Target kind \"" + name + "\" is not defined"); } return kind.value(); } @@ -98,10 +98,10 @@ static std::string RemovePrefixDashes(const std::string& s) { for (; n_dashes < len && s[n_dashes] == '-'; ++n_dashes) { } if (n_dashes == 0) { - throw dmlc::Error(": Attribute keys should start with '-', not an attribute key: " + s); + throw Error(": Attribute keys should start with '-', not an attribute key: " + s); } if (n_dashes >= len) { - throw dmlc::Error(": Not an attribute key: " + s); + throw Error(": Not an attribute key: " + s); } return 
s.substr(n_dashes); } @@ -133,7 +133,7 @@ static int ParseKVPair(const std::string& s, const std::string& s_next, std::str result_k = s.substr(0, pos); result_v = s.substr(pos + 1); if (result_k.empty() || result_v.empty()) { - throw dmlc::Error(": Empty attribute key or value in \"" + s + "\""); + throw Error(": Empty attribute key or value in \"" + s + "\""); } return 1; } else if (!s_next.empty() && s_next[0] != '-') { @@ -163,7 +163,7 @@ const TargetKindNode::ValueTypeInfo& TargetInternal::FindTypeInfo(const TargetKi } os << kv.first; } - throw dmlc::Error(os.str()); + throw Error(os.str()); } return it->second; } @@ -177,14 +177,14 @@ ObjectRef TargetInternal::ParseType(const std::string& str, // Parsing integer int v; if (!(is >> v)) { - throw dmlc::Error(": Cannot parse into type \"Integer\" from string: " + str); + throw Error(": Cannot parse into type \"Integer\" from string: " + str); } return Integer(v); } else if (info.type_index == String::ContainerType::_GetOrAllocRuntimeTypeIndex()) { // Parsing string std::string v; if (!(is >> v)) { - throw dmlc::Error(": Cannot parse into type \"String\" from string: " + str); + throw Error(": Cannot parse into type \"String\" from string: " + str); } return String(v); } else if (info.type_index == Target::ContainerType::_GetOrAllocRuntimeTypeIndex()) { @@ -197,14 +197,14 @@ ObjectRef TargetInternal::ParseType(const std::string& str, try { ObjectRef parsed = TargetInternal::ParseType(substr, *info.key); result.push_back(parsed); - } catch (const dmlc::Error& e) { + } catch (const Error& e) { std::string index = "[" + std::to_string(result.size()) + "]"; - throw dmlc::Error(index + e.what()); + throw Error(index + e.what()); } } return Array(result); } - throw dmlc::Error(": Unsupported type \"" + info.type_key + "\" for parsing from string: " + str); + throw Error(": Unsupported type \"" + info.type_key + "\" for parsing from string: " + str); } ObjectRef TargetInternal::ParseType(const ObjectRef& obj, @@ -224,15 +224,14 @@ ObjectRef TargetInternal::ParseType(const ObjectRef& obj, } else if (const auto* ptr = obj.as()) { for (const auto& kv : *ptr) { if (!kv.first->IsInstance()) { - throw dmlc::Error(": Target object requires key of dict to be str, but get: " + - kv.first->GetTypeKey()); + throw Error(": Target object requires key of dict to be str, but get: " + + kv.first->GetTypeKey()); } } Map config = GetRef>(ptr); return Target(TargetInternal::FromConfig({config.begin(), config.end()})); } - throw dmlc::Error(": Expect type 'dict' or 'str' to construct Target, but get: " + - obj->GetTypeKey()); + throw Error(": Expect type 'dict' or 'str' to construct Target, but get: " + obj->GetTypeKey()); } else if (info.type_index == ArrayNode::_GetOrAllocRuntimeTypeIndex()) { // Parsing array const auto* array = ObjTypeCheck(obj, "Array"); @@ -240,9 +239,9 @@ ObjectRef TargetInternal::ParseType(const ObjectRef& obj, for (const ObjectRef& e : *array) { try { result.push_back(TargetInternal::ParseType(e, *info.key)); - } catch (const dmlc::Error& e) { + } catch (const Error& e) { std::string index = '[' + std::to_string(result.size()) + ']'; - throw dmlc::Error(index + e.what()); + throw Error(index + e.what()); } } return Array(result); @@ -254,17 +253,17 @@ ObjectRef TargetInternal::ParseType(const ObjectRef& obj, ObjectRef key, val; try { key = TargetInternal::ParseType(kv.first, *info.key); - } catch (const dmlc::Error& e) { + } catch (const Error& e) { std::ostringstream os; os << "'s key \"" << key << "\"" << e.what(); - throw 
dmlc::Error(os.str()); + throw Error(os.str()); } try { val = TargetInternal::ParseType(kv.second, *info.val); - } catch (const dmlc::Error& e) { + } catch (const Error& e) { std::ostringstream os; os << "[\"" << key << "\"]" << e.what(); - throw dmlc::Error(os.str()); + throw Error(os.str()); } result[key] = val; } @@ -275,7 +274,7 @@ ObjectRef TargetInternal::ParseType(const ObjectRef& obj, os << ": Parsing type \"" << info.type_key << "\" is not supported for the given object of type \"" << obj->GetTypeKey() << "\". The object is: " << obj; - throw dmlc::Error(os.str()); + throw Error(os.str()); } return obj; } @@ -355,7 +354,7 @@ Target::Target(const String& tag_or_config_or_target_str) { ObjectPtr target; try { target = TargetInternal::FromString(tag_or_config_or_target_str); - } catch (const dmlc::Error& e) { + } catch (const Error& e) { LOG(FATAL) << "ValueError" << e.what() << ". Target creation from string failed: " << tag_or_config_or_target_str; } @@ -366,7 +365,7 @@ Target::Target(const Map& config) { ObjectPtr target; try { target = TargetInternal::FromConfig({config.begin(), config.end()}); - } catch (const dmlc::Error& e) { + } catch (const Error& e) { LOG(FATAL) << "ValueError" << e.what() << ". Target creation from config dict failed: " << config; } @@ -496,7 +495,7 @@ ObjectPtr TargetInternal::FromConfigString(const String& config_str) { "if the python module is properly loaded"; Optional> config = (*loader)(config_str); if (!config.defined()) { - throw dmlc::Error(": Cannot load config dict with python JSON loader"); + throw Error(": Cannot load config dict with python JSON loader"); } return TargetInternal::FromConfig({config.value().begin(), config.value().end()}); } @@ -514,7 +513,7 @@ ObjectPtr TargetInternal::FromRawString(const String& target_str) { } } if (name.empty()) { - throw dmlc::Error(": Cannot parse empty target string"); + throw Error(": Cannot parse empty target string"); } // Create the target config std::unordered_map config = {{"kind", String(name)}}; @@ -525,17 +524,17 @@ ObjectPtr TargetInternal::FromRawString(const String& target_str) { // Parse key-value pair std::string s_next = (iter + 1 < options.size()) ? 
options[iter + 1] : ""; iter += ParseKVPair(RemovePrefixDashes(options[iter]), s_next, &key, &value); - } catch (const dmlc::Error& e) { - throw dmlc::Error(": Error when parsing target" + std::string(e.what())); + } catch (const Error& e) { + throw Error(": Error when parsing target" + std::string(e.what())); } try { // check if `key` has been used if (config.count(key)) { - throw dmlc::Error(": The key \"" + key + "\" appears more than once"); + throw Error(": The key \"" + key + "\" appears more than once"); } config[key] = TargetInternal::ParseType(value, TargetInternal::FindTypeInfo(kind, key)); - } catch (const dmlc::Error& e) { - throw dmlc::Error(": Error when parsing target[\"" + key + "\"]" + e.what()); + } catch (const Error& e) { + throw Error(": Error when parsing target[\"" + key + "\"]" + e.what()); } } return TargetInternal::FromConfig(config); @@ -554,11 +553,11 @@ ObjectPtr TargetInternal::FromConfig(std::unordered_mapkind = GetTargetKind(GetRef(kind)); config.erase(kKind); } else { - throw dmlc::Error(": Expect type of field \"kind\" is String, but get type: " + - config[kKind]->GetTypeKey()); + throw Error(": Expect type of field \"kind\" is String, but get type: " + + config[kKind]->GetTypeKey()); } } else { - throw dmlc::Error(": Field \"kind\" is not found"); + throw Error(": Field \"kind\" is not found"); } // parse "tag" if (config.count(kTag)) { @@ -566,8 +565,8 @@ ObjectPtr TargetInternal::FromConfig(std::unordered_maptag = GetRef(tag); config.erase(kTag); } else { - throw dmlc::Error(": Expect type of field \"tag\" is String, but get type: " + - config[kTag]->GetTypeKey()); + throw Error(": Expect type of field \"tag\" is String, but get type: " + + config[kTag]->GetTypeKey()); } } else { target->tag = ""; @@ -582,15 +581,15 @@ ObjectPtr TargetInternal::FromConfig(std::unordered_map()) { keys.push_back(GetRef(key)); } else { - throw dmlc::Error( + throw Error( ": Expect 'keys' to be an array of strings, but it " "contains an element of type: " + e->GetTypeKey()); } } } else { - throw dmlc::Error(": Expect type of field \"keys\" is Array, but get type: " + - config[kKeys]->GetTypeKey()); + throw Error(": Expect type of field \"keys\" is Array, but get type: " + + config[kKeys]->GetTypeKey()); } } // add device name @@ -615,8 +614,8 @@ ObjectPtr TargetInternal::FromConfig(std::unordered_mapkind, key); attrs[key] = TargetInternal::ParseType(value, info); - } catch (const dmlc::Error& e) { - throw dmlc::Error(": Error when parsing target[\"" + key + "\"]" + e.what()); + } catch (const Error& e) { + throw Error(": Error when parsing target[\"" + key + "\"]" + e.what()); } } // parse host diff --git a/tests/cpp/ir_functor_test.cc b/tests/cpp/ir_functor_test.cc index 1f7d18f747eac..9e8595d6809ce 100644 --- a/tests/cpp/ir_functor_test.cc +++ b/tests/cpp/ir_functor_test.cc @@ -125,7 +125,7 @@ TEST(IRF, ExprTransform) { try { f(z - 1, 2); LOG(FATAL) << "should fail"; - } catch (dmlc::Error&) { + } catch (Error&) { } } diff --git a/tests/cpp/parallel_for_test.cc b/tests/cpp/parallel_for_test.cc index bf5fe94b83ffe..a4549344bd11f 100644 --- a/tests/cpp/parallel_for_test.cc +++ b/tests/cpp/parallel_for_test.cc @@ -19,7 +19,7 @@ #include #include -#include +#include #include #include diff --git a/tests/lint/check_file_type.py b/tests/lint/check_file_type.py index ab51b6c79c83b..f5c0de0a50b02 100644 --- a/tests/lint/check_file_type.py +++ b/tests/lint/check_file_type.py @@ -131,6 +131,8 @@ # microTVM Virtual Machines "apps/microtvm/reference-vm/zephyr/Vagrantfile", 
"apps/microtvm/reference-vm/zephyr/base-box/Vagrantfile.packer-template", + # patch file for libbacktrace + "cmake/modules/libbacktrace_macos.patch", } diff --git a/tests/python/relay/test_ir_parser.py b/tests/python/relay/test_ir_parser.py index 62e52abefeb4a..8b6b39e3df157 100644 --- a/tests/python/relay/test_ir_parser.py +++ b/tests/python/relay/test_ir_parser.py @@ -827,8 +827,8 @@ def test_import_grad(): mod.import_from_std("gradient.rly") -def test_resnet(): - mod, _ = relay.testing.resnet.get_workload() +def test_mlp(): + mod, _ = relay.testing.mlp.get_workload(1) text = mod.astext() parsed_mod = tvm.parser.parse(text) tvm.ir.assert_structural_equal(mod, parsed_mod) @@ -850,8 +850,8 @@ def inline_params(mod, params): return mod -def test_resnet_inlined_params(): - mod, params = relay.testing.resnet.get_workload() +def test_mlp_inlined_params(): + mod, params = relay.testing.mlp.get_workload(1) mod = inline_params(mod, params) mod = relay.transform.InferType()(mod) text = mod.astext() diff --git a/tests/python/relay/test_ir_text_printer.py b/tests/python/relay/test_ir_text_printer.py index 72a243dbbb67a..b2ae28649e6a4 100644 --- a/tests/python/relay/test_ir_text_printer.py +++ b/tests/python/relay/test_ir_text_printer.py @@ -181,11 +181,6 @@ def test_squeezenet(): astext(net) -def test_vgg(): - net, _ = tvm.relay.testing.vgg.get_workload(batch_size=1) - astext(net) - - def test_densenet(): net, _ = tvm.relay.testing.densenet.get_workload(batch_size=1) astext(net) diff --git a/tests/scripts/task_build.sh b/tests/scripts/task_build.sh index d8e35ebd4de3b..845b7153ae200 100755 --- a/tests/scripts/task_build.sh +++ b/tests/scripts/task_build.sh @@ -16,4 +16,4 @@ # specific language governing permissions and limitations # under the License. export VTA_HW_PATH=`pwd`/3rdparty/vta-hw -cd $1 && cmake .. && make $2 && cd .. +cd $1 && cmake .. -DCMAKE_BUILD_TYPE=RelWithDebInfo && make $2 && cd .. diff --git a/tutorials/auto_scheduler/tune_network_cuda.py b/tutorials/auto_scheduler/tune_network_cuda.py index 5ed3ceef5ba0c..bc88457f94f90 100644 --- a/tutorials/auto_scheduler/tune_network_cuda.py +++ b/tutorials/auto_scheduler/tune_network_cuda.py @@ -252,7 +252,7 @@ def run_tuning(): # The last line also prints the total number of measurement trials, # total time spent on auto-tuning and the id of the next task to tune. # -# There will also be some "dmlc::Error"s and CUDA errors, because the +# There will also be some "tvm::Error"s and CUDA errors, because the # auto-scheduler will try some invalid schedules. # You can safely ignore them if the tuning can continue, because these # errors are isolated from the main process. diff --git a/tutorials/auto_scheduler/tune_network_mali.py b/tutorials/auto_scheduler/tune_network_mali.py index ca1067b27c807..2bce968771e32 100644 --- a/tutorials/auto_scheduler/tune_network_mali.py +++ b/tutorials/auto_scheduler/tune_network_mali.py @@ -329,7 +329,7 @@ def tune_and_evaluate(): # The last line also prints the total number of measurement trials, # total time spent on auto-tuning and the id of the next task to tune. # -# There will also be some "dmlc::Error"s errors, because the +# There will also be some "tvm::Error"s errors, because the # auto-scheduler will try some invalid schedules. # You can safely ignore them if the tuning can continue, because these # errors are isolated from the main process. 
diff --git a/tutorials/auto_scheduler/tune_network_x86.py b/tutorials/auto_scheduler/tune_network_x86.py index 8526abbbe6caa..2b47c64729e0f 100644 --- a/tutorials/auto_scheduler/tune_network_x86.py +++ b/tutorials/auto_scheduler/tune_network_x86.py @@ -251,7 +251,7 @@ def run_tuning(): # The last line also prints the total number of measurement trials, # total time spent on auto-tuning and the id of the next task to tune. # -# There will also be some "dmlc::Error"s errors, because the +# There will also be some "tvm::Error"s, because the # auto-scheduler will try some invalid schedules. # You can safely ignore them if the tuning can continue, because these # errors are isolated from the main process. diff --git a/web/emcc/tvmjs_support.cc b/web/emcc/tvmjs_support.cc index b72caad1e3dfa..12f930f491a59 100644 --- a/web/emcc/tvmjs_support.cc +++ b/web/emcc/tvmjs_support.cc @@ -25,11 +25,9 @@ */ // configurations for the dmlc log. -#define DMLC_LOG_CUSTOMIZE 0 -#define DMLC_LOG_STACK_TRACE 0 -#define DMLC_LOG_DEBUG 0 -#define DMLC_LOG_NODATE 1 -#define DMLC_LOG_FATAL_THROW 0 +#define TVM_LOG_DEBUG 0 +#define DMLC_USE_LOGGING_LIBRARY +#define TVM_BACKTRACE_DISABLED 1 #include #include diff --git a/web/emcc/wasm_runtime.cc b/web/emcc/wasm_runtime.cc index 214c1883f8742..0b14ef6476d20 100644 --- a/web/emcc/wasm_runtime.cc +++ b/web/emcc/wasm_runtime.cc @@ -23,14 +23,12 @@ */ // configurations for the dmlc log. -#define DMLC_LOG_CUSTOMIZE 0 -#define DMLC_LOG_STACK_TRACE 0 -#define DMLC_LOG_DEBUG 0 -#define DMLC_LOG_NODATE 1 -#define DMLC_LOG_FATAL_THROW 0 +#define TVM_LOG_DEBUG 0 +#define DMLC_USE_LOGGING_LIBRARY +#define TVM_BACKTRACE_DISABLED 1 -#include #include +#include #include "src/runtime/c_runtime_api.cc" #include "src/runtime/cpu_device_api.cc" diff --git a/web/emcc/webgpu_runtime.cc b/web/emcc/webgpu_runtime.cc index 62b87af01774e..01e42ef3faa85 100644 --- a/web/emcc/webgpu_runtime.cc +++ b/web/emcc/webgpu_runtime.cc @@ -22,12 +22,10 @@ * \brief WebGPU runtime based on the TVM JS. */ -// configurations for the dmlc log. -#define DMLC_LOG_CUSTOMIZE 0 -#define DMLC_LOG_STACK_TRACE 0 -#define DMLC_LOG_DEBUG 0 -#define DMLC_LOG_NODATE 1 -#define DMLC_LOG_FATAL_THROW 0 +// configurations for tvm logging. +#define TVM_LOG_DEBUG 0 +#define DMLC_USE_LOGGING_LIBRARY +#define TVM_BACKTRACE_DISABLED 1 #include #include @@ -35,12 +33,27 @@ #include #include +#include +#include + #include "../../src/runtime/meta_data.h" #include "../../src/runtime/vulkan/vulkan_shader.h" #include "../../src/runtime/workspace_pool.h" namespace tvm { namespace runtime { +namespace detail { +// Override logging mechanism +void LogFatalImpl(const std::string& file, int lineno, const std::string& message) { + std::cerr << file << ":" << lineno << ": " << message << std::endl; + abort(); +} + +void LogMessageImpl(const std::string& file, int lineno, const std::string& message) { + std::cerr << file << ":" << lineno << ": " << message << std::endl; +} + +} // namespace detail /*!
\brief Thread local workspace */ class WebGPUThreadEntry { From 45442edcf4d1e9d3e8a37b4530d8f634a6f0cc05 Mon Sep 17 00:00:00 2001 From: Altan Haan Date: Thu, 18 Mar 2021 07:04:48 -0700 Subject: [PATCH 39/84] [Relay][Training][Pass] Factor out first-order AD to a module pass (#7677) --- python/tvm/relay/transform/transform.py | 26 +- src/relay/transforms/first_order_gradient.cc | 309 ++++++++++++++++++ src/relay/transforms/gradient.h | 54 +++ .../{gradient.cc => higher_order_gradient.cc} | 274 +--------------- 4 files changed, 391 insertions(+), 272 deletions(-) create mode 100644 src/relay/transforms/first_order_gradient.cc create mode 100644 src/relay/transforms/gradient.h rename src/relay/transforms/{gradient.cc => higher_order_gradient.cc} (64%) diff --git a/python/tvm/relay/transform/transform.py b/python/tvm/relay/transform/transform.py index b61f209505d86..5b0e480f5f287 100644 --- a/python/tvm/relay/transform/transform.py +++ b/python/tvm/relay/transform/transform.py @@ -800,12 +800,36 @@ def gradient(expr, mod=None, mode="higher_order"): The transformed expression. """ if mode == "first_order": - return _ffi_api.first_order_gradient(expr, mod) + warnings.warn( + "using transform.gradient for first-order AD is deprecated, please use the " + "FirstOrderGradient module pass", + DeprecationWarning, + ) + if mod is not None: + raise RuntimeError( + "to run first-order AD on a module, please use the FirstOrderGradient module pass." + ) + return FirstOrderGradient()(tvm.IRModule.from_expr(expr))["main"] if mode == "higher_order": return _ffi_api.gradient(expr, mod) raise Exception("unknown mode") +def FirstOrderGradient(): + """ + Transforms all global functions in the module to return the original result, paired with the + gradients of the inputs. This pass transforms each global function independently and does not + support interprocedural AD. Additionally, this pass does not support any control-flow or + references, and should only be used on pure data-flow graphs. + + Returns + ------- + ret : tvm.transform.Pass + The registered FirstOrderGradient pass. + """ + return _ffi_api.FirstOrderGradient() + + def Defunctionalization(func, mod): """ Performs defunctionalization on func, diff --git a/src/relay/transforms/first_order_gradient.cc b/src/relay/transforms/first_order_gradient.cc new file mode 100644 index 0000000000000..55714592ded75 --- /dev/null +++ b/src/relay/transforms/first_order_gradient.cc @@ -0,0 +1,309 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file first_order_gradient.cc + * \brief First-order Automatic Differentiation in Relay for pure dataflow graphs.
+ */ +#include +#include +#include +#include +#include +#include +#include + +#include "gradient.h" +#include "let_list.h" +#include "pass_utils.h" +#include "pattern_utils.h" + +namespace tvm { +namespace relay { + +template +Expr MultiFactory(const Type& t, F factory, DiagnosticContext diag_ctx) { + if (auto* tt = t.as()) { + return factory(tt->shape, tt->dtype); + } else if (auto* tt = t.as()) { + std::vector res; + for (size_t i = 0; i < tt->fields.size(); i++) { + res.push_back(MultiFactory(tt->fields[i], factory, diag_ctx)); + } + return Tuple(res); + } else { + diag_ctx.EmitFatal(Diagnostic::Error(t->span) + << "could not build tensors using factory for type " << PrettyPrint(t)); + throw; + } +} + +template +Expr MultiFactoryLike(const Expr& e, const Type& t, F factory, F2 factory_like, + DiagnosticContext diag_ctx) { + if (t.as()) { + return factory_like(e); + } else if (auto* tt = t.as()) { + return MultiFactory(t, factory, diag_ctx); + } else { + diag_ctx.EmitFatal(Diagnostic::Error(t->span) + << "could not build tensors using factory for type " << PrettyPrint(t)); + throw; + } +} + +/*! \brief A fragment of the program being built by the automatic differentiation + * pass. + */ +struct ADValueNode { + virtual ~ADValueNode() {} + template + T& get() { + auto ret = dynamic_cast(this); + ICHECK(ret) << "cannot downcast"; + return *ret; + } +}; + +using ADValue = std::shared_ptr; + +/*! \brief AD over a program which generates a tensor output. */ +struct ADTensor : ADValueNode { + Expr forward; + mutable Expr reverse; // must be a variable to avoid duplication + ADTensor(LetList* ll, const Expr& forward, DiagnosticContext diag_ctx) + : forward(ll->Push(forward)), + reverse(ll->Push( + MultiFactoryLike(this->forward, forward->checked_type(), Zeros, ZerosLike, diag_ctx))) { + this->forward->checked_type_ = forward->checked_type(); + } +}; + +/*! \brief A staged representation of the program, we reflect + * Relay functions into a function over fragments of AD. We + * can compute away this function to obtain a reverse mode program.
+ */ +struct ADFunction : ADValueNode { + // (ad_args, orig) -> ad_ret + using ADFunctionType = ADValue(const std::vector&, const Call&); + std::function func; + explicit ADFunction(const std::function& func) : func(func) {} +}; + +struct FirstOrderReverseAD : ExprFunctor { + const OpAttrMap rev_map = Op::GetAttrMap("FPrimalGradient"); + std::vector> backprop_actions; + // we assume no closure so no need for lexical scoping + std::unordered_map env; + LetList* ll; + DiagnosticContext diag_ctx; + + FirstOrderReverseAD(LetList* ll, DiagnosticContext diag_ctx) : ll(ll), diag_ctx(diag_ctx) {} + + ADValue VisitExpr(const Expr& n) final { + if (env.count(n)) { + return env.at(n); + } + auto ret = ExprFunctor::VisitExpr(n); + env[n] = ret; + return ret; + } + + static Expr LiftedAdd(const Type& t, const Expr& x, const Expr& y, LetList* ll) { + if (t.as()) { + return ll->Push(Add(x, y)); + } else if (auto* tt = t.as()) { + Array fields; + for (size_t i = 0; i < tt->fields.size(); ++i) { + fields.push_back( + LiftedAdd(tt->fields[i], ll->Push(GetField(x, i)), ll->Push(GetField(y, i)), ll)); + } + return ll->Push(Tuple(fields)); + } else { + LOG(FATAL) << "cannot lift addition for type " << PrettyPrint(t); + throw; + } + } + + ADValue VisitExpr_(const OpNode* op) final { + Op op_ref = GetRef(op); + if (!rev_map.count(op_ref)) { + diag_ctx.EmitFatal(Diagnostic::Error(op->span) + << "the operator " << op->name << " does not have a registered gradient."); + } + return std::make_shared([this, op_ref](const std::vector& ad_args, + const Call& orig) { + std::vector orig_args; + for (const ADValue& adval : ad_args) { + orig_args.push_back(adval->get().forward); + } + auto orig_new = Call(op_ref, orig_args, orig->attrs, orig->type_args); + orig_new->checked_type_ = orig->checked_type(); + auto ret = std::make_shared(ll, orig_new, diag_ctx); + backprop_actions.push_back([this, ad_args, orig_new, ret, op_ref](LetList* ll) { + tvm::Array rev = rev_map[op_ref](orig_new, ret->reverse); + if (ad_args.size() != rev.size()) { + diag_ctx.EmitFatal(Diagnostic::Error(op_ref->span) + << "arity mismatch for operator " << op_ref->name + << " and its registered gradient: expected " << ad_args.size() + << " but got " << rev.size() << " gradients."); + } + for (size_t i = 0; i < ad_args.size(); ++i) { + auto& ad_arg = ad_args[i]->get(); + ad_arg.reverse = LiftedAdd(ad_arg.forward->checked_type(), ad_arg.reverse, rev[i], ll); + } + }); + return ret; + }); + } + + ADValue VisitExpr_(const TupleGetItemNode* op) final { + Expr e = GetRef(op); + ADValue tup = VisitExpr(op->tuple); + auto tt = op->tuple->checked_type().as(); + size_t idx = op->index; + auto ret = std::make_shared(ll, e, diag_ctx); + backprop_actions.push_back([tup, tt, idx, ret](LetList* ll) { + auto& ad_tup = tup->get(); + std::vector updated_grads; + for (size_t i = 0; i < tt->fields.size(); ++i) { + Expr grad_pre = GetField(ad_tup.reverse, i); + updated_grads.push_back(i != idx ? 
grad_pre + : LiftedAdd(tt->fields[i], grad_pre, ret->reverse, ll)); + } + ad_tup.reverse = ll->Push(Tuple(updated_grads)); + }); + return ret; + } + + ADValue VisitExpr_(const TupleNode* op) final { + Expr e = GetRef(op); + std::vector fields; + for (const auto& f : op->fields) { + fields.push_back(VisitExpr(f)); + } + auto tt = op->checked_type().as(); + auto ret = std::make_shared(ll, e, diag_ctx); + backprop_actions.push_back([fields, tt, ret](LetList* ll) { + for (size_t i = 0; i < fields.size(); ++i) { + auto& ad_field = fields[i]->get(); + ad_field.reverse = + LiftedAdd(tt->fields[i], ad_field.reverse, GetField(ret->reverse, i), ll); + } + }); + return ret; + } + + ADValue VisitExpr_(const ConstantNode* op) final { + Expr e = GetRef(op); + return std::make_shared(ll, e, diag_ctx); + } + + ADValue VisitExpr_(const CallNode* op) final { + ADValue f = VisitExpr(op->op); + std::vector args; + for (const auto& arg : op->args) { + args.push_back(VisitExpr(arg)); + } + return f->get().func(args, GetRef(op)); + } + + ADValue VisitExpr_(const FunctionNode* op) final { + Function f = GetRef(op); + // todo: assert no closure + return std::make_shared( + [this, f](const std::vector& ad_args, const Call& orig) { + ICHECK_EQ(f->params.size(), ad_args.size()); + for (size_t i = 0; i < f->params.size(); ++i) { + env[f->params[i]] = ad_args[i]; + } + return VisitExpr(f->body); + }); + } + + // Var will always be in env, handled in VisitExpr (without _), so we don't need + // to implement its VisitExpr_. +}; + +namespace transform { + +Pass FirstOrderGradient() { + runtime::TypedPackedFunc f = [](IRModule mod, PassContext ctx) { + CheckFeature( + mod, FeatureSet({fVar, fConstant, fTuple, fTupleGetItem, fFunction, fOp, fCall, fGraph})); + IRModule ad_mod = GetRef(mod.CopyOnWrite()); + DiagnosticContext diag_ctx = DiagnosticContext::Default(ad_mod); + + if (mod->functions.size() > 1) { + LOG(WARNING) << "IRModule contains multiple global functions: first-order AD will transform " + "them independently!"; + } + + for (const auto& pr : mod->functions) { + const FunctionNode* func = pr.second.as(); + if (!func) { + diag_ctx.Emit(Diagnostic::Warning(pr.second->span) + << "AD can only be performed on Relay functions, skipping " + << PrettyPrint(pr.first)); + } + if (func->type_params.size() > 0) { + diag_ctx.EmitFatal(Diagnostic::Error(pr.second->span) + << "first-order AD does not support polymorphism yet."); + } + Expr body = LetList::With([&](LetList* ll) { + FirstOrderReverseAD reverse_ad(ll, diag_ctx); + ADValue rev = reverse_ad(pr.second); + std::vector args; + for (const auto& p : func->params) { + args.push_back(std::make_shared(ll, p, diag_ctx)); + } + Call placeholder = Call(GetRef(func), {}); + placeholder->checked_type_ = func->checked_type().as()->ret_type; + auto grad_call = rev->get().func(args, placeholder); + auto& res = grad_call->get(); + Expr grad_tuple = LetList::With([&](LetList* ll) { + res.reverse = + MultiFactoryLike(res.forward, res.forward->checked_type(), Ones, OnesLike, diag_ctx); + for (auto it = reverse_ad.backprop_actions.rbegin(); + it != reverse_ad.backprop_actions.rend(); ++it) { + (*it)(ll); + } + std::vector grads; + for (const auto& a : args) { + grads.push_back(a->get().reverse); + } + return Tuple(grads); + }); + return Pair(res.forward, grad_tuple); + }); + ad_mod->Update(pr.first, + Function(func->params, body, GradRetType(GetRef(func)), {})); + } + + return ad_mod; + }; + return CreateModulePass(f, 0, "FirstOrderGradient", {}); +} + 
+TVM_REGISTER_GLOBAL("relay._transform.FirstOrderGradient").set_body_typed(FirstOrderGradient); + +} // namespace transform + +} // namespace relay +} // namespace tvm diff --git a/src/relay/transforms/gradient.h b/src/relay/transforms/gradient.h new file mode 100644 index 0000000000000..2e6ffbcc7c9e9 --- /dev/null +++ b/src/relay/transforms/gradient.h @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file gradient.h + * \brief Utility functions for Automatic Differentiation in Relay. + */ +#ifndef TVM_RELAY_TRANSFORMS_GRADIENT_H_ +#define TVM_RELAY_TRANSFORMS_GRADIENT_H_ + +#include +#include + +#include + +namespace tvm { +namespace relay { + +inline Type GradRetType(const Function& f) { + // if type annotations are provided, we will construct a ret type; + // otherwise, leave it to be inferred + if (!f->ret_type.defined()) { + return Type(); + } + std::vector vt; + for (const auto& p : f->params) { + if (!p->type_annotation.defined()) { + return Type(); + } + vt.push_back(p->type_annotation); + } + + return TupleType({f->ret_type, TupleType(vt)}); +} + +} // namespace relay +} // namespace tvm +#endif // TVM_RELAY_TRANSFORMS_GRADIENT_H_ diff --git a/src/relay/transforms/gradient.cc b/src/relay/transforms/higher_order_gradient.cc similarity index 64% rename from src/relay/transforms/gradient.cc rename to src/relay/transforms/higher_order_gradient.cc index cd3a996553417..202275626d5d1 100644 --- a/src/relay/transforms/gradient.cc +++ b/src/relay/transforms/higher_order_gradient.cc @@ -18,8 +18,8 @@ */ /*! - * \file gradient.cc - * \brief API for Automatic Differentiation for the Relay IR. + * \file higher_order_gradient.cc + * \brief Higher-order Automatic Differentiation in Relay IR, for non-graph programs. */ #include #include @@ -28,6 +28,7 @@ #include #include +#include "gradient.h" #include "let_list.h" #include "pass_utils.h" #include "pattern_utils.h" @@ -64,13 +65,6 @@ using namespace tvm::runtime; * output. There are multiple implementation of AD in relay, with different characteristic. However, * they all transform the input expr according to WithGradientType. */ -Type WithGradientType(const Type&); - -/*! return an expression that represent differentiation of e (according to WithGradientType). - * This version only work on first order code without control flow. - */ -Expr FirstOrderGradient(const Expr& e, const Optional& mod); - Type WithGradientType(const Type& t) { // TODO(@M.K.): stricter checking auto ty = t.as(); @@ -94,268 +88,6 @@ Expr DeGlobal(const Optional& mod, const Expr& e) { } } -/*! \brief A fragment of the program being built by the automatic differentation - * pass. 
- */ -struct ADValueNode { - virtual ~ADValueNode() {} - template - T& get() { - auto ret = dynamic_cast(this); - ICHECK(ret) << "cannot downcast"; - return *ret; - } -}; - -template -Expr MultiFactory(const Type& t, F factory) { - if (auto* tt = t.as()) { - return factory(tt->shape, tt->dtype); - } else if (auto* tt = t.as()) { - std::vector res; - for (size_t i = 0; i < tt->fields.size(); i++) { - res.push_back(MultiFactory(tt->fields[i], factory)); - } - return Tuple(res); - } else { - LOG(FATAL) << "unsupported type to create tensors of: " << tt; - throw; - } -} - -template -Expr MultiFactoryLike(const Expr& e, const Type& t, F factory, F2 factory_like) { - if (t.as()) { - return factory_like(e); - } else if (auto* tt = t.as()) { - return MultiFactory(t, factory); - } else { - LOG(FATAL) << "unsupported type to tensors of: " << tt; - throw; - } -} - -using ADValue = std::shared_ptr; - -/*! \brief AD over a program which generates a tensor output. */ -struct ADTensor : ADValueNode { - Expr forward; - mutable Expr reverse; // must be a variable to avoid duplication - ADTensor(LetList* ll, const Expr& forward) - : forward(ll->Push(forward)), - reverse( - ll->Push(MultiFactoryLike(this->forward, forward->checked_type(), Zeros, ZerosLike))) { - this->forward->checked_type_ = forward->checked_type(); - } -}; - -/*! \brief A staged representation of the program, we reflect - * Relay functions into a function over fragments of AD. We - * can compute away this function to obtain a reverse mode program. - */ -struct ADFunction : ADValueNode { - std::function&, const Attrs&, - const tvm::Array&)> - func; - explicit ADFunction(const std::function&, - const Attrs&, const tvm::Array&)>& func) - : func(func) {} -}; - -struct FirstOrderReverseAD : ExprFunctor { - using TBase = ExprFunctor; - const OpAttrMap rev_map = Op::GetAttrMap("FPrimalGradient"); - std::vector> backprop_actions; - // we assume no closure so no need for lexical scoping - std::unordered_map env; - LetList* ll; - - FirstOrderReverseAD(LetList* ll) : ll(ll) {} - - ADValue VisitExpr(const Expr& n) final { - if (env.count(n)) { - return env.at(n); - } - auto ret = TBase::VisitExpr(n); - env[n] = ret; - return ret; - } - - Expr UpdateGrad(const Type& t, const Expr& arg, const Expr& grad, LetList* ll) { - if (t.as()) { - return ll->Push(Add(arg, grad)); - } else if (auto* tt = t.as()) { - Array updates; - for (size_t i = 0; i < tt->fields.size(); ++i) { - updates.push_back(this->UpdateGrad(tt->fields[i], ll->Push(GetField(arg, i)), - ll->Push(GetField(grad, i)), ll)); - } - return ll->Push(Tuple(updates)); - } else { - LOG(FATAL) << "unsupported arg type of operator: " << t; - throw; - } - } - - ADValue VisitExpr_(const OpNode* op) final { - Op op_ref = GetRef(op); - ICHECK(rev_map.count(op_ref)) << op->name << " does not have reverse mode defined"; - return std::make_shared( - [this, op_ref](const Type& orig_type, const std::vector& args, const Attrs& attrs, - const tvm::Array& type_args) { - std::vector call_args; - for (const ADValue& adval : args) { - call_args.push_back(adval->get().forward); - } - auto orig = Call(op_ref, call_args, attrs, type_args); - orig->checked_type_ = orig_type; - auto ret = std::make_shared(ll, orig); - backprop_actions.push_back([this, args, orig, ret, op_ref](LetList* ll) { - tvm::Array rev = rev_map[op_ref](orig, ret->reverse); - ICHECK(args.size() == rev.size()); - for (size_t i = 0; i < args.size(); ++i) { - auto ad_arg = args[i]->get(); - auto ad_arg_type = ad_arg.forward->checked_type(); - 
args[i]->get().reverse = - this->UpdateGrad(ad_arg_type, ad_arg.reverse, rev[i], ll); - } - }); - return ret; - }); - } - - ADValue VisitExpr_(const TupleGetItemNode* op) final { - Expr e = GetRef(op); - ADValue tup = VisitExpr(op->tuple); - auto tt = op->tuple->checked_type().as(); - size_t size = tt->fields.size(); - size_t idx = op->index; - auto ret = std::make_shared(ll, e); - backprop_actions.push_back([tup, idx, size, ret](LetList* ll) { - auto rev = tup->get().reverse; - // special-case Tuple, to avoid long chains of GetItem/Tuple, - // but we might have functions using tuples, so we don't know - // that the reverse node is always a tuple - std::vector grfields; - if (auto tup_node = rev.as()) { - for (size_t i = 0; i < size; ++i) { - grfields.push_back(i != idx ? tup_node->fields[i] - : Add(tup_node->fields[i], ret->reverse)); - } - } else { - for (size_t i = 0; i < size; ++i) { - grfields.push_back(i != idx ? TupleGetItem(rev, i) - : Add(TupleGetItem(rev, i), ret->reverse)); - } - } - tup->get().reverse = ll->Push(Tuple(grfields)); - }); - return ret; - } - - ADValue VisitExpr_(const TupleNode* op) final { - Expr e = GetRef(op); - std::vector fields; - for (const auto& f : op->fields) { - fields.push_back(VisitExpr(f)); - } - auto ret = std::make_shared(ll, e); - backprop_actions.push_back([fields, ret](LetList* ll) { - for (size_t i = 0; i < fields.size(); ++i) { - fields[i]->get().reverse = - ll->Push(Add(fields[i]->get().reverse, TupleGetItem(ret->reverse, i))); - } - }); - return ret; - } - - ADValue VisitExpr_(const ConstantNode* op) final { - Expr e = GetRef(op); - return std::make_shared(ll, e); - } - - ADValue VisitExpr_(const CallNode* op) final { - ADValue f = VisitExpr(op->op); - std::vector args; - for (const auto& arg : op->args) { - args.push_back(VisitExpr(arg)); - } - return f->get().func(op->checked_type(), args, op->attrs, op->type_args); - } - - ADValue VisitExpr_(const FunctionNode* op) final { - Function f = GetRef(op); - // todo: assert no closure - return std::make_shared( - [this, f](const Type& orig_type, const std::vector& args, const Attrs& attrs, - const tvm::Array& type_args) { - ICHECK_EQ(f->params.size(), args.size()); - for (size_t i = 0; i < f->params.size(); ++i) { - env[f->params[i]] = args[i]; - } - return VisitExpr(f->body); - }); - } - - // Var will always be in env, handled in VisitExpr (without _), so we don't need - // to implement its VisitExpr_. -}; - -Type GradRetType(const Function& f) { - // if type annotations are provided, we will construct a ret type; - // otherwise, leave it to be inferred - if (!f->ret_type.defined()) { - return Type(); - } - std::vector vt; - for (const auto& p : f->params) { - if (!p->type_annotation.defined()) { - return Type(); - } - vt.push_back(p->type_annotation); - } - - return TupleType({f->ret_type, TupleType(vt)}); -} - -Expr FirstOrderGradient(const Expr& re, const Optional& mod) { - // Currently we first remove any global functions for the first - // order case. - auto e = DeGlobal(mod, re); - auto f = e.as(); - ICHECK(f) << "FOWithGradient expects its argument to be a function: " << f; - ICHECK(f->type_params.size() == 0) << "no polymorphism supported for now"; - - // We will then build a sequence of lets which implement reverse mode. 
- Expr body = LetList::With([&](LetList* ll) { - FirstOrderReverseAD reverse_ad(ll); - ADValue rev = reverse_ad(e); - std::vector args; - for (const auto& p : f->params) { - args.push_back(std::make_shared(ll, p)); - } - auto c = rev->get().func(f->checked_type(), args, Attrs(), {}); - const auto& res = c->get(); - Expr grad = LetList::With([&](LetList* ll) { - res.reverse = MultiFactoryLike(res.forward, res.forward->checked_type(), Ones, OnesLike); - for (auto it = reverse_ad.backprop_actions.rbegin(); it != reverse_ad.backprop_actions.rend(); - ++it) { - (*it)(ll); - } - std::vector grad_res; - for (const auto& a : args) { - grad_res.push_back(a->get().reverse); - } - return Tuple(grad_res); - }); - return Pair(res.forward, grad); - }); - - return Function(f->params, body, GradRetType(GetRef(f)), {}); -} - -TVM_REGISTER_GLOBAL("relay._transform.first_order_gradient").set_body_typed(FirstOrderGradient); - static Type bpt = RelayRefType(FuncType({}, TupleType(Array()), {}, {})); struct ReverseADType : TypeMutator { From 431a7d6c0b7e5ae71b411c500836b136322f9fbf Mon Sep 17 00:00:00 2001 From: Alexander Pivovarov Date: Thu, 18 Mar 2021 16:44:45 -0700 Subject: [PATCH 40/84] Default value for graph_runtime Init lookup_linked_param_func (#7676) --- src/runtime/graph/graph_runtime.cc | 5 +++-- src/runtime/graph/graph_runtime.h | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index 7e98acb6fb3ee..5c7b756961684 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -66,10 +66,11 @@ void GraphRuntime::Run() { * processor. * \param ctxs The context of the host and devices where graph nodes will be * executed on. - * \param lookup_linked_param_func Linked parameter lookup function. + * \param lookup_linked_param_func Linked parameter lookup function. Default is nullptr. */ void GraphRuntime::Init(const std::string& graph_json, tvm::runtime::Module module, - const std::vector& ctxs, PackedFunc lookup_linked_param_func) { + const std::vector& ctxs, + const PackedFunc lookup_linked_param_func) { std::istringstream is(graph_json); dmlc::JSONReader reader(&is); this->Load(&reader); diff --git a/src/runtime/graph/graph_runtime.h b/src/runtime/graph/graph_runtime.h index a1e2ee3b5d744..e417d2aa4bfcc 100644 --- a/src/runtime/graph/graph_runtime.h +++ b/src/runtime/graph/graph_runtime.h @@ -93,11 +93,12 @@ class TVM_DLL GraphRuntime : public ModuleNode { * executed on. * \param lookup_linked_param_func If given, a PackedFunc invoked to lookup linked parameters * by storage_id. If not given, linked parameters are looked-up using an internal implementation, - * which is not compatible with RPCModules. + * which is not compatible with RPCModules. Default is nullptr. */ void Init(const std::string& graph_json, tvm::runtime::Module module, - const std::vector& ctxs, const PackedFunc lookup_linked_param_func); + const std::vector& ctxs, + const PackedFunc lookup_linked_param_func = nullptr); /*! * \brief Get the input index given the name of input. 
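For reference, the FirstOrderGradient pass introduced in PATCH 39/84 above is exposed to Python as a module pass. A minimal usage sketch follows; the one-input function here is purely illustrative and not part of the patch:

import tvm
from tvm import relay

# Hypothetical example function: f(x) = x * x, elementwise.
x = relay.var("x", shape=(3,), dtype="float32")
func = relay.Function([x], x * x)
mod = tvm.IRModule.from_expr(func)
mod = relay.transform.InferType()(mod)  # the pass relies on checked types

# After the pass, "main" returns (original_output, (grad_x,)).
mod = relay.transform.FirstOrderGradient()(mod)
print(mod["main"])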
From e467748ef46836d94fd48a9673e8f05b305afe4c Mon Sep 17 00:00:00 2001 From: eric Date: Fri, 19 Mar 2021 12:13:04 +0900 Subject: [PATCH 41/84] [CPP_RPC] allow user supplied work dir (#7670) * [CPP_RPC] allow user supplied work dir * clang format --- apps/cpp_rpc/main.cc | 10 +++++++++- apps/cpp_rpc/rpc_env.cc | 35 ++++++++++++++++++++--------------- apps/cpp_rpc/rpc_env.h | 2 +- apps/cpp_rpc/rpc_server.cc | 21 ++++++++++++--------- apps/cpp_rpc/rpc_server.h | 3 ++- 5 files changed, 44 insertions(+), 27 deletions(-) diff --git a/apps/cpp_rpc/main.cc b/apps/cpp_rpc/main.cc index e381dd2b261b9..0663c378819e2 100644 --- a/apps/cpp_rpc/main.cc +++ b/apps/cpp_rpc/main.cc @@ -55,6 +55,7 @@ static const string kUsage = "--tracker - The RPC tracker address in host:port format e.g. 10.1.1.2:9190 Default=\"\"\n" "--key - The key used to identify the device type in tracker. Default=\"\"\n" "--custom-addr - Custom IP Address to Report to RPC Tracker. Default=\"\"\n" + "--work-dir - Custom work directory. Default=\"\"\n" "--silent - Whether to run in silent mode. Default=False\n" "\n" " Example\n" @@ -70,6 +71,7 @@ static const string kUsage = * \arg tracker The address of RPC tracker in host:port format e.g. 10.77.1.234:9190 Default="" * \arg key The key used to identify the device type in tracker. Default="" * \arg custom_addr Custom IP Address to Report to RPC Tracker. Default="" + * \arg work_dir Custom work directory. Default="" * \arg silent Whether run in silent mode. Default=False */ struct RpcServerArgs { @@ -79,6 +81,7 @@ struct RpcServerArgs { string tracker; string key; string custom_addr; + string work_dir; bool silent = false; #if defined(WIN32) std::string mmap_path; @@ -96,6 +99,7 @@ void PrintArgs(const RpcServerArgs& args) { LOG(INFO) << "tracker = " << args.tracker; LOG(INFO) << "key = " << args.key; LOG(INFO) << "custom_addr = " << args.custom_addr; + LOG(INFO) << "work_dir = " << args.work_dir; LOG(INFO) << "silent = " << ((args.silent) ? ("True") : ("False")); } @@ -238,6 +242,10 @@ void ParseCmdArgs(int argc, char* argv[], struct RpcServerArgs& args) { dmlc::InitLogging("--minloglevel=0"); } #endif + const string work_dir = GetCmdOption(argc, argv, "--work-dir="); + if (!work_dir.empty()) { + args.work_dir = work_dir; + } } /*! 
@@ -274,7 +282,7 @@ int RpcServer(int argc, char* argv[]) { #endif RPCServerCreate(args.host, args.port, args.port_end, args.tracker, args.key, args.custom_addr, - args.silent); + args.work_dir, args.silent); return 0; } diff --git a/apps/cpp_rpc/rpc_env.cc b/apps/cpp_rpc/rpc_env.cc index ea19cfa3979d3..5f703e1dc2b01 100644 --- a/apps/cpp_rpc/rpc_env.cc +++ b/apps/cpp_rpc/rpc_env.cc @@ -39,7 +39,6 @@ int mkdir(const char* path, int /* ignored */) { return _mkdir(path); } #include #include #include - #include "../../src/support/utils.h" #include "rpc_env.h" @@ -85,25 +84,31 @@ void CleanDir(const std::string& dirname); */ std::string BuildSharedLibrary(std::string file_in); -RPCEnv::RPCEnv() { +RPCEnv::RPCEnv(const std::string& wd) { + if (wd != "") { + base_ = wd + "/.cache"; + mkdir(wd.c_str(), 0777); + mkdir(base_.c_str(), 0777); + } else { #if defined(ANDROID) || defined(__ANDROID__) - char cwd[PATH_MAX]; - auto cmdline = fopen("/proc/self/cmdline", "r"); - fread(cwd, 1, sizeof(cwd), cmdline); - fclose(cmdline); - base_ = "/data/data/" + std::string(cwd) + "/cache/rpc"; + char cwd[PATH_MAX]; + auto cmdline = fopen("/proc/self/cmdline", "r"); + fread(cwd, 1, sizeof(cwd), cmdline); + fclose(cmdline); + base_ = "/data/data/" + std::string(cwd) + "/cache/rpc"; #elif !defined(_WIN32) - char cwd[PATH_MAX]; - if (getcwd(cwd, sizeof(cwd))) { - base_ = std::string(cwd) + "/rpc"; - } else { - base_ = "./rpc"; - } + char cwd[PATH_MAX]; + if (getcwd(cwd, sizeof(cwd))) { + base_ = std::string(cwd) + "/rpc"; + } else { + base_ = "./rpc"; + } #else - base_ = "./rpc"; + base_ = "./rpc"; #endif + mkdir(base_.c_str(), 0777); + } - mkdir(base_.c_str(), 0777); TVM_REGISTER_GLOBAL("tvm.rpc.server.workpath").set_body([this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetPath(args[0]); }); diff --git a/apps/cpp_rpc/rpc_env.h b/apps/cpp_rpc/rpc_env.h index 50ef3835e015d..dbb0a62d2c5d1 100644 --- a/apps/cpp_rpc/rpc_env.h +++ b/apps/cpp_rpc/rpc_env.h @@ -39,7 +39,7 @@ struct RPCEnv { /*! * \brief Constructor Init The RPC Environment initialize function */ - RPCEnv(); + RPCEnv(const std::string& work_dir = ""); /*! * \brief GetPath To get the workpath from packed function * \param name The file name diff --git a/apps/cpp_rpc/rpc_server.cc b/apps/cpp_rpc/rpc_server.cc index a4028ff61eca8..52b5da965b4cb 100644 --- a/apps/cpp_rpc/rpc_server.cc +++ b/apps/cpp_rpc/rpc_server.cc @@ -98,14 +98,15 @@ class RPCServer { * \brief Constructor. */ RPCServer(std::string host, int port, int port_end, std::string tracker_addr, std::string key, - std::string custom_addr) + std::string custom_addr, std::string work_dir) : host_(std::move(host)), port_(port), my_port_(0), port_end_(port_end), tracker_addr_(std::move(tracker_addr)), key_(std::move(key)), - custom_addr_(std::move(custom_addr)) {} + custom_addr_(std::move(custom_addr)), + work_dir_(std::move(work_dir)) {} /*! * \brief Destructor.
@@ -174,7 +175,7 @@ class RPCServer { const pid_t worker_pid = fork(); if (worker_pid == 0) { // Worker process - ServerLoopProc(conn, addr); + ServerLoopProc(conn, addr, work_dir_); _exit(0); } @@ -201,7 +202,7 @@ } else { auto pid = fork(); if (pid == 0) { - ServerLoopProc(conn, addr); + ServerLoopProc(conn, addr, work_dir_); exit(0); } // Wait for the result @@ -308,9 +309,10 @@ * \param sock The socket information * \param addr The socket address information */ - static void ServerLoopProc(support::TCPSocket sock, support::SockAddr addr) { + static void ServerLoopProc(support::TCPSocket sock, support::SockAddr addr, + std::string work_dir) { // Server loop - const auto env = RPCEnv(); + const auto env = RPCEnv(work_dir); RPCServerLoop(int(sock.sockfd)); LOG(INFO) << "Finish serving " << addr.AsString(); env.CleanUp(); @@ -339,6 +341,7 @@ std::string tracker_addr_; std::string key_; std::string custom_addr_; + std::string work_dir_; support::TCPSocket listen_sock_; support::TCPSocket tracker_sock_; }; @@ -370,19 +373,19 @@ * silent mode. Default=True */ void RPCServerCreate(std::string host, int port, int port_end, std::string tracker_addr, - std::string key, std::string custom_addr, bool silent) { + std::string key, std::string custom_addr, std::string work_dir, bool silent) { if (silent) { // Only errors and fatal is logged dmlc::InitLogging("--minloglevel=2"); } // Start the rpc server RPCServer rpc(std::move(host), port, port_end, std::move(tracker_addr), std::move(key), - std::move(custom_addr)); + std::move(custom_addr), std::move(work_dir)); rpc.Start(); } TVM_REGISTER_GLOBAL("rpc.ServerCreate").set_body([](TVMArgs args, TVMRetValue* rv) { - RPCServerCreate(args[0], args[1], args[2], args[3], args[4], args[5], args[6]); + RPCServerCreate(args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7]); }); } // namespace runtime } // namespace tvm diff --git a/apps/cpp_rpc/rpc_server.h b/apps/cpp_rpc/rpc_server.h index 7a4bda5d65c41..e4565d095b2e1 100644 --- a/apps/cpp_rpc/rpc_server.h +++ b/apps/cpp_rpc/rpc_server.h @@ -48,11 +48,12 @@ void ServerLoopFromChild(SOCKET socket); * \param tracker The address of RPC tracker in host:port format e.g. 10.77.1.234:9190 Default="" * \param key The key used to identify the device type in tracker. Default="" * \param custom_addr Custom IP Address to Report to RPC Tracker. Default="" + * \param work_dir Custom work directory. Default="" * \param silent Whether run in silent mode. Default=True */ void RPCServerCreate(std::string host = "", int port = 9090, int port_end = 9099, std::string tracker_addr = "", std::string key = "", - std::string custom_addr = "", bool silent = true); + std::string custom_addr = "", std::string work_dir = "", bool silent = true); } // namespace runtime } // namespace tvm #endif // TVM_APPS_CPP_RPC_SERVER_H_ From 2ee860e902e77f45996a5585fc09c5e5c29788e1 Mon Sep 17 00:00:00 2001 From: Dmitriy Smirnov Date: Fri, 19 Mar 2021 06:47:45 +0000 Subject: [PATCH 42/84] [TFLite] Cast operator adapted for MLIR-based converter (#7639) * [TFLite] Cast operator adapted for MLIR-based converter The Cast operator can now be handled when the model comes from the MLIR-based converter.
Unit test updated Change-Id: I30e5c1c9d69355116b560af8f6d0582b2d593538 * Comment added Change-Id: I3e2d29ef201283de337168d0b82679b63ca2fcf4 --- python/tvm/relay/frontend/tflite.py | 17 ++++++++++++----- tests/python/frontend/tflite/test_forward.py | 19 ++++++++++++++----- 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py index d6f704703cae8..a5c9a586e2753 100644 --- a/python/tvm/relay/frontend/tflite.py +++ b/python/tvm/relay/frontend/tflite.py @@ -2336,11 +2336,18 @@ def convert_cast(self, op): input_tensor = input_tensors[0] in_expr = self.get_expr(input_tensor.tensor_idx) - assert op.BuiltinOptionsType() == BuiltinOptions.CastOptions - op_options = op.BuiltinOptions() - cast_options = CastOptions() - cast_options.Init(op_options.Bytes, op_options.Pos) - cast_dtype = cast_options.OutDataType() + # MLIR-based converter outputs no BuiltinOptions for Cast operator. In this + # case the output type can be derived from the Cast operator output tensor. + # When TOCO converter is used there will be "normal" BuiltinOptions.CastOptions + # with output type. + if op.BuiltinOptions() is not None: + assert op.BuiltinOptionsType() == BuiltinOptions.CastOptions + op_options = op.BuiltinOptions() + cast_options = CastOptions() + cast_options.Init(op_options.Bytes, op_options.Pos) + cast_dtype = cast_options.OutDataType() + else: + cast_dtype = self.get_output_tensors(op)[0].tensor.Type() out = _op.cast(in_expr, self.get_tensor_type_str(cast_dtype)) diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py index 0d02c15f2eb82..7c12cd3365cab 100644 --- a/tests/python/frontend/tflite/test_forward.py +++ b/tests/python/frontend/tflite/test_forward.py @@ -647,19 +647,28 @@ def test_forward_transpose(): # ---- -def _test_cast(data, cast_dtype): +def _test_cast(data, cast_dtype, use_mlir=False): """ One iteration of CAST """ with tf.Graph().as_default(): in_data = array_ops.placeholder(shape=data.shape, dtype=data.dtype) out = math_ops.cast(in_data, cast_dtype) - compare_tflite_with_tvm(data, "Placeholder:0", [in_data], [out]) + compare_tflite_with_tvm( + data, "Placeholder:0", [in_data], [out], experimental_new_converter=use_mlir + ) def test_forward_cast(): """ CAST """ - _test_cast(np.arange(6.0, dtype=np.float32).reshape((1, 6)), cast_dtype=tf.int32) - _test_cast(np.arange(6.0, dtype=np.float32).reshape((1, 6)), cast_dtype=tf.uint8) - _test_cast(np.arange(6.0, dtype=np.int32).reshape((1, 6)), cast_dtype=tf.int64) + for use_mlir in [False, True]: + _test_cast( + np.arange(6.0, dtype=np.float32).reshape((1, 6)), cast_dtype=tf.int32, use_mlir=use_mlir + ) + _test_cast( + np.arange(6.0, dtype=np.float32).reshape((1, 6)), cast_dtype=tf.uint8, use_mlir=use_mlir + ) + _test_cast( + np.arange(6.0, dtype=np.int32).reshape((1, 6)), cast_dtype=tf.int64, use_mlir=use_mlir + ) ####################################################################### From 570767f78851fbc0472c230adcb2c98e47bad0e8 Mon Sep 17 00:00:00 2001 From: Trevor Morris Date: Fri, 19 Mar 2021 01:09:45 -0700 Subject: [PATCH 43/84] Free TensorRT engine and context (#7702) --- src/runtime/contrib/tensorrt/tensorrt_runtime.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/runtime/contrib/tensorrt/tensorrt_runtime.cc b/src/runtime/contrib/tensorrt/tensorrt_runtime.cc index 3f87f8d00ee66..e28c5a8c61d04 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_runtime.cc +++ 
b/src/runtime/contrib/tensorrt/tensorrt_runtime.cc
@@ -109,6 +109,14 @@ class TensorRTRuntime : public JSONRuntimeBase {
   }
 
 #ifdef TVM_GRAPH_RUNTIME_TENSORRT
+  /*! \brief Destroy engines and contexts. */
+  ~TensorRTRuntime() {
+    for (auto& it : trt_engine_cache_) {
+      it.second.context->destroy();
+      it.second.engine->destroy();
+    }
+  }
+
   /*! \brief Run inference using built engine. */
   void Run() override {
     BuildEngine();

From 35b43e1837cd5fcd688798cc3bf60ccc7f08bfbc Mon Sep 17 00:00:00 2001
From: Josh Fromm
Date: Fri, 19 Mar 2021 02:32:58 -0700
Subject: [PATCH 44/84] Change behavior of onnx importer to throw when user
 provides an input not in the graph. (#7699)

---
 python/tvm/relay/frontend/onnx.py          |  7 +++-
 tests/python/frontend/onnx/test_forward.py | 39 ++++++++++++++++++----
 2 files changed, 39 insertions(+), 7 deletions(-)

diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py
index 391eaaab5f64b..fab4ae889dd7e 100644
--- a/python/tvm/relay/frontend/onnx.py
+++ b/python/tvm/relay/frontend/onnx.py
@@ -2914,7 +2914,7 @@ def from_onnx(self, graph, opset, get_output_expr=False):
             else:
                 self._num_input += 1
                 if i_name in self._shape:
-                    i_shape = self._shape[i_name]
+                    i_shape = self._shape.pop(i_name)
                 else:
                     if "?" in str(i_shape):
                         warning_msg = (
@@ -2929,6 +2929,11 @@ def from_onnx(self, graph, opset, get_output_expr=False):
                     dtype = d_type
                 self._nodes[i_name] = new_var(i_name, shape=i_shape, dtype=dtype)
             self._inputs[i_name] = self._nodes[i_name]
+        assert (
+            len(self._shape) == 0
+        ), "User specified the shape for inputs that weren't found in the graph: " + str(
+            self._shape
+        )
         # get list of unsupported ops
         convert_map = _get_convert_map(opset)
         unsupported_ops = set()
diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py
index 177bed66f466b..5a6216ac705de 100644
--- a/tests/python/frontend/onnx/test_forward.py
+++ b/tests/python/frontend/onnx/test_forward.py
@@ -19,6 +19,7 @@ from onnx import helper, TensorProto, mapping, numpy_helper
 import torch
 import torchvision
+import pytest
 import tvm.topi.testing
 import tvm
 from tvm import relay
@@ -57,7 +58,7 @@ def get_tvm_output_with_vm(
         mod = relay.transform.DynamicToStatic()(mod)
 
     ex = relay.create_executor("vm", mod=mod, ctx=ctx, target=target)
-    result = ex.evaluate()(*input_data)
+    result = ex.evaluate()(*input_data, **params)
     if isinstance(result, tvm.runtime.NDArray):
         return result.asnumpy()
     return [r.asnumpy() for r in result]
@@ -500,7 +501,7 @@ def test_squeeze():
     model = helper.make_model(graph, producer_name="squeeze_test")
     x = np.random.uniform(size=in_shape).astype("float32")
-    verify_with_ort_with_inputs(model, [x], [out_shape])
+    verify_with_ort_with_inputs(model, [x], [out_shape], opset=11)
 
 
 @tvm.testing.uses_gpu
@@ -538,7 +539,7 @@ def test_unsqueeze():
     )
 
     model = helper.make_model(graph, producer_name="squeeze_test")
-    verify_with_ort(model, [in_shape])
+    verify_with_ort(model, [in_shape], opset=11)
 
 
 def verify_gather(in_shape, indices, axis, dtype):
@@ -1584,7 +1585,7 @@ def verify_pad_v11(indata, pads, mode="constant", value=0.0):
     pads = np.array(pads)
     # onnx graph
     if mode in ["edge", "reflect"]:
-        inputs = [indata, pads]
+        inputs = [indata]
         outdata = np.pad(indata, pad_width=np_pads, mode=mode)
         node = helper.make_node("Pad", inputs=["input", "pads"], outputs=["output"], mode=mode)
         graph = helper.make_graph(
@@ -1600,7 +1601,7 @@ def verify_pad_v11(indata, pads, mode="constant", value=0.0):
             ],
         )
     else:
-        inputs = [indata, pads, 
np.array([value]).astype("float32")] + inputs = [indata] outdata = np.pad(indata, pad_width=np_pads, mode="constant", constant_values=value) node = helper.make_node( "Pad", inputs=["input", "pads", "constant_value"], outputs=["output"], mode="constant" @@ -1663,7 +1664,7 @@ def verify_reduce_func(func, data, axis, keepdims): model = helper.make_model(graph, producer_name="reduce_test") - verify_with_ort_with_inputs(model, [data], [outshape]) + verify_with_ort_with_inputs(model, [data], [outshape], opset=11) @tvm.testing.uses_gpu @@ -4089,6 +4090,31 @@ def verify_cumsum(indata, axis, exclusive=0, reverse=0, type="float32"): verify_cumsum(data, 1, 1, 1, type="int32") +def test_wrong_input(): + node = helper.make_node( + "Softplus", + inputs=["X"], + outputs=["Y"], + ) + + graph = helper.make_graph( + [node], + "softplus_test", + inputs=[helper.make_tensor_value_info("X", TensorProto.FLOAT, list([5]))], + outputs=[helper.make_tensor_value_info("Y", TensorProto.FLOAT, list([5]))], + ) + model = helper.make_model(graph, producer_name="softplus_test") + + # Check that the graph can import correctly with proper shape definitions. + correct_shape_dict = {"X": [5]} + relay.frontend.from_onnx(model, shape=correct_shape_dict) + + # Check that an assertion is triggered when an input not in the graph is provided. + wrong_shape_dict = {"Z": [5]} + with pytest.raises(AssertionError): + relay.frontend.from_onnx(model, shape=wrong_shape_dict) + + if __name__ == "__main__": test_flatten() test_reshape() @@ -4167,3 +4193,4 @@ def verify_cumsum(indata, axis, exclusive=0, reverse=0, type="float32"): test_maxunpool() test_softplus() test_cumsum() + test_wrong_input() From 9a29141db81a4128f49c84b5a2ad50325eb6c7bd Mon Sep 17 00:00:00 2001 From: masahi Date: Sat, 20 Mar 2021 05:44:05 +0900 Subject: [PATCH 45/84] [Vulkan] Workaround for zero size allocation (#7691) --- src/runtime/vulkan/vulkan.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc index ff1b82f930d73..f56318aee94d7 100644 --- a/src/runtime/vulkan/vulkan.cc +++ b/src/runtime/vulkan/vulkan.cc @@ -120,6 +120,10 @@ class VulkanDeviceAPI final : public DeviceAPI { std::vector GetComputeQueueFamilies(VkPhysicalDevice phy_dev); void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, DLDataType type_hint) final { + if (nbytes == 0) { + // Vulkan seems to have issues if we return nullptr on zero size alloc + nbytes = 1; + } const auto& vctx = context(ctx.device_id); VkBufferCreateInfo info; info.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; From aa494cfbfd0943855889444f37e7f032b0b58051 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Fri, 19 Mar 2021 15:19:25 -0700 Subject: [PATCH 46/84] [AutoScheduler] Add function name in message (#7703) * [AutoScheduler] Add function name in message * fix --- python/tvm/auto_scheduler/dispatcher.py | 49 +++++++++++-------- .../tvm/auto_scheduler/relay_integration.py | 7 ++- src/relay/backend/compile_engine.cc | 2 +- 3 files changed, 35 insertions(+), 23 deletions(-) diff --git a/python/tvm/auto_scheduler/dispatcher.py b/python/tvm/auto_scheduler/dispatcher.py index 6a25960fe7b74..c843dcfccdf05 100644 --- a/python/tvm/auto_scheduler/dispatcher.py +++ b/python/tvm/auto_scheduler/dispatcher.py @@ -50,7 +50,7 @@ class DispatchContext(object): def __init__(self): self._old_ctx = DispatchContext.current - def query(self, target, workload_key, has_complex_op, dag): + def query(self, target, workload_key, has_complex_op, dag, func_name): """ Query the 
context to get the specific config for a workload. If cannot find the result inside this context, this function will query it @@ -66,15 +66,17 @@ def query(self, target, workload_key, has_complex_op, dag): Whether this workload has at least one complex op. dag: ComputeDAG The ComputeDAG of the workload. + func_name: str + The function name of this workload. Returns ------- state : StateObject The state that stores schedule configuration for the workload """ - ret = self._query_inside(target, workload_key) + ret = self._query_inside(target, workload_key, func_name) if ret is None: - ret = self._old_ctx.query(target, workload_key, has_complex_op, dag) + ret = self._old_ctx.query(target, workload_key, has_complex_op, dag, func_name) return ret def update(self, target, workload_key, state): @@ -92,7 +94,7 @@ def update(self, target, workload_key, state): """ raise NotImplementedError() - def _query_inside(self, target, workload_key): + def _query_inside(self, target, workload_key, func_name): """ Query the context to get the specific config for a workload. This function only query config inside this context. @@ -103,6 +105,8 @@ def _query_inside(self, target, workload_key): The current target workload_key : str The current workload_key. + func_name: str + The function name of this workload. Returns ------- @@ -241,7 +245,7 @@ def load(self, records, n_lines=None): logger.debug("Finish loading %d records", counter) - def _query_inside(self, target, workload_key): + def _query_inside(self, target, workload_key, func_name): if target is None: raise RuntimeError( "Need a target context to find the history best. " @@ -343,18 +347,20 @@ def __init__( records, n_lines=None, include_compatible=True ) - def query(self, target, workload_key, has_complex_op, dag): + def query(self, target, workload_key, has_complex_op, dag, func_name): if has_complex_op or self.sample_simple_workloads: - ret = self._query_inside(target, workload_key) + ret = self._query_inside(target, workload_key, func_name) else: - ret = super(ApplyHistoryBestOrSample, self)._query_inside(target, workload_key) + ret = super(ApplyHistoryBestOrSample, self)._query_inside( + target, workload_key, func_name + ) if ret is None: - ret = self._old_ctx.query(target, workload_key, has_complex_op, dag) + ret = self._old_ctx.query(target, workload_key, has_complex_op, dag, func_name) return ret - def _query_inside(self, target, workload_key): - ret = super(ApplyHistoryBestOrSample, self)._query_inside(target, workload_key) + def _query_inside(self, target, workload_key, func_name): + ret = super(ApplyHistoryBestOrSample, self)._query_inside(target, workload_key, func_name) if ret is not None: return ret @@ -386,7 +392,9 @@ def _query_inside(self, target, workload_key): # Load the sampled records and query again. self.load(log_file) - ret = super(ApplyHistoryBestOrSample, self)._query_inside(target, workload_key) + ret = super(ApplyHistoryBestOrSample, self)._query_inside( + target, workload_key, func_name + ) del measure_ctx return ret @@ -411,18 +419,19 @@ def __init__(self): # a set to prevent print duplicated message self.messages = set() - def query(self, target, workload_key, has_complex_op, dag): + def query(self, target, workload_key, has_complex_op, dag, func_name): key = (str(target), workload_key) if key in self.memory: return self.memory[key] if self.verbose == 2 or (has_complex_op and self.verbose == 1): msg = ( - "-----------------------------------\n" - "Cannot find tuned schedules for target=%s, workload_key=%s. 
" - "A fallback TOPI schedule is used, " - "which may bring great performance regression or even compilation failure. " - "Compute DAG info:\n%s" % (target, workload_key, dag) + f"-----------------------------------\n" + f"{func_name}\n" + f"Cannot find tuned schedules for target={target}, workload_key={workload_key}. " + f"A fallback TOPI schedule is used, " + f"which may bring great performance regression or even compilation failure. " + f"Compute DAG info:\n{dag}" ) if msg not in self.messages: self.messages.add(msg) @@ -434,8 +443,8 @@ def query(self, target, workload_key, has_complex_op, dag): self.memory[key] = state return state - def _query_inside(self, target, workload_key): - _ = target = workload_key + def _query_inside(self, target, workload_key, func_name): + _ = target = workload_key = func_name raise RuntimeError("This function should never be called") def update(self, target, workload_key, state): diff --git a/python/tvm/auto_scheduler/relay_integration.py b/python/tvm/auto_scheduler/relay_integration.py index 6cce30f2f5599..366d3d021d9e5 100644 --- a/python/tvm/auto_scheduler/relay_integration.py +++ b/python/tvm/auto_scheduler/relay_integration.py @@ -256,7 +256,7 @@ def traverse(t): @tvm._ffi.register_func("auto_scheduler.relay_integration.auto_schedule_topi_compute") -def auto_schedule_topi(outs): +def auto_schedule_topi(func_name, outs): """Use auto-scheduler to schedule any topi compute function. Note: This is used internally for relay integration. Do @@ -264,6 +264,9 @@ def auto_schedule_topi(outs): Parameters ---------- + func_name: str + The name of the function being scheduled. + outs: List[Tensor] The output tensors of topi compute functions @@ -289,7 +292,7 @@ def auto_schedule_topi(outs): target = tvm.target.Target.current() dispatch_ctx = DispatchContext.current - state = dispatch_ctx.query(target, key, has_complex_op, dag) + state = dispatch_ctx.query(target, key, has_complex_op, dag, func_name) schedule = None env = TracingEnvironment.current diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc index ae975a5f32401..f492b70565ace 100644 --- a/src/relay/backend/compile_engine.cc +++ b/src/relay/backend/compile_engine.cc @@ -157,7 +157,7 @@ class ScheduleGetter : public backend::MemoizedExprTranslator> runtime::Registry::Get("auto_scheduler.relay_integration.auto_schedule_topi_compute"); ICHECK(fauto_schedule != nullptr) << "auto_scheduler.relay_integration.auto_schedule_topi_compute is not registered"; - ObjectRef obj = (*fauto_schedule)(tensor_outs); + ObjectRef obj = (*fauto_schedule)(String(cache_node->func_name), tensor_outs); if (obj.defined()) { schedule = Downcast(obj); } From 7605f65569df56bc03f6648ff4146bb67b4c4b8e Mon Sep 17 00:00:00 2001 From: Trevor Morris Date: Fri, 19 Mar 2021 19:25:58 -0700 Subject: [PATCH 47/84] [TOPI][CUDA] Fix 0 valid boxes case for NMS when return_indices=False (#7700) * Handle 0 box case for return_indices=False case * Add unit test for mx NMS --- python/tvm/topi/cuda/nms.py | 4 +- tests/python/frontend/mxnet/test_forward.py | 60 +++++++++++++++++++++ 2 files changed, 63 insertions(+), 1 deletion(-) diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py index ccc2ec9d0c219..cc29d4fb54ad6 100644 --- a/python/tvm/topi/cuda/nms.py +++ b/python/tvm/topi/cuda/nms.py @@ -474,7 +474,9 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): box_indices[i * num_anchors + j] = -1 with ib.else_scope(): - with ib.if_scope(j < valid_count[i]): + # Need to copy all boxes if not using 
return_indices
+        bounds = valid_count[i] if return_indices else num_anchors
+        with ib.if_scope(j < bounds):
             src_offset = base_src_idx + j * box_data_length
 
             with ib.for_range(0, 4, kind="unroll") as k:
diff --git a/tests/python/frontend/mxnet/test_forward.py b/tests/python/frontend/mxnet/test_forward.py
index 4eb7f6139e8fc..3a299618fbb84 100644
--- a/tests/python/frontend/mxnet/test_forward.py
+++ b/tests/python/frontend/mxnet/test_forward.py
@@ -1887,6 +1887,66 @@ def verify(batch, seq_length, num_heads, head_dim):
     verify(3, 10, 6, 8)
 
 
+@tvm.testing.uses_gpu
+def test_forward_box_nms():
+    def verify(
+        data_shape,
+        overlap_thresh=0.5,
+        valid_thresh=0,
+        topk=1,
+        coord_start=2,
+        score_index=1,
+        id_index=0,
+        force_suppress=False,
+        in_format="corner",
+    ):
+        dtype = "float32"
+        data = np.random.uniform(low=0, high=1, size=data_shape).astype(dtype)
+        ref_res = mx.nd.contrib.box_nms(
+            mx.nd.array(data),
+            overlap_thresh=overlap_thresh,
+            valid_thresh=valid_thresh,
+            topk=topk,
+            coord_start=coord_start,
+            score_index=score_index,
+            id_index=id_index,
+            force_suppress=force_suppress,
+            background_id=-1,
+            in_format=in_format,
+            out_format=in_format,
+        )
+        mx_sym = mx.sym.contrib.box_nms(
+            mx.sym.var("data"),
+            overlap_thresh=overlap_thresh,
+            valid_thresh=valid_thresh,
+            topk=topk,
+            coord_start=coord_start,
+            score_index=score_index,
+            id_index=id_index,
+            force_suppress=force_suppress,
+            background_id=-1,
+            in_format=in_format,
+            out_format=in_format,
+        )
+        shape_dict = {"data": data_shape}
+        mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict)
+        for target, ctx in tvm.testing.enabled_targets():
+            if tvm.contrib.thrust.can_use_thrust(
+                tvm.target.Target(target + " -libs=thrust"), "tvm.contrib.thrust.sort"
+            ):
+                target += " -libs=thrust"
+            for kind in ["graph", "debug"]:
+                intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target)
+                op_res = intrp.evaluate()(data)
+                tvm.testing.assert_allclose(
+                    op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-3, atol=1e-5
+                )
+
+    verify((1, 10, 6))
+    # No valid boxes
+    verify((1, 10, 6), valid_thresh=1)
+
+
 @tvm.testing.uses_gpu
 def test_forward_box_decode():
     def verify(data_shape, anchor_shape, stds=[1, 1, 1, 1], clip=-1, in_format="corner"):

From 10cd83d0a4db708ea3e494f47a4fed308369aec6 Mon Sep 17 00:00:00 2001
From: Tianqi Chen
Date: Sat, 20 Mar 2021 04:27:48 -0400
Subject: [PATCH 48/84] [RUNTIME] Cleanup build for libbacktrace (#7706)

* [RUNTIME] Cleanup build for libbacktrace

- Introduce the TVM_USE_LIBBACKTRACE value macro to be consistent with other
  value macros (instead of relying on a disabled flag).
- Introduce AUTO mode for libbacktrace.
- Temporarily disable macOS support in light of a recent bug report.
- Refactor out the Libbacktrace.cmake to libs.
- Properly use TVM_DLL so that the code is cross-platform.
- Fall back to the weaker dmlc impl when backtrace is disabled.

* Update Logging.cmake

* Update the macro check order to be consistent with the rest.
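
(Editor's note: a hedged usage sketch, not part of the patch; the three accepted
values and their effects are taken from the Logging.cmake and logging.cc changes
in the diff below.)

    # Configure-time selection of the backtrace implementation:
    cmake .. -DUSE_LIBBACKTRACE=AUTO   # auto: ON on Linux, OFF on other systems
    cmake .. -DUSE_LIBBACKTRACE=ON     # build and link the bundled libbacktrace
    cmake .. -DUSE_LIBBACKTRACE=OFF    # Backtrace() falls back to dmlc::StackTrace()
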
--- CMakeLists.txt | 25 ++-------- cmake/config.cmake | 10 ++-- cmake/{modules => libs}/Libbacktrace.cmake | 0 cmake/modules/Logging.cmake | 46 +++++++++++++++++++ conda/recipe/build.sh | 1 + include/tvm/runtime/logging.h | 29 ++++++++---- src/runtime/logging.cc | 28 +++++------ tests/scripts/task_config_build_cpu.sh | 1 + tests/scripts/task_config_build_gpu.sh | 1 + tests/scripts/task_config_build_gpu_vulkan.sh | 1 + 10 files changed, 96 insertions(+), 46 deletions(-) rename cmake/{modules => libs}/Libbacktrace.cmake (100%) create mode 100644 cmake/modules/Logging.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 1aa3e68ffd14b..6d37bd4e6e446 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,11 +48,7 @@ tvm_option(USE_TF_TVMDSOOP "Build with TensorFlow TVMDSOOp" OFF) tvm_option(USE_FALLBACK_STL_MAP "Use TVM's POD compatible Map" OFF) tvm_option(USE_ETHOSN "Build with Arm Ethos-N" OFF) tvm_option(INDEX_DEFAULT_I64 "Defaults the index datatype to int64" ON) -set(_LIBBACKTRACE_DEFAULT OFF) -if(CMAKE_SYSTEM_NAME MATCHES "Darwin" OR CMAKE_SYSTEM_NAME MATCHES "Linux") - set(_LIBBACKTRACE_DEFAULT ON) -endif() -tvm_option(USE_LIBBACKTRACE "Build libbacktrace to supply linenumbers on stack traces" ${_LIBBACKTRACE_DEFAULT}) +tvm_option(USE_LIBBACKTRACE "Build libbacktrace to supply linenumbers on stack traces" AUTO) # 3rdparty libraries tvm_option(DLPACK_PATH "Path to DLPACK" "3rdparty/dlpack/include") @@ -395,26 +391,13 @@ add_library(tvm SHARED $ $) set_property(TARGET tvm_runtime APPEND PROPERTY LINK_OPTIONS "${TVM_VISIBILITY_FLAG}") - target_compile_definitions(tvm_objs PUBLIC DMLC_USE_LOGGING_LIBRARY=) target_compile_definitions(tvm_runtime_objs PUBLIC DMLC_USE_LOGGING_LIBRARY=) target_compile_definitions(tvm PUBLIC DMLC_USE_LOGGING_LIBRARY=) target_compile_definitions(tvm_runtime PUBLIC DMLC_USE_LOGGING_LIBRARY=) -if(USE_LIBBACKTRACE) - message(STATUS "Building with libbacktrace...") - include(cmake/modules/Libbacktrace.cmake) - target_link_libraries(tvm PRIVATE libbacktrace) - target_link_libraries(tvm_runtime PRIVATE libbacktrace) - add_dependencies(tvm_runtime_objs libbacktrace) - # pre 3.12 versions of cmake cannot propagate include directories from imported targets so we set them manually - target_include_directories(tvm PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/libbacktrace/include") - target_include_directories(tvm_objs PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/libbacktrace/include") - target_include_directories(tvm_runtime PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/libbacktrace/include") - target_include_directories(tvm_runtime_objs PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/libbacktrace/include") -else() - target_compile_definitions(tvm_objs PRIVATE TVM_BACKTRACE_DISABLED) - target_compile_definitions(tvm_runtime_objs PRIVATE TVM_BACKTRACE_DISABLED) -endif() + +# logging option for libbacktrace +include(cmake/modules/Logging.cmake) if(USE_MICRO) # NOTE: cmake doesn't track dependencies at the file level across subdirectories. For the diff --git a/cmake/config.cmake b/cmake/config.cmake index 8c090dce741eb..98d1d97801037 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -277,6 +277,10 @@ set(USE_TARGET_ONNX OFF) set(USE_BNNS OFF) # Whether to use libbacktrace -# Libbacktrace provides line and column information on stack traces from errors. It is only -# supported on linux and macOS. -# set(USE_LIBBACKTRACE OFF) +# Libbacktrace provides line and column information on stack traces from errors. +# It is only supported on linux and macOS. 
+# Possible values: +# - AUTO: auto set according to system information and feasibility +# - ON: enable libbacktrace +# - OFF: disable libbacktrace +set(USE_LIBBACKTRACE AUTO) diff --git a/cmake/modules/Libbacktrace.cmake b/cmake/libs/Libbacktrace.cmake similarity index 100% rename from cmake/modules/Libbacktrace.cmake rename to cmake/libs/Libbacktrace.cmake diff --git a/cmake/modules/Logging.cmake b/cmake/modules/Logging.cmake new file mode 100644 index 0000000000000..91c0fd07b676b --- /dev/null +++ b/cmake/modules/Logging.cmake @@ -0,0 +1,46 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# This script configures the logging module and dependency on libbacktrace + +if("${USE_LIBBACKTRACE}" STREQUAL "AUTO") + if(CMAKE_SYSTEM_NAME MATCHES "Linux") + set(USE_LIBBACKTRACE ON) + else() + set(USE_LIBBACKTRACE OFF) + endif() + message(STATUS "Autoset: USE_LIBBACKTRACE=" ${USE_LIBBACKTRACE} " in " ${CMAKE_SYSTEM_NAME}) +endif() + + +if(USE_LIBBACKTRACE) + message(STATUS "Building with libbacktrace...") + include(cmake/libs/Libbacktrace.cmake) + target_link_libraries(tvm PRIVATE libbacktrace) + target_link_libraries(tvm_runtime PRIVATE libbacktrace) + add_dependencies(tvm_runtime_objs libbacktrace) + # pre 3.12 versions of cmake cannot propagate include directories from imported targets so we set them manually + target_include_directories(tvm PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/libbacktrace/include") + target_include_directories(tvm_objs PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/libbacktrace/include") + target_include_directories(tvm_runtime PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/libbacktrace/include") + target_include_directories(tvm_runtime_objs PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/libbacktrace/include") + target_compile_definitions(tvm_objs PRIVATE TVM_USE_LIBBACKTRACE=1) + target_compile_definitions(tvm_runtime_objs PRIVATE TVM_USE_LIBBACKTRACE=1) +else() + target_compile_definitions(tvm_objs PRIVATE TVM_USE_LIBBACKTRACE=0) + target_compile_definitions(tvm_runtime_objs PRIVATE TVM_USE_LIBBACKTRACE=0) +endif() diff --git a/conda/recipe/build.sh b/conda/recipe/build.sh index c9e76314da31e..828e3c39488a0 100755 --- a/conda/recipe/build.sh +++ b/conda/recipe/build.sh @@ -52,6 +52,7 @@ cmake -DCMAKE_INSTALL_PREFIX="${PREFIX}" \ -DUSE_GRAPH_RUNTIME_DEBUG=ON \ -DUSE_LLVM=ON \ -DINSTALL_DEV=ON \ + -DUSE_LIBBACKTRACE=AUTO \ ${GPU_OPT} ${TOOLCHAIN_OPT} \ ${SRC_DIR} diff --git a/include/tvm/runtime/logging.h b/include/tvm/runtime/logging.h index 952a5ffec6376..c5986a9baa151 100644 --- a/include/tvm/runtime/logging.h +++ b/include/tvm/runtime/logging.h @@ -30,6 +30,7 @@ #define TVM_RUNTIME_LOGGING_H_ #include +#include #include #include @@ -37,7 +38,22 @@ #include #include -#include "tvm/runtime/c_runtime_api.h" +/*! + * \brief Macro helper for exception throwing. 
+ */ +#ifdef _MSC_VER +#define TVM_THROW_EXCEPTION noexcept(false) __declspec(noreturn) +#else +#define TVM_THROW_EXCEPTION noexcept(false) +#endif + +/*! + * \brief Whether or not use libbacktrace library + * for getting backtrace information + */ +#ifndef TVM_USE_LIBBACKTRACE +#define TVM_USE_LIBBACKTRACE 0 +#endif // a technique that enables overriding macro names on the number of parameters. This is used // to define other macros below @@ -124,19 +140,14 @@ #define COND_CHECK_2(quit_on_assert, x) COND_CHECK_3(quit_on_assert, x, return false) #define COND_LOG_2(quit_on_assert, x) COND_LOG_3(quit_on_assert, x, return false) -#ifdef _MSC_VER -#define TVM_THROW_EXCEPTION noexcept(false) __declspec(noreturn) -#else -#define TVM_THROW_EXCEPTION noexcept(false) -#endif - namespace tvm { namespace runtime { -/* \brief Generate a backtrace when called. +/*! + * \brief Generate a backtrace when called. * \return A multiline string of the backtrace. There will be either one or two lines per frame. */ -std::string Backtrace(); +TVM_DLL std::string Backtrace(); /*! \brief Base error type for TVM. Wraps a string message. */ class Error : public ::dmlc::Error { // for backwards compatibility diff --git a/src/runtime/logging.cc b/src/runtime/logging.cc index 8a44ec04532c1..0d8577a4ccdf8 100644 --- a/src/runtime/logging.cc +++ b/src/runtime/logging.cc @@ -17,19 +17,7 @@ * under the License. */ -#ifdef TVM_BACKTRACE_DISABLED -#include - -// TODO(bkimball,tkonolige) This inline function is to work around a linking error I am having when -// using MSVC If the function definition is in logging.cc then the linker can't find it no matter -// what kind of attributes (dllexport) I decorate it with. This is temporary and will be addressed -// when we get backtrace working on Windows. -namespace tvm { -namespace runtime { -__declspec(dllexport) std::string Backtrace() { return ""; } -} // namespace runtime -} // namespace tvm -#else +#if TVM_USE_LIBBACKTRACE #include #include @@ -148,4 +136,18 @@ std::string Backtrace() { } } // namespace runtime } // namespace tvm + +#else + +#include + +#include + +namespace tvm { +namespace runtime { +// Fallback to the dmlc implementation when backtrace is not available. 
+std::string Backtrace() { return dmlc::StackTrace(); } +} // namespace runtime +} // namespace tvm + #endif diff --git a/tests/scripts/task_config_build_cpu.sh b/tests/scripts/task_config_build_cpu.sh index aa5581b0e11ab..2af91d7c6b8e3 100755 --- a/tests/scripts/task_config_build_cpu.sh +++ b/tests/scripts/task_config_build_cpu.sh @@ -45,3 +45,4 @@ echo set\(USE_ETHOSN /opt/arm/ethosn-driver\) >> config.cmake echo set\(USE_ETHOSN_HW OFF\) >> config.cmake echo set\(USE_VITIS_AI ON\) >> config.cmake echo set\(USE_VERILATOR ON\) >> config.cmake +echo set\(USE_LIBBACKTRACE ON\) >> config.cmake diff --git a/tests/scripts/task_config_build_gpu.sh b/tests/scripts/task_config_build_gpu.sh index 13dfb4136547b..7338555c4c942 100755 --- a/tests/scripts/task_config_build_gpu.sh +++ b/tests/scripts/task_config_build_gpu.sh @@ -44,3 +44,4 @@ echo set\(USE_BLAS openblas\) >> config.cmake echo set\(CMAKE_CXX_COMPILER g++\) >> config.cmake echo set\(CMAKE_CXX_FLAGS -Werror\) >> config.cmake echo set\(USE_TENSORRT_CODEGEN ON\) >> config.cmake +echo set\(USE_LIBBACKTRACE AUTO\) >> config.cmake diff --git a/tests/scripts/task_config_build_gpu_vulkan.sh b/tests/scripts/task_config_build_gpu_vulkan.sh index 5865dc9699584..f12d0f99cb378 100755 --- a/tests/scripts/task_config_build_gpu_vulkan.sh +++ b/tests/scripts/task_config_build_gpu_vulkan.sh @@ -28,5 +28,6 @@ echo set\(USE_ROCM ON\) >> config.cmake echo set\(USE_VULKAN ON\) >> config.cmake echo set\(USE_MICRO ON\) >> config.cmake echo set\(USE_PROFILER ON\) >> config.cmake +echo set\(USE_LIBBACKTRACE OFF\) >> config.cmake echo set\(CMAKE_CXX_COMPILER clang-7\) >> config.cmake echo set\(CMAKE_CXX_FLAGS -Werror\) >> config.cmake From 27f108529ba962c44ab78df49b5c93fcd5caecdc Mon Sep 17 00:00:00 2001 From: Alexander Pivovarov Date: Sat, 20 Mar 2021 01:53:54 -0700 Subject: [PATCH 49/84] [torch] Use try_infer_value for clamp min/max (#7712) --- python/tvm/relay/frontend/pytorch.py | 16 ++++++++++++++-- tests/python/frontend/pytorch/test_forward.py | 7 +++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py index fd0a07e35c155..8ae1e862ffd52 100644 --- a/python/tvm/relay/frontend/pytorch.py +++ b/python/tvm/relay/frontend/pytorch.py @@ -1688,8 +1688,20 @@ def pad(inputs, input_types): def clamp(self, inputs, input_types): data = inputs[0] - amin = inputs[1] if inputs[1] else np.finfo(np.float32).min - amax = inputs[2] if inputs[2] else np.finfo(np.float32).max + + def get_v(v, default_v): + if isinstance(v, _expr.Constant): + return float(v.data.asnumpy()) + if isinstance(v, _expr.Expr): + infer_v, success = try_infer_value(v, lambda ret: float(ret)) + if success: + return infer_v + if v is not None: + return v + return default_v + + amin = get_v(inputs[1], np.finfo(np.float32).min) + amax = get_v(inputs[2], np.finfo(np.float32).max) return _op.clip(data, amin, amax) def to(self, inputs, input_types): diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py index 83c1698799c73..d0edfd9c80364 100644 --- a/tests/python/frontend/pytorch/test_forward.py +++ b/tests/python/frontend/pytorch/test_forward.py @@ -2622,10 +2622,17 @@ class Clamp3(Module): def forward(self, *args): return torch.clamp(args[0], max=1.0) + class Clamp_MinExpr_MaxConstant(Module): + def forward(self, *args): + h, w = args[0].shape[2:] + amin = h / 100.0 + return torch.clamp(args[0], min=amin, max=w) + input_data = torch.rand(input_shape).float() 
verify_model(Clamp1().float().eval(), input_data=input_data) verify_model(Clamp2().float().eval(), input_data=input_data) verify_model(Clamp3().float().eval(), input_data=input_data) + verify_model(Clamp_MinExpr_MaxConstant().float().eval(), input_data=input_data) @tvm.testing.uses_gpu From fffed0ff91c46f5c45070b52794f4f2bf4d1b8a5 Mon Sep 17 00:00:00 2001 From: Siyuan Feng Date: Sun, 21 Mar 2021 04:22:53 +0800 Subject: [PATCH 50/84] [TensorIR] TVMScript Parser/Printer (#7630) Co-authored-by: Bohan Hou <32121147+spectrometerHBH@users.noreply.github.com> Co-authored-by: Junru Shao Co-authored-by: Tianqi Chen Co-authored-by: Ruihang Lai Co-authored-by: Hongyi Jin <3231950289@qq.com> Co-authored-by: Wuwei Lin Co-authored-by: Tristan Konolige Co-authored-by: Cody Yu --- include/tvm/tir/analysis.h | 15 + python/tvm/script/context_maintainer.py | 210 +++++++- python/tvm/script/intrin.py | 20 +- python/tvm/script/node.py | 150 ++++++ python/tvm/script/parser.py | 179 ++++--- python/tvm/script/registry.py | 20 +- python/tvm/script/scope_handler.py | 473 +++++++++++++++--- python/tvm/script/special_stmt.py | 380 +++++++++++++- python/tvm/script/utils.py | 95 +++- python/tvm/tir/analysis/analysis.py | 23 + src/printer/tir_text_printer.cc | 3 +- src/printer/tvmscript_printer.cc | 232 ++++++++- .../analysis/block_access_region_detector.cc | 246 +++++++++ src/tir/ir/script/script_complete.cc | 122 +++++ ...st_tir_analysis_get_block_access_region.py | 57 +++ .../unittest/test_tvmscript_error_report.py | 205 ++++++++ .../unittest/test_tvmscript_roundtrip.py | 170 +++++++ tests/scripts/task_ci_python_setup.sh | 2 +- tests/scripts/task_ci_setup.sh | 2 +- 19 files changed, 2395 insertions(+), 209 deletions(-) create mode 100644 python/tvm/script/node.py create mode 100644 src/tir/analysis/block_access_region_detector.cc create mode 100644 src/tir/ir/script/script_complete.cc create mode 100644 tests/python/unittest/test_tir_analysis_get_block_access_region.py diff --git a/include/tvm/tir/analysis.h b/include/tvm/tir/analysis.h index 1ad78596586a3..1692a8cdacf31 100644 --- a/include/tvm/tir/analysis.h +++ b/include/tvm/tir/analysis.h @@ -157,6 +157,21 @@ TVM_DLL bool VerifyMemory(const PrimFunc& func); */ TVM_DLL bool VerifyGPUCode(const PrimFunc& func, Map constraints); +/*! + * \brief Auto detect the block read/write region according to body stmt + * It will detect the read/write region as an array in order of appearance in AST + * \param block The block to be detected + * \param buffer_var_map The outside buffers which may be accessed the block. + * It is a map from buffer var to the buffer. + * \return Array of access regions. + * There are three arrays of BufferRegion: + * - first: read regions + * - second: write regions + * - third: opaque regions + */ +Array> GetBlockAccessRegion(const Block& block, + const Map& buffer_var_map); + // Pass variants of verification analysis // directly throws RuntimeError when verification fails. namespace transform { diff --git a/python/tvm/script/context_maintainer.py b/python/tvm/script/context_maintainer.py index 955266c4a3e0f..ae3e9d885f1a2 100644 --- a/python/tvm/script/context_maintainer.py +++ b/python/tvm/script/context_maintainer.py @@ -16,59 +16,217 @@ # under the License. 
"""TVM Script Context Maintainer for TIR""" -from tvm.te import schedule +from typing import List, Mapping, Union, Optional, Dict, Callable +import synr + + +import tvm +from tvm.ir import Span +from tvm.tir import Var, Buffer, PrimExpr, Stmt, MatchBufferRegion +from tvm.runtime import Object +from .node import BufferSlice + + +class BlockInfo: + """Information for block and block_realize signature + + Examples + ---------- + .. code-block:: python + + @tvm.script.tir + def example_func(a: ty.handle, b: ty.handle, c: ty.handle) -> None: + A = tir.match_buffer(a, (16, 16), "float32") + B = tir.match_buffer(b, (16, 16), "float32") + C = tir.match_buffer(a, (16, 16), "float32") + + for i, j, k in tir.grid(16, 16, 16): + with tir.block([16, 16, tir.reduce_axis(16)], "matmul") as [vi, vj, vk]: + tir.bind(vi, i) + tir.bind(vj, j) + tir.bind(vk, k) # iter_bindings = {vj: i, vj: j, vk: k} + + tir.where(True) # predicate of the block_realize + + tir.reads(A[0:16, 0:16], B[0: 16, 0: 16]) # reads region of the block + tir.writes(C[0: 16, 0: 16]) # writes region of the block + tir.block_attr({"attr_key": "attr_value"}) # block annotations + + # alloc_buffers inside the block + CC = tir.alloc_buffer((1, 1), dtype="float32") + + # match_buffers of the block, + # which bind a sub-region of source buffer into a new buffer + D = tir.match_buffer_region(C[vi, vj]) + + # init part of the block, executed when all reduce axes are the beginning value + with tir.init(): + C[vi, vj] = tir.float32(0) + + # block body + CC[0, 0] = A[vi, vk] * B[vj, vk] + D[0, 0] += CC[0, 0] # The same as C[vi, vj] += CC[0, 0] + """ + + alloc_buffers: List[Buffer] = [] + """List[Buffer]: list of tir.alloc_buffer statements in the block signature""" + match_buffers: List[MatchBufferRegion] = [] + """List[MatchBufferRegion]: list of tir.match_buffer_region statements in the block signature""" + iter_bindings: Mapping[Var, PrimExpr] = {} + """Mapping[Var, PrimExpr]: map of block iter var to its values""" + reads: Optional[List[BufferSlice]] = None + """Optional[List[BufferSlice]]: + list of tir.reads statements in the block signature, None for not-visited""" + writes: Optional[List[BufferSlice]] = None + """Optional[List[BufferSlice]]: + list of tir.writes statements in the block signature, None for not-visited""" + annotations: Optional[Mapping[str, Object]] = None + """Optional[Mapping[str, Object]]: + list of tir.block_attr statements in the block signature, None for not-visited""" + predicate: Optional[PrimExpr] = None + """Optional[PrimExpr]: block realize predicate, None for not-visited""" + init: Optional[Stmt] = None + """Optional[Stmt]: init part of the block, None for not-visited""" + + def __init__(self): + self.alloc_buffers = [] + self.match_buffers = [] + self.iter_bindings = {} + self.reads = None + self.writes = None + self.annotations = None + self.predicate = None + self.init = None class ContextMaintainer: - """Maintain all the necessary context info""" + """Maintain all the necessary context info + Parameters + ---------- + _report_error : Callable[[str, Union[Span, synr.ast.Span]], None] + The report error function handle + """ + + # scope context + node_stack: List[List[synr.ast.Node]] = [] + """List[List[synr.ast.Node]]: The ast nodes insides the current scope""" + block_info_stack: List[BlockInfo] = [] + """List[BlockInfo]: The block info for the current block scope""" + loop_stack: List[List[Var]] = [] + """List[List[Var]]: List of loop vars inside the current block scope""" + symbols: List[Dict[str, Union[Var, 
Buffer]]] = [] + """List[Dict[str, Union[Var, Buffer]]]: Symbol map from name to object for the current scope""" - def __init__(self, parser): + # function context + func_params: List[Var] = [] + """List[Var]: The function parameters""" + func_buffer_map: Mapping[Var, Buffer] = {} + """Mapping[Var, Buffer]: The function buffer map""" + func_dict_attr: Mapping[str, Object] = {} + """Mapping[str, Object]: The function attrs""" + func_var_env_dict: Mapping[Var, str] = {} + """Mapping[Var, str]: The map from var to env thread""" + + # parser and analyzer + analyzer: tvm.arith.Analyzer = tvm.arith.Analyzer() + """tvm.arith.Analyzer: The analyzer for simplifying""" + _report_error: Callable[[str, Union[Span, synr.ast.Span]], None] + """Callable[[str, Union[Span, synr.ast.Span]], None]: The report error function handle""" + + def __init__(self, _report_error: Callable[[str, Union[Span, synr.ast.Span]], None]): # scope context - self.node_stack = [] # AST nodes of scopes - self.symbols = [] # symbols of scopes + self.node_stack = [] + self.block_info_stack = [] + self.loop_stack = [] + self.symbols = [] # function context - self.func_params = [] # parameter list of function - self.func_buffer_map = {} # buffer_map of function - self.func_dict_attr = {} # func_attr of function - self.func_var_env_dict = {} # map from var to env_name - # parser - self.parser = parser - - def pop_scope(self): - """Pop the inner most scope""" - self.symbols.pop() - self.node_stack.pop() + self.func_params = [] + self.func_buffer_map = {} + self.func_dict_attr = {} + self.func_var_env_dict = {} + # parser and analyzer + self._report_error = _report_error + self.analyzer = tvm.arith.Analyzer() + + def enter_scope(self, nodes: Optional[List[synr.ast.Node]] = None): + """Creates a new scope - def new_scope(self, nodes=None): - """Creating a new scope""" + Note + ---- + This function is used for normal scopes that do not involve + a `with block` scope. Use `enter_block_scope` + for block scope cases. + + Parameters + ---------- + nodes : Optional[List[synr.ast.Node]] + The synr AST nodes in new scope + """ if nodes is None: nodes = [] self.node_stack.append(list(reversed(nodes))) self.symbols.append(dict()) - def update_symbol(self, name, symbol): + def enter_block_scope(self, nodes: Optional[List[synr.ast.Node]] = None): + """Creates a new block scope, the function will call `enter_scope` implicitly + Besides the behaviors of `enter_scope`, it will update loop_stack and block_info_stack + to maintain block info. + + Note + ---- + This function should be used to handle a block scope, + aka the blocks that involve a `with block` scope. 
+ + Parameters + ---------- + nodes : Optional[List[synr.ast.Node]] + The synr AST nodes in new scope + """ + self.enter_scope(nodes) + # Create a new loop stack for the new block + self.loop_stack.append([]) + # Create a new BlockInfo for the new block + self.block_info_stack.append(BlockInfo()) + + def exit_scope(self): + """Pop the inner most scope""" + self.symbols.pop() + self.node_stack.pop() + + def exit_block_scope(self): + """Pop the inner most block scope, the function will call `exit_scope` implicitly""" + self.exit_scope() + # Pop loop stack + self.loop_stack.pop() + # Pop block_info + self.block_info_stack.pop() + + def update_symbol(self, name: str, symbol: Union[Buffer, Var], node: synr.ast.Node): """Append a symbol into current scope""" - if isinstance(symbol, schedule.Buffer): + if isinstance(symbol, Buffer): if name in self.symbols[0]: - self.parser.report_error("Duplicate Buffer name") + self.report_error("Duplicate Buffer name: " + symbol.name, node.span) self.symbols[0][name] = symbol else: self.symbols[-1][name] = symbol - def remove_symbol(self, name): + def remove_symbol(self, name: str): """Remove a symbol""" for symbols in reversed(self.symbols): if name in symbols: symbols.pop(name) return - raise RuntimeError("Internal error of tvm script parser: no symbol named" + name) + raise RuntimeError("Internal error of tvm script parser: no symbol named " + name) - def lookup_symbol(self, name): + def lookup_symbol(self, name: str) -> Optional[Union[Buffer, Var]]: """Look up symbol by name""" for symbols in reversed(self.symbols): if name in symbols: return symbols[name] return None - def report_error(self, message, span): - self.parser.report_error(message, span) + def report_error(self, message: str, span: Union[Span, synr.ast.Span]): + self._report_error(message, span) + + def current_block_scope(self) -> BlockInfo: + return self.block_info_stack[-1] diff --git a/python/tvm/script/intrin.py b/python/tvm/script/intrin.py index 053cd4a458462..48f50a2da442e 100644 --- a/python/tvm/script/intrin.py +++ b/python/tvm/script/intrin.py @@ -16,9 +16,11 @@ # under the License. """TVM Script Parser Intrinsic Classes""" # pylint: disable=redefined-builtin, relative-beyond-top-level +from typing import List, Any + import tvm.tir from .registry import register -from .utils import get_param_list, from_synr_span +from .utils import get_param_list, tvm_span_from_synr class Intrin: @@ -29,8 +31,8 @@ def __init__(self, intrin, stmt=False): def signature(self): return "tir." 
+ self.intrin.__name__, get_param_list(self.intrin) - def handle(self, arg_list, span): - return self.intrin(*arg_list, span=from_synr_span(span)) + def handle(self, arg_list: List[Any], span: tvm.ir.Span): + return self.intrin(*arg_list, span=tvm_span_from_synr(span)) @register @@ -98,6 +100,16 @@ def float64(imm, span): return tvm.tir.Cast("float64", imm, span) +@register +def min_value(dtype, span): + return tvm.tir.min_value(dtype, span) + + +@register +def max_value(dtype, span): + return tvm.tir.max_value(dtype, span) + + @register def floordiv(x, y, span): return tvm.tir.floordiv(x, y, span) @@ -145,7 +157,7 @@ def get_axis(begin, end, iter_type, span): block_var_dom = tvm.ir.Range.from_min_extent(begin, extent) iter_type_dict = {"data_par": 0, "reduce": 2, "scan": 3, "opaque": 4} - return tvm.tir.IterVar(block_var_dom, "bv", iter_type_dict[iter_type], span) + return tvm.tir.IterVar(block_var_dom, "bv", iter_type_dict[iter_type], span=span) @register diff --git a/python/tvm/script/node.py b/python/tvm/script/node.py new file mode 100644 index 0000000000000..039eeb452ddb2 --- /dev/null +++ b/python/tvm/script/node.py @@ -0,0 +1,150 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=redefined-builtin +"""TVM Script nodes.""" + +from typing import Optional, Union, List, Callable +import synr + +from tvm.runtime import ObjectGeneric +from tvm.tir import PrimExpr, Buffer, BufferLoad +from tvm.ir import Span + + +class Slice: + """A helper class to present slice information for BufferSlice + + Parameters + ---------- + start : Union[PrimExpr, int] + The start index. + + stop : Optional[Union[PrimExpr, int]] + The stop index, None means the Slice is an element-wise index + + span : Optional[Span] + The location of the slice in the source. + """ + + start: Union[PrimExpr, int] + stop: Optional[Union[PrimExpr, int]] + span: Optional[Span] + + def __init__( + self, + start: Union[PrimExpr, int], + stop: Optional[Union[PrimExpr, int]] = None, + span: Optional[Span] = None, + ): + self.start = start + self.stop = stop + self.span = span + + +class BufferSlice(ObjectGeneric): + """A generic object for representing general buffer access. Following cases are supported: + - element wise access buffer[i, j], which can be converted to BufferLoad if necessary + - slice access buffer[i: i + 1, j : j + 2] + - union of element and slice buffer[i, j: j + 2] + + This node is used in TVMScript to parse BufferLoad, BufferRegion and Realize + + Parameters + ---------- + buffer : Buffer + The buffer. + + indices : List[Union[Slice, PrimExpr, int]] + The access indexes can be slice, PrimExpr or int. 
+ + report_error: Callable[[str, Union[Span, synr.ast.Span]], None] + The error report func + + span : Optional[Span] + The location of the buffer access in the source. + """ + + buffer: Buffer + slices: List[Slice] + report_error: Callable[[str, Union[Span, synr.ast.Span]], None] + span: Optional[Span] + + def __init__( + self, + buffer: Buffer, + indices: List[Union[Slice, PrimExpr, int]], + report_error: Callable[[str, Union[Span, synr.ast.Span]], None], + span: Optional[Span] = None, + ): + def check_index(index: Union[int, PrimExpr]): + """ Check input index is non-negative integer or PrimExpr""" + if isinstance(index, int): + if index < 0: + report_error("Negative index is not allowed during buffer access", span) + elif isinstance(index, PrimExpr): + if index.dtype != "int32": + report_error( + "index expected an int32 type PrimExpr but got " + str(index.dtype), + index.span, + ) + else: + report_error( + "Unsupported index type, expected int or tvm.tir.PrimExpr, but got " + + str(type(index)), + span, + ) + + slices: List[Slice] = [] + for index in indices: + if isinstance(index, Slice): + check_index(index.start) + check_index(index.stop) + slices.append(index) + elif isinstance(index, (PrimExpr, int)): + check_index(index) + slices.append(Slice(index)) + else: + report_error( + "Unsupported index type for BufferSlice, " + + "expected int, tvm.tir.PrimExpr, tvm.tir.Slice, but got " + + str(type(index)), + span, + ) + + self.buffer = buffer + self.slices = slices + self.report_error = report_error + self.span = span + + def __str__(self): + regions: List[str] = [] + for s in self.slices: + if s.stop is None: + regions.append(str(s.start)) + else: + regions.append(str(s.start) + ": " + str(s.stop)) + + return self.buffer.name + "[" + ", ".join(regions) + "]" + + def asobject(self) -> BufferLoad: + """Convert object.""" + for s in self.slices: + if s.stop is not None: + self.report_error("BufferLoad only accepts elementwise access", self.span) + + indices = [s.start for s in self.slices] + return BufferLoad(self.buffer, indices, span=self.span) diff --git a/python/tvm/script/parser.py b/python/tvm/script/parser.py index 33b0bab0d7e7e..8f6d338238b7a 100644 --- a/python/tvm/script/parser.py +++ b/python/tvm/script/parser.py @@ -24,6 +24,7 @@ import json import operator import inspect +from typing import Union from synr import ast, Transformer, to_ast import tvm @@ -32,6 +33,7 @@ from tvm.ir import GlobalVar from . import context_maintainer, ty +from .context_maintainer import BlockInfo from .meta_unparser import MetaUnparser from .registry import Registry from .intrin import Intrin @@ -39,7 +41,8 @@ from .scope_handler import ScopeHandler, WithScopeHandler, ForScopeHandler from . 
import _ffi_api from .diagnostics import TVMDiagnosticCtx -from .utils import from_synr_span +from .utils import tvm_span_from_synr, synr_span_from_tvm, call_with_error_reporting +from .node import Slice, BufferSlice class CallArgumentReader(object): @@ -158,7 +161,7 @@ def __init__(self, base_lienno): def init_function_parsing_env(self): """Initialize function parsing environment""" - self.context = context_maintainer.ContextMaintainer(self) # scope emitter + self.context = context_maintainer.ContextMaintainer(self.report_error) # scope emitter def init_meta(self, meta_dict): if meta_dict is not None: @@ -182,7 +185,7 @@ def transform(self, node): return transform_res - def report_error(self, message, span): + def report_error(self, message: str, span: Union[ast.Span, tvm.ir.Span]): """Report an error occuring at a location. This just dispatches to synr's DiagnosticContext. @@ -191,9 +194,11 @@ def report_error(self, message, span): ---------- message : str Error message - span : synr.ast.Span + span : Union[synr.ast.Span, tvm.ir.Span】 Location of the error """ + if isinstance(span, tvm.ir.Span): + span = synr_span_from_tvm(span) self.error(message, span) def parse_body(self, parent): @@ -221,7 +226,7 @@ def parse_body(self, parent): ) else: return ( - tvm.tir.SeqStmt(body, from_synr_span(ast.Span.union(spans))) + tvm.tir.SeqStmt(body, tvm_span_from_synr(ast.Span.union(spans))) if len(body) > 1 else body[0] ) @@ -270,6 +275,13 @@ def parse_arg_list(self, func, node_call): internal_args.append(reader.get_kwarg(i + 1 + len(pos_only), arg_name, default=default)) if varargs is not None: internal_args.extend(reader.get_varargs(len(pos_only) + len(kwargs) + 1)) + elif len(args) + len(kw_args) > len(pos_only) + len(kwargs): + self.report_error( + "Arguments mismatched. " + + f"Expected {len(pos_only) + len(kwargs)} args but got " + + f"{len(args) + len(kw_args)}", + node_call.span, + ) return internal_args def parse_type(self, type_node, parent): @@ -401,25 +413,52 @@ def my_function(x: ty.handle): # 1. Argument types """ self.init_function_parsing_env() - self.context.new_scope(nodes=node.body.stmts) + self.context.enter_scope(nodes=node.body.stmts) # add parameters of function for arg in node.params: arg_var = tvm.te.var(arg.name, self.parse_type(arg.ty, arg)) - self.context.update_symbol(arg.name, arg_var) + self.context.update_symbol(arg.name, arg_var, node) self.context.func_params.append(arg_var) - # fetch the body and return a tir.PrimFunc + # New Scope : Implicit root block + # Each function contains an implicit root block in TensorIR, + # so here we need a block scope for it. Please note that `enter_block_scope` + # will not create a block directly but just stores some information. + # If the PrimFunc is not a TensorIR func (e.g. TE scheduled func or low-level func), + # the root block will not be added. 
The logic to add root block is in `_ffi_api.Complete` + self.context.enter_block_scope(nodes=node.body.stmts) + + # fetch the body of root block + body = self.parse_body(node.body) + # Emit Scope : Implicit root block + root_info: BlockInfo = self.context.current_block_scope() + self.context.exit_block_scope() + + # return a tir.PrimFunc + dict_attr = self.context.func_dict_attr func = tvm.tir.PrimFunc( self.context.func_params, - self.parse_body(node.body), + body, ret_type=self.parse_type(node.ret_type, node), buffer_map=self.context.func_buffer_map, - attrs=tvm.ir.make_node("DictAttrs", **self.context.func_dict_attr), - span=from_synr_span(node.span), + attrs=tvm.ir.make_node("DictAttrs", **dict_attr) if dict_attr else None, + span=tvm_span_from_synr(node.span), + ) + + # Fix the PrimFunc + # 1. generate root block if necessary + # 2. generate surrounding loops for blocks if necessary + + func = call_with_error_reporting( + self.report_error, + node.span, + _ffi_api.Complete, + func, + root_info.alloc_buffers, ) - self.context.pop_scope() + self.context.exit_scope() return func def transform_Assign(self, node): @@ -470,12 +509,12 @@ def transform_Assign(self, node): var = tvm.te.var( node.lhs.id.name, self.parse_type(node.ty, node.lhs), - span=from_synr_span(node.lhs.span), + span=tvm_span_from_synr(node.lhs.span), ) - self.context.update_symbol(var.name, var) + self.context.update_symbol(var.name, var, node) body = self.parse_body(node) self.context.remove_symbol(var.name) - return tvm.tir.LetStmt(var, value, body, span=from_synr_span(node.span)) + return tvm.tir.LetStmt(var, value, body, span=tvm_span_from_synr(node.span)) self.report_error("Unsupported Assign stmt", node.span) @@ -484,28 +523,28 @@ def transform_SubscriptAssign(self, node): symbol = self.transform(node.params[0]) indexes = self.transform(node.params[1]) rhs = self.transform(node.params[2]) - rhs_span = from_synr_span(node.params[2].span) + rhs_span = tvm_span_from_synr(node.params[2].span) if isinstance(symbol, tvm.tir.Buffer): # BufferStore return tvm.tir.BufferStore( symbol, tvm.runtime.convert(rhs, span=rhs_span), indexes, - span=from_synr_span(node.span), + span=tvm_span_from_synr(node.span), ) else: if len(indexes) != 1: self.report_error( f"Store is only allowed with one index, but {len(indexes)} were provided.", - Span.union([x.span for x in indexes]), + tvm.ir.Span.union([x.span for x in indexes]), ) # Store return tvm.tir.Store( symbol, tvm.runtime.convert(rhs, span=rhs_span), indexes[0], - tvm.runtime.convert(True, span=from_synr_span(node.span)), - span=from_synr_span(node.span), + tvm.runtime.convert(True, span=tvm_span_from_synr(node.span)), + span=tvm_span_from_synr(node.span), ) def transform_Assert(self, node): @@ -520,7 +559,7 @@ def transform_Assert(self, node): message = self.transform(node.msg) body = self.parse_body(node) return tvm.tir.AssertStmt( - condition, tvm.runtime.convert(message), body, span=from_synr_span(node.span) + condition, tvm.runtime.convert(message), body, span=tvm_span_from_synr(node.span) ) def transform_For(self, node): @@ -529,7 +568,8 @@ def transform_For(self, node): For(expr target, expr iter, stmt* body, stmt* orelse, string? type_comment) By now 1 pattern of For is supported: 1. 
for scope handler - for name in tir.serial()/tir.parallel()/tir.vectorized()/tir.unroll() + for name in tir.serial()/tir.parallel()/tir.vectorized()/tir.unroll()/tir.range()/ + tir.grid()/tir.thread_binding() """ if not isinstance(node.rhs, ast.Call): @@ -543,14 +583,14 @@ def transform_For(self, node): old_lineno, old_col_offset = self.current_lineno, self.current_col_offset self.current_lineno = node.span.start_line self.current_col_offset = node.span.start_column - self.context.new_scope(nodes=node.body.stmts) + self.context.enter_scope(nodes=node.body.stmts) # for scope handler process the scope arg_list = self.parse_arg_list(func, node.rhs) func.enter_scope(node, self.context, arg_list, node.rhs.func_name.span) func.body = self.parse_body(node) res = func.exit_scope(node, self.context, arg_list, node.rhs.func_name.span) # exit the scope - self.context.pop_scope() + self.context.exit_scope() self.current_lineno, self.current_col_offset = old_lineno, old_col_offset return res @@ -561,9 +601,9 @@ def transform_With(self, node): withitem = (expr context_expr, expr? optional_vars) By now 2 patterns of With is supported: 1. with scope handler with symbol def - with tir.allocate() as targets: + with tir.block(*axes)/tir.allocate() as targets: 2. with scope handler without symbol def - with tir.let()/tir.Assert()/tir.attr()//tir.realize() + with tir.let()/tir.Assert()/tir.attr()/tir.realize() """ if not isinstance(node.rhs, ast.Call): @@ -582,14 +622,14 @@ def transform_With(self, node): old_lineno, old_col_offset = self.current_lineno, self.current_col_offset self.current_lineno = node.body.span.start_line self.current_col_offset = node.body.span.start_column - self.context.new_scope(nodes=node.body.stmts) + self.context.enter_block_scope(nodes=node.body.stmts) # with scope handler process the scope arg_list = self.parse_arg_list(func, node.rhs) func.enter_scope(node, self.context, arg_list, node.rhs.func_name.span) func.body = self.parse_body(node) res = func.exit_scope(node, self.context, arg_list, node.rhs.func_name.span) # exit the scope - self.context.pop_scope() + self.context.exit_block_scope() self.current_lineno, self.current_col_offset = old_lineno, old_col_offset return res @@ -601,19 +641,21 @@ def transform_If(self, node): condition = self.transform(node.condition) # then body - self.context.new_scope(nodes=node.true.stmts) + self.context.enter_scope(nodes=node.true.stmts) then_body = self.parse_body(node) - self.context.pop_scope() + self.context.exit_scope() # else body if len(node.false.stmts) > 0: - self.context.new_scope(nodes=node.false.stmts) + self.context.enter_scope(nodes=node.false.stmts) else_body = self.parse_body(node) - self.context.pop_scope() + self.context.exit_scope() else: else_body = None - return tvm.tir.IfThenElse(condition, then_body, else_body, span=from_synr_span(node.span)) + return tvm.tir.IfThenElse( + condition, then_body, else_body, span=tvm_span_from_synr(node.span) + ) def transform_Call(self, node): """Call visitor @@ -633,18 +675,26 @@ def transform_Call(self, node): lhs = self.transform(node.params[0]) rhs = self.transform(node.params[1]) return self._binop_maker[node.func_name.name]( - lhs, rhs, span=from_synr_span(node.span) + lhs, rhs, span=tvm_span_from_synr(node.span) ) if node.func_name.name in self._unaryop_maker: rhs = self.transform(node.params[0]) - return self._unaryop_maker[node.func_name.name](rhs, span=from_synr_span(node.span)) + return self._unaryop_maker[node.func_name.name]( + rhs, span=tvm_span_from_synr(node.span) + ) 
self.report_error(f"Unsupported operator {node.func_name.name}.", node.func_name.span) else: func = self.transform(node.func_name) if isinstance(func, Intrin) and not func.stmt: # pattern 1 arg_list = self.parse_arg_list(func, node) - return func.handle(arg_list, node.func_name.span) + return call_with_error_reporting( + self.report_error, + node.func_name.span, + func.handle, + arg_list, + node.func_name.span, + ) else: args = [self.transform(arg) for arg in node.params] kw_args = { @@ -653,7 +703,7 @@ def transform_Call(self, node): if isinstance(func, tvm.tir.op.Op): # pattern 2 return tvm.tir.Call( - kw_args["dtype"], func, args, span=from_synr_span(node.span) + kw_args["dtype"], func, args, span=tvm_span_from_synr(node.span) ) elif callable(func): # pattern 3 @@ -700,7 +750,13 @@ def f(): ) if isinstance(func, Intrin) and func.stmt: - return func.handle(arg_list, node.call.func_name.span) + return call_with_error_reporting( + self.report_error, + node.call.func_name.span, + func.handle, + arg_list, + node.call.func_name.span, + ) elif isinstance(func, WithScopeHandler) and func.concise_scope and not func.def_symbol: func.enter_scope(node, self.context, arg_list, node.call.func_name.span) func.body = self.parse_body(node) @@ -716,11 +772,7 @@ def transform_Slice(self, node): end = self.transform(node.end) if not (isinstance(node.step, ast.Constant) and node.step.value == 1): self.report_error("Only step size 1 is supported for slices.", node.step.span) - extent = end - start - if isinstance(extent, tvm.tir.PrimExpr): - ana = tvm.arith.Analyzer() - extent = ana.simplify(extent) - return tvm.ir.Range.from_min_extent(start, extent, span=from_synr_span(node.span)) + return Slice(start, end) def transform_Subscript(self, node): """Array access visitor. @@ -728,7 +780,7 @@ def transform_Subscript(self, node): By now only 2 types of Subscript are supported: 1. Buffer[index, index, ...], Buffer element access(BufferLoad & BufferStore) Var[index] Buffer element access() - 2. meta[type_key][index], Meta info access + 2. Buffer[start: stop, start: stop, ...], BufferRealize(realize(buffer[...])) """ symbol = self.transform(node.params[0]) @@ -736,19 +788,27 @@ def transform_Subscript(self, node): self.report_error(f"Variable {node.value.id} is not defined.", node.params[0].span) indexes = [self.transform(x) for x in node.params[1].values] - if isinstance(indexes[0], tvm.ir.Range): - return symbol, indexes - if isinstance(symbol, tvm.tir.expr.Var): - return tvm.tir.Load("float32", symbol, indexes, True, span=from_synr_span(node.span)) - if isinstance(symbol, tvm.tir.Buffer): - return tvm.tir.BufferLoad(symbol, indexes, span=from_synr_span(node.span)) - - self.report_error( - f"Cannot subscript from a {type(symbol).__name__}. Only variables and " - "buffers are supported.", - node.params[0].span, - ) + for index in indexes: + if not isinstance(index, (tvm.tir.PrimExpr, int)): + self.report_error( + "Buffer load indexes should be int or PrimExpr, but they are " + + type(index), + node.span, + ) + return tvm.tir.Load( + "float32", symbol, indexes, True, span=tvm_span_from_synr(node.span) + ) + elif isinstance(symbol, tvm.tir.Buffer): + return BufferSlice( + symbol, indexes, self.report_error, span=tvm_span_from_synr(node.span) + ) + else: + self.report_error( + f"Cannot subscript from a {type(symbol).__name__}. Only variables and " + "buffers are supported.", + node.params[0].span, + ) def transform_Attr(self, node): """Visitor for field access of the form `x.y`. 
@@ -756,7 +816,7 @@ def transform_Attr(self, node): This visitor is used to lookup function and symbol names. We have two cases to handle here: 1. If we have a statement of the form `tir.something`, then we lookup - `tir.somthing` in the `Registry`. If the function is not in the + `tir.something` in the `Registry`. If the function is not in the registry, then we try to find a `tvm.ir.op.Op` with the same name. 2. All other names `tvm.something` are lookup up in this current python namespace. @@ -875,7 +935,7 @@ def transform_Constant(self, node): Constant values include `None`, `"strings"`, `2` (integers), `4.2` (floats), and `true` (booleans). """ - return tvm.runtime.convert(node.value, span=from_synr_span(node.span)) + return tvm.runtime.convert(node.value, span=tvm_span_from_synr(node.span)) def transform_TypeConstant(self, node): """Constant value visitor for types. @@ -902,8 +962,7 @@ def from_source(src): ---------- src : [str, function, class] Pruned source of original script - func_lineno : Optional[int] - The line number of the first line of the script to be parsed + Returns ------- functions : PrimFunc or IRModule diff --git a/python/tvm/script/registry.py b/python/tvm/script/registry.py index 389570115935c..245cc01051d52 100644 --- a/python/tvm/script/registry.py +++ b/python/tvm/script/registry.py @@ -16,7 +16,8 @@ # under the License. """TVM Script Parser Function Registry """ # pylint: disable=inconsistent-return-statements, relative-beyond-top-level, import-outside-toplevel -import inspect +import types +from typing import Union, Callable, Dict, Optional, Any class Registry(object): @@ -24,10 +25,10 @@ class Registry(object): All these maps are static """ - registrations = dict() + registrations: Dict[str, type] = dict() @staticmethod - def lookup(name): + def lookup(name: str) -> Optional[Any]: if name in Registry.registrations: # every time we create a new handler # since we may want to keep some local info inside it @@ -35,12 +36,14 @@ def lookup(name): return None -def register(inputs): +def register(inputs: Union[Callable, type]) -> type: """Register Intrin/ScopeHandler/SpecialStmt""" - if inspect.isfunction(inputs): + registration: type + if isinstance(inputs, types.FunctionType): + # is function from .intrin import Intrin - def create_new_intrin(func): + def create_new_intrin(func) -> type: class NewIntrin(Intrin): def __init__(self): super().__init__(func) @@ -48,11 +51,12 @@ def __init__(self): return NewIntrin registration = create_new_intrin(inputs) - elif inspect.isclass(inputs): + elif isinstance(inputs, type): + # is class registration = inputs else: raise ValueError() - key = registration().signature()[0] + key: str = registration().signature()[0] Registry.registrations[key] = registration return registration diff --git a/python/tvm/script/scope_handler.py b/python/tvm/script/scope_handler.py index 9449cbdc156c3..c7d841abc36d5 100644 --- a/python/tvm/script/scope_handler.py +++ b/python/tvm/script/scope_handler.py @@ -16,32 +16,59 @@ # under the License. 
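A minimal sketch of the registration flow that the new `registry.py` implements: decorating a plain function wraps it in a fresh `Intrin` subclass, and the registry key is whatever `signature()` reports. The `my_one` intrinsic below is purely illustrative, and it assumes `Intrin.signature()` follows the same `"tir." + __name__` convention as the scope handlers in this patch:

.. code-block:: python

    import tvm
    from tvm.script.registry import register, Registry

    @register
    def my_one(span=None):
        # register() sees a plain function and wraps it in an
        # anonymous Intrin subclass before storing it.
        return tvm.tir.const(1, "int32")

    handler_cls = Registry.registrations["tir.my_one"]  # the generated class
    handler = Registry.lookup("tir.my_one")  # per the comments, a fresh handler each time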
"""TVM Script Parser Scope Handler Classes""" # pylint: disable=redefined-builtin, unused-argument, invalid-name, relative-beyond-top-level +from typing import Tuple, Any, Callable, Optional, List, Union, Mapping +import synr from synr import ast import tvm.tir -from .utils import get_param_list, from_synr_span +from tvm.runtime import Object +from tvm.ir import Span, Range +from tvm.tir import Stmt, PrimExpr, IterVar, Var, Buffer, BufferRegion, ForKind + +from .context_maintainer import ContextMaintainer +from .utils import ( + get_param_list, + tvm_span_from_synr, + buffer_slice_to_region, + call_with_error_reporting, +) from .registry import register +from .node import BufferSlice class ScopeHandler: """Base class for all scope handlers""" - def __init__(self, func): - self.func = func - self.body = None - self.node = None - self.context = None + def __init__(self, func: Callable): + self.func: Callable = func + self.body: Optional[Stmt] = None + self.node: Optional[synr.ast.Node] = None + self.context: Optional[ContextMaintainer] = None - def signature(self): + def signature(self) -> Tuple[str, Tuple[list, list, Any]]: return "tir." + self.func.__name__, get_param_list(self.func) - def enter_scope(self, node, context, arg_list, span): + def enter_scope( + self, + node: synr.ast.Node, + context: ContextMaintainer, + arg_list: List[Any], + span: synr.ast.Span, + ): pass - def exit_scope(self, node, context, arg_list, span): + def exit_scope( + self, + node: synr.ast.Node, + context: ContextMaintainer, + arg_list: List[Any], + span: synr.ast.Span, + ): self.node = node self.context = context - return self.func(*arg_list, span=from_synr_span(span)) + return call_with_error_reporting( + context.report_error, span, self.func, *arg_list, span=tvm_span_from_synr(span) + ) class WithScopeHandler(ScopeHandler): @@ -55,24 +82,29 @@ def __init__(self, func, concise_scope, def_symbol): @staticmethod def get_optional_var_names(node, context): """Get list of names from ast.With's optional_vars""" - assert isinstance(node, ast.With) - - var_names = None - if isinstance(node.items[0].optional_vars, ast.Name): - var_names = [node.items[0].optional_vars.id] - elif isinstance(node.items[0].optional_vars, (ast.List, ast.Tuple)): - for var in node.items[0].optional_vars.elts: - if not isinstance(var, ast.Name): - context.report_error("Invalid optional var definition") - var_names = [var.id for var in node.items[0].optional_vars.elts] + assert isinstance( + node, ast.With + ), f"WithScopeHandler expected ast.With but got {type(node)}" + + if isinstance(node.lhs, list): + for var in node.lhs: + if not isinstance(var, ast.Var): + context.report_error( + f"Invalid optional var definition, expected Var but got {type(var)}", + node.span, + ) + var_names = [var.id.name for var in node.lhs] else: - context.report_error("Invalid optional var definition") + context.report_error( + f"Invalid optional var definition, expected list of Var but got {type(node.lhs)}", + node.span, + ) return var_names @register class Allocate(WithScopeHandler): - """ With scope handler tir.alloc_with_scope(var, extents, dtype, scope, condition) """ + """ With scope handler tir.allocate(extents, dtype, scope, condition) """ def __init__(self): def allocate(extents, dtype, scope, condition=True, span=None): @@ -86,7 +118,13 @@ def allocate(extents, dtype, scope, condition=True, span=None): super().__init__(allocate, concise_scope=True, def_symbol=True) self.buffer_var = None - def enter_scope(self, node, context, arg_list, span): + def 
enter_scope( + self, + node: synr.ast.Node, + context: ContextMaintainer, + arg_list: List[Any], + span: synr.ast.Span, + ): # define buffer vars in symbol table if isinstance(node, ast.With): names = WithScopeHandler.get_optional_var_names(node, context) @@ -98,13 +136,13 @@ def enter_scope(self, node, context, arg_list, span): else: raise Exception("Internal Bug") - def setup_buffer_var(extents, dtype, scope, condition=True, span=None): + def setup_buffer_var(extents, dtype, scope, condition=True, span: Span = None): """Setup buffer var for a given type.""" buffer_ptr_type = tvm.ir.PointerType(tvm.ir.PrimType(dtype)) self.buffer_var = tvm.tir.Var(name, buffer_ptr_type, span) - setup_buffer_var(*arg_list, span=from_synr_span(node.lhs.id.span)) - context.update_symbol(name, self.buffer_var) + setup_buffer_var(*arg_list, span=tvm_span_from_synr(node.lhs.id.span)) + context.update_symbol(name, self.buffer_var, node) @register @@ -115,10 +153,10 @@ def __init__(self): def launch_thread(env_var, extent, span): extent = tvm.runtime.convert(extent, span=span) return tvm.tir.AttrStmt( - tvm.tir.IterVar( + IterVar( None, env_var, - getattr(tvm.tir.IterVar, "ThreadIndex"), + getattr(IterVar, "ThreadIndex"), self.context.func_var_env_dict[env_var], span=span, ), @@ -136,8 +174,19 @@ class Realize(WithScopeHandler): """ With scope handler tir.realize(buffer_bounds, scope, condition) """ def __init__(self): - def realize(buffer_bounds, scope, condition=True, span=None): - buffer, bounds = buffer_bounds + def realize( + buffer_slice: BufferSlice, scope: str, condition: bool = True, span: bool = None + ): + assert self.context, "call 'exit_scope' before 'enter_scope'" + buffer: Buffer = buffer_slice.buffer + bounds: List[Range] = [] + for s in buffer_slice.slices: + min: Union[PrimExpr, int] = s.start + extent: Union[PrimExpr, int] = 1 if s.stop is None else s.stop - s.start + if isinstance(extent, PrimExpr): + extent = self.context.analyzer.simplify(extent) + bounds.append(Range.from_min_extent(min, extent, span=s.span)) + scope = tvm.runtime.convert(scope, span=span) return tvm.tir.AttrStmt( buffer, @@ -185,92 +234,380 @@ def let(var, value, span): super().__init__(let, concise_scope=False, def_symbol=False) +@register +class Block(WithScopeHandler): + """ With scope handler tir.block(extents, name) as iter_vars""" + + def __init__(self): + def block(axes=None, name_hint: str = "", span: Optional[Span] = None): + assert ( + self.node and self.context and self.body + ), "call 'exit_scope' before 'enter_scope'" + block_info = self.context.block_info_stack[-1] + if axes is None: + axes = [] + if len(axes) != len(self.block_vars): + self.context.report_error( + "Inconsistent number of block vars, " + + f"there are {len(axes)} axes but {len(self.block_vars)} block vars. 
" + + "The number of block vars should match the number of axes.", + self.node.span, + ) + block_iters: List[IterVar] = [] + for i, axis in enumerate(axes): + axis = tvm.runtime.convert(axis) + if isinstance(axis, tvm.tir.PrimExpr): + block_var_dom = Range.from_min_extent(0, axis) + block_iters.append(IterVar(block_var_dom, self.block_vars[i], 0)) + elif isinstance(axis, Range): + block_iters.append(IterVar(axis, self.block_vars[i], 0)) + elif isinstance(axis, IterVar): + block_iters.append(IterVar(axis.dom, self.block_vars[i], axis.iter_type)) + else: + self.context.report_error( + "Invalid argument of tir.block(), " + + f"expected PrimExpr, Range or IterVar, but got {type(axis)}", + self.node.span, + ) + + # create block read/write regions + + reads: List[BufferRegion] = ( + [buffer_slice_to_region(read) for read in block_info.reads] + if block_info.reads + else [] + ) + writes: List[BufferRegion] = ( + [buffer_slice_to_region(write) for write in block_info.writes] + if block_info.writes + else [] + ) + inner = tvm.tir.Block( + block_iters, + reads, + writes, + name_hint, + self.body, + block_info.init, + block_info.alloc_buffers, + block_info.match_buffers, + block_info.annotations, + span, + ) + # create block var iter binding + values: List[PrimExpr] + if not block_info.iter_bindings: + values = self.context.loop_stack[-2].copy() + if len(values) == 0: + values = [tvm.tir.const(float("nan"), dtype="float32")] * len(block_iters) + elif len(values) != len(block_iters): + self.context.report_error( + "Number of block iter var and outer loop nesting mismatch, " + + f"{len(block_iters)} block iter vars but {len(values)} loops", + self.node.span, + ) + else: + for block_var in self.block_vars: + if block_var not in block_info.iter_bindings: + self.context.report_error( + "Missing block iter var binding for " + block_var.name, + self.node.span, + ) + values = [block_info.iter_bindings[block_var] for block_var in self.block_vars] + predicate = ( + tvm.tir.const(True, "bool") + if block_info.predicate is None + else block_info.predicate + ) + body = tvm.tir.BlockRealize(values, predicate, inner, span) + return body + + super().__init__(func=block, concise_scope=False, def_symbol=True) + self.block_vars = None + + def enter_scope( + self, + node: synr.ast.Node, + context: ContextMaintainer, + arg_list: List[Any], + span: synr.ast.Span, + ): + # define block vars + assert isinstance( + node, ast.With + ), f"BlockScopeHandler expected to work on ast.With but got {type(node)}" + + var_names = WithScopeHandler.get_optional_var_names(node, context) + self.block_vars = [tvm.te.var(name) for name in var_names] + for block_var in self.block_vars: + context.update_symbol(block_var.name, block_var, node) + + +@register +class InitBlock(WithScopeHandler): + """ With scope handler tir.init()""" + + def __init__(self): + def init(span: Span = None): + assert self.context, "call 'exit_scope' before 'enter_scope'" + if self.context.block_info_stack[-2].init is not None: + self.context.report_error("Duplicate init block declaration", span) + self.context.block_info_stack[-2].init = self.body + + super().__init__(func=init, concise_scope=False, def_symbol=True) + + class ForScopeHandler(ScopeHandler): """Base class for all for scope handlers""" def __init__(self, func): super().__init__(func) - self.loop_vars = None + self.loop_vars: Optional[List[Var]] = None - def enter_scope(self, node, context, arg_list, span): - assert isinstance(node, ast.For) + def enter_scope( + self, + node: synr.ast.Node, + context: 
ContextMaintainer,
+        arg_list: List[Any],
+        span: synr.ast.Span,
+    ):
+        assert isinstance(node, ast.For), f"ForScopeHandler expected ast.For but got {type(node)}"
 
         loop_var_names = list()
         spans = list()
         if isinstance(node.lhs, ast.Var):
             loop_var_names.append(node.lhs.id.name)
-            spans.append(from_synr_span(node.lhs.id.span))
-        elif isinstance(node.lhs, ast.Tuple):
-            for elt in node.lhs.values:
+            spans.append(tvm_span_from_synr(node.lhs.id.span))
+        elif isinstance(node.lhs, list):
+            for elt in node.lhs:
                 if not isinstance(elt, ast.Var):
-                    context.report_error("Invalid loop var", elt.span)
+                    context.report_error(
+                        f"Invalid loop var. Expected a var, but got {type(elt)}", elt.span
+                    )
                 loop_var_names.append(elt.id.name)
-                spans.append(from_synr_span(elt.id.span))
+                spans.append(tvm_span_from_synr(elt.id.span))
         else:
-            context.report_error("Invalid loop var", node.lhs.span)
+            context.report_error(
+                f"Invalid loop var. Expected var or list of vars as lhs, but got {type(node.lhs)}",
+                span,
+            )
 
         self.loop_vars = [
             tvm.te.var(name, dtype="int32", span=span) for name, span in zip(loop_var_names, spans)
         ]
         for loop_var in self.loop_vars:
-            context.update_symbol(loop_var.name, loop_var)
+            context.update_symbol(loop_var.name, loop_var, node)
+            context.loop_stack[-1].append(loop_var)
+
+    def exit_scope(
+        self,
+        node: synr.ast.Node,
+        context: ContextMaintainer,
+        arg_list: List[Any],
+        span: synr.ast.Span,
+    ):
+        assert self.loop_vars, "call 'exit_scope' before 'enter_scope'"
+        for _ in self.loop_vars:
+            context.loop_stack[-1].pop()
+        return super().exit_scope(node, context, arg_list, span)
+
+    def create_loop(
+        self,
+        begin: PrimExpr,
+        end: PrimExpr,
+        kind: ForKind,
+        thread_binding: Optional[IterVar] = None,
+        annotations: Optional[Mapping[str, Object]] = None,
+        span: Optional[Span] = None,
+    ) -> tvm.tir.For:
+        """
+        Helper function for creating a For node in the TVM Script parser.
+
+        Parameters
+        ----------
+        begin : PrimExpr
+            The beginning value.
+
+        end : PrimExpr
+            The ending value.
+
+        kind : ForKind
+            The kind of the for loop.
+
+        thread_binding : Optional[IterVar]
+            The thread this loop binds to.
+
+        annotations : Optional[Mapping[str, Object]]
+            Additional annotation hints.
+
+        span : Optional[Span]
+            The location of this for in the source code.
+
+        Returns
+        -------
+        for : For
+            The constructed For.
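+
+        Example
+        -------
+        The concrete handlers below (tir.serial and friends) delegate here, e.g.:
+
+        .. code-block:: python
+
+            self.create_loop(0, 128, ForKind.SERIAL)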
+ """ + assert ( + self.loop_vars and self.context and self.node + ), "call 'exit_scope' before 'enter_scope'" + if len(self.loop_vars) != 1: + self.context.report_error( + f"Expected exactly one loop var, but got {self.loop_vars}", self.node.span + ) + extent = end if begin == 0 else self.context.analyzer.simplify(end - begin) + annos: Mapping[str, Object] = {} + if annotations is not None: + annos = { + key: tvm.tir.StringImm(val) if isinstance(val, str) else val + for key, val in annotations.items() + } + return tvm.tir.For( + self.loop_vars[0], + begin, + extent, + kind, + self.body, + thread_binding=thread_binding, + annotations=annos, + span=span, + ) @register class Serial(ForScopeHandler): - """ For scope handler tir.serial(begin, end)""" + """ For scope handler tir.serial(begin, end, annotations)""" def __init__(self): - def serial(begin, end, span): - if len(self.loop_vars) != 1: - self.context.report_error("Expect exact 1 loop var", span) - ana = tvm.arith.Analyzer() - extent = end if begin == 0 else ana.simplify(end - begin) - return tvm.tir.For(self.loop_vars[0], begin, extent, 0, self.body, span=span) + def serial( + begin: PrimExpr, + end: PrimExpr, + annotations: Optional[Mapping[str, Object]] = None, + span: Optional[Span] = None, + ): + return self.create_loop(begin, end, ForKind.SERIAL, annotations=annotations, span=span) super().__init__(serial) @register class Parallel(ForScopeHandler): - """ For scope handler tir.parallel(begin, end)""" + """ For scope handler tir.parallel(begin, end, annotations)""" def __init__(self): - def parallel(begin, end, span): - if len(self.loop_vars) != 1: - self.context.report_error("Expect exact 1 loop var") - ana = tvm.arith.Analyzer() - extent = end if begin == 0 else ana.simplify(end - begin) - return tvm.tir.For(self.loop_vars[0], begin, extent, 1, self.body, span=span) + def parallel( + begin: PrimExpr, + end: PrimExpr, + annotations: Optional[Mapping[str, Object]] = None, + span: Optional[Span] = None, + ): + return self.create_loop( + begin, end, ForKind.PARALLEL, annotations=annotations, span=span + ) super().__init__(parallel) @register class Vectorized(ForScopeHandler): - """ For scope handler tir.vectorized(begin, end)""" + """ For scope handler tir.vectorized(begin, end, annotations)""" def __init__(self): - def vectorized(begin, end, span): - if len(self.loop_vars) != 1: - self.context.report_error("Expect exact 1 loop var") - ana = tvm.arith.Analyzer() - extent = end if begin == 0 else ana.simplify(end - begin) - return tvm.tir.For(self.loop_vars[0], begin, extent, 2, self.body, span=span) + def vectorized( + begin: PrimExpr, + end: PrimExpr, + annotations: Optional[Mapping[str, Object]] = None, + span: Optional[Span] = None, + ): + return self.create_loop( + begin, end, ForKind.VECTORIZED, annotations=annotations, span=span + ) super().__init__(vectorized) @register class Unroll(ForScopeHandler): - """ For scope handler tir.unroll(begin, end)""" + """ For scope handler tir.unroll(begin, end, annotations)""" def __init__(self): - def unroll(begin, end, span): - if len(self.loop_vars) != 1: - self.context.report_error("Expect exact 1 loop var") - ana = tvm.arith.Analyzer() - extent = end if begin == 0 else ana.simplify(end - begin) - return tvm.tir.For(self.loop_vars[0], begin, extent, 3, self.body, span=span) + def unroll( + begin: PrimExpr, + end: PrimExpr, + annotations: Optional[Mapping[str, Object]] = None, + span: Optional[Span] = None, + ): + return self.create_loop( + begin, end, ForKind.UNROLLED, 
annotations=annotations, span=span + ) super().__init__(unroll) + + +@register +class ThreadBinding(ForScopeHandler): + """ For scope handler tir.thread_binding(begin, end, thread, annotations)""" + + def __init__(self): + def thread_binding( + begin: PrimExpr, + end: PrimExpr, + thread: str, + annotations: Optional[Mapping[str, Object]] = None, + span: Optional[Span] = None, + ): + thread_iter_var = IterVar(None, None, IterVar.ThreadIndex, thread, span=span) + return self.create_loop( + begin, + end, + ForKind.THREAD_BINDING, + thread_binding=thread_iter_var, + annotations=annotations, + span=span, + ) + + super().__init__(thread_binding) + + +@register +class RangeHandler(ForScopeHandler): + """For scope handler range(begin, end, annotations) + Note that tir.range is totally the same as tir.serial + """ + + def __init__(self): + def for_range( + begin: PrimExpr, + end: PrimExpr, + annotations: Optional[Mapping[str, Object]] = None, + span: Optional[Span] = None, + ): + return self.create_loop(begin, end, ForKind.SERIAL, annotations=annotations, span=span) + + super().__init__(for_range) + + def signature(self): + return "range", get_param_list(self.func) + + +@register +class Grid(ForScopeHandler): + """ For scope handler tir.grid(extents)""" + + def __init__(self): + def grid(*extents: List[PrimExpr], span: Span): + assert ( + self.node and self.context and self.loop_vars + ), "call 'exit_scope' before 'enter_scope'" + if len(self.loop_vars) != len(extents): + self.context.report_error( + "Inconsistent number of loop vars and extents, " + + f"got {len(self.loop_vars)} vs {len(extents)}", + self.node.span, + ) + body = self.body + for loop_var, extent in zip(reversed(self.loop_vars), reversed(extents)): + body = tvm.tir.For(loop_var, 0, extent, ForKind.SERIAL, body, span=span) + return body + + super().__init__(grid) diff --git a/python/tvm/script/special_stmt.py b/python/tvm/script/special_stmt.py index 62ce1ea19d893..6aa1239e9d790 100644 --- a/python/tvm/script/special_stmt.py +++ b/python/tvm/script/special_stmt.py @@ -17,30 +17,81 @@ """TVM Script Parser Special Stmt Classes""" # pylint: disable=unused-argument, no-self-argument, inconsistent-return-statements # pylint: disable=relative-beyond-top-level +from typing import Callable, List, Optional, Tuple, Any, Mapping, Union + +import synr from synr import ast import tvm.tir +from tvm.runtime import Object from tvm import te -from .utils import get_param_list, from_synr_span +from tvm.ir import Span +from tvm.tir import IntImm +from .utils import ( + get_param_list, + tvm_span_from_synr, + buffer_slice_to_region, + call_with_error_reporting, +) from .registry import register +from .context_maintainer import ContextMaintainer +from .node import BufferSlice + + +def convert_to_int( + value: Union[IntImm, int], + arg_name: str, + report_error: Callable, + span: Union[Span, synr.ast.Span], +) -> int: + """convert a const int or TVM IntImm to Python int. + Reports an error when input cannot be converted to int. + + Parameters + ---------- + value : Union[tvm.tir.IntImm, int] + The input value to be converted. + arg_name : str + Function argument name for error reporting. 
+ report_error: Callable + The report error function handle + span : Union[synr.ast.Span, tvm.ir.Span] + Location of the error + """ + if isinstance(value, IntImm): + return value.value + if isinstance(value, int): + return value + report_error( + f"Expected int or IntImm for {arg_name}, but got {str(type(value))}", + span, + ) class SpecialStmt: """Base class for all Special Stmts""" - def __init__(self, func, def_symbol): - self.func = func - self.def_symbol = def_symbol - self.node = None - self.context = None + def __init__(self, func: Callable, def_symbol: bool): + self.func: Callable = func + self.def_symbol: bool = def_symbol + self.node: Optional[synr.ast.Node] = None + self.context: Optional[ContextMaintainer] = None - def signature(self): + def signature(self) -> Tuple[str, Tuple[list, list, Any]]: return "tir." + self.func.__name__, get_param_list(self.func) - def handle(self, node, context, arg_list, span): + def handle( + self, + node: ast.Node, + context: ContextMaintainer, + arg_list: List[Any], + span: synr.ast.Span, + ): self.node = node self.context = context - return self.func(*arg_list, span=from_synr_span(span)) + return call_with_error_reporting( + context.report_error, span, self.func, *arg_list, span=tvm_span_from_synr(span) + ) @register @@ -67,17 +118,20 @@ def match_buffer( buffer_type="default", span=None, ): - assert isinstance(self.node, ast.Assign) - + if not isinstance(self.node, ast.Assign): + self.context.report_error( + "match_buffer must be assigned to a buffer, e.g. A = match_buffer(...)", + self.node.span, + ) if param not in self.context.func_params: self.context.report_error( "Can not bind non-input param to buffer", self.node.rhs.params[0].span ) if strides is None: strides = [] - align = align.value if not isinstance(align, int) else align - offset_factor = ( - offset_factor.value if not isinstance(offset_factor, int) else offset_factor + align = convert_to_int(align, "align", self.context.report_error, self.node.span) + offset_factor = convert_to_int( + offset_factor, "offset_factor", self.context.report_error, self.node.span ) buffer = tvm.tir.decl_buffer( shape, @@ -93,7 +147,7 @@ def match_buffer( span=span, ) self.context.func_buffer_map[param] = buffer - self.context.update_symbol(self.node.lhs.id.name, buffer) + self.context.update_symbol(self.node.lhs.id.name, buffer, self.node) super().__init__(match_buffer, def_symbol=True) @@ -121,13 +175,17 @@ def buffer_decl( buffer_type="default", span=None, ): - assert isinstance(self.node, ast.Assign) + if not isinstance(self.node, ast.Assign): + self.context.report_error( + "buffer_decl must be assigned to a buffer, e.g. 
A = buffer_decl(...)", + self.node.span, + ) if strides is None: strides = [] - align = align.value if not isinstance(align, int) else align - offset_factor = ( - offset_factor.value if not isinstance(offset_factor, int) else offset_factor + align = convert_to_int(align, "align", self.context.report_error, self.node.span) + offset_factor = convert_to_int( + offset_factor, "offset_factor", self.context.report_error, self.node.span ) buffer = tvm.tir.decl_buffer( shape, @@ -142,21 +200,293 @@ def buffer_decl( buffer_type, span=span, ) - self.context.update_symbol(self.node.lhs.id.name, buffer) + self.context.update_symbol(self.node.lhs.id.name, buffer, self.node) return buffer super().__init__(buffer_decl, def_symbol=True) +@register +class AllocBuffer(SpecialStmt): + """Special function alloc_buffer(shape, dtype, data, strides, elem_offset, scope, align, + offset_factor, buffer_type) + + Example + ------- + .. code-block:: python + + A = tir.alloc_buffer((128, 128), dtype="float32") + """ + + def __init__(self): + def alloc_buffer( + shape, + dtype="float32", + data=None, + strides=None, + elem_offset=None, + scope="", + align=-1, + offset_factor=0, + buffer_type="default", + span=None, + ): + if not isinstance(self.node, ast.Assign): + self.context.report_error( + "alloc_buffer must be assigned to a buffer, e.g. A = alloc_buffer(...)", + self.node.span, + ) + + if strides is None: + strides = [] + align = convert_to_int(align, "align", self.context.report_error, self.node.span) + offset_factor = convert_to_int( + offset_factor, "offset_factor", self.context.report_error, self.node.span + ) + buffer = tvm.tir.decl_buffer( + shape, + dtype, + self.node.lhs.id.name, + data, + strides, + elem_offset, + scope, + align, + offset_factor, + buffer_type, + span=span, + ) + self.context.current_block_scope().alloc_buffers.append(buffer) + self.context.update_symbol(self.node.lhs.id.name, buffer, self.node) + + super().__init__(alloc_buffer, def_symbol=True) + + +@register +class BlockVarBind(SpecialStmt): + """Special function bind(block_iter, binding_value) + + Example + ------- + .. code-block:: python + + tir.bind(vx, i) + """ + + def __init__(self): + def bind(iter_var, values, span=None): + block_scope = self.context.current_block_scope() + if iter_var in block_scope.iter_bindings: + self.context.report_error("Duplicate iter_var bindings of " + str(iter_var), span) + block_scope.iter_bindings[iter_var] = values + + super().__init__(bind, def_symbol=False) + + +@register +class BlockReads(SpecialStmt): + """Special function reads([read_buffer_regions]) + + Example + ------- + .. code-block:: python + + tir.reads([A[vi: vi + 4, vk: vk + 4], B[vk: vk + 4, vj]]) + """ + + def __init__(self): + def reads(read_regions: Union[BufferSlice, List[BufferSlice]], span: Span = None): + assert self.context, "call 'exit_scope' before 'enter_scope'" + block_scope = self.context.current_block_scope() + if block_scope.reads is not None: + self.context.report_error( + "Duplicate write region declaration, " + + "previous one is " + + str(", ".join(str(x) for x in block_scope.reads)), + span, + ) + if isinstance(read_regions, BufferSlice): + read_regions = [read_regions] + if not isinstance(read_regions, list): + self.context.report_error( + "Incorrect input type. 
" + + f"Expected BufferSlice or List[BufferSlice], but got {type(read_regions)}", + span, + ) + block_scope.reads = read_regions + + super().__init__(reads, def_symbol=False) + + +@register +class BlockWrites(SpecialStmt): + """Special function writes([write_buffer_regions]) + + Example + ------- + .. code-block:: python + + tir.writes([C[vi: vi + 4, vj]) + """ + + def __init__(self): + def writes(write_region: Union[BufferSlice, List[BufferSlice]], span: Span = None): + assert self.context, "call 'exit_scope' before 'enter_scope'" + block_scope = self.context.current_block_scope() + if block_scope.writes is not None: + self.context.report_error( + "Duplicate write region declaration, " + + "previous one is " + + str(", ".join(str(x) for x in block_scope.writes)), + span, + ) + if isinstance(write_region, list): + pass + elif isinstance(write_region, BufferSlice): + write_region = [write_region] + else: + self.context.report_error( + "Incorrect input type. " + + f"Expected BufferSlice or List[BufferSlice], but got {type(write_region)}", + span, + ) + block_scope.writes = write_region + + super().__init__(writes, def_symbol=False) + + +@register +class BlockAttr(SpecialStmt): + """Special function block_attr({attr_key: attr_value}) + + Example + ------- + .. code-block:: python + + tir.block_attr({"double_buffer_scope": 1}) + """ + + def __init__(self): + def block_attr(attrs: Mapping[str, Object], span: Span = None): + assert self.context, "call 'exit_scope' before 'enter_scope'" + block_scope = self.context.current_block_scope() + if block_scope.annotations is not None: + self.context.report_error( + "Duplicate block annotations declaration, " + + "previous one is " + + str(block_scope.annotations), + span, + ) + attrs = { + key: tvm.tir.StringImm(val) if isinstance(val, str) else val + for key, val in attrs.items() + } + block_scope.annotations = attrs + + super().__init__(block_attr, def_symbol=False) + + +@register +class BlockPredicate(SpecialStmt): + """Special function where(predicate) + + Example + ------- + .. code-block:: python + + tir.where(i < 4) + """ + + def __init__(self): + def where(predicate, span=None): + assert self.context, "call 'exit_scope' before 'enter_scope'" + block_scope = self.context.current_block_scope() + if block_scope.predicate is not None: + self.context.report_error( + "Duplicate block predicate declaration, " + + "previous one is " + + str(block_scope.predicate), + span, + ) + + block_scope.predicate = predicate + + super().__init__(where, def_symbol=False) + + +@register +class BlockMatchBufferRegion(SpecialStmt): + """Special function match_buffer_region(source, strides, elem_offset, align, offset_factor) + + Example + ------- + .. code-block:: python + + B = tir.match_buffer_region(A[0: 4]) + """ + + def __init__(self): + def match_buffer_region( + source, + strides=None, + elem_offset=None, + align=-1, + offset_factor=0, + span=None, + ): + assert self.context, "call 'exit_scope' before 'enter_scope'" + if not isinstance(self.node, ast.Assign): + self.context.report_error( + "match_buffer_region must be assigned to a buffer, " + + "e.g. 
A = match_buffer_region(...)", + self.node.span, + ) + + if strides is None: + strides = [] + align = convert_to_int(align, "align", self.context.report_error, self.node.span) + offset_factor = convert_to_int( + offset_factor, "offset_factor", self.context.report_error, self.node.span + ) + + if not isinstance(source, BufferSlice): + self.context.report_error( + "match_buffer_region needs a buffer region as source", + span=span, + ) + buffer_region = buffer_slice_to_region(source) + shape = [r.extent for r in buffer_region.region] + buffer = tvm.tir.decl_buffer( + shape, + buffer_region.buffer.dtype, + self.node.lhs.id.name, + data=None, + strides=strides, + elem_offset=elem_offset, + scope=buffer_region.buffer.scope, + data_alignment=align, + offset_factor=offset_factor, + span=span, + ) + self.context.current_block_scope().match_buffers.append( + tvm.tir.MatchBufferRegion(buffer, buffer_region) + ) + self.context.update_symbol(self.node.lhs.id.name, buffer, self.node) + + super().__init__(match_buffer_region, def_symbol=True) + + @register class VarDef(SpecialStmt): """ Special function for defining a Var""" def __init__(self): def var(dtype, span): - assert isinstance(self.node, ast.Assign) + assert isinstance( + self.node, ast.Assign + ), f"VarDef expected ast.Assign but got {type(self.node)}" v = te.var(self.node.lhs.id.name, dtype, span=span) - self.context.update_symbol(v.name, v) + self.context.update_symbol(v.name, v, self.node) super().__init__(var, def_symbol=True) @@ -167,10 +497,12 @@ class EnvThread(SpecialStmt): def __init__(self): def env_thread(env_name, span): - assert isinstance(self.node, ast.Assign) + assert isinstance( + self.node, ast.Assign + ), f"EnvThread expected ast.Assign but got {type(self.node)}" v = te.var(self.node.lhs.id.name, span=span) self.context.func_var_env_dict[v] = env_name - self.context.update_symbol(v.name, v) + self.context.update_symbol(v.name, v, self.node) super().__init__(env_thread, def_symbol=True) diff --git a/python/tvm/script/utils.py b/python/tvm/script/utils.py index a6ba9d087aa6b..f8a0f610d477f 100644 --- a/python/tvm/script/utils.py +++ b/python/tvm/script/utils.py @@ -16,15 +16,32 @@ # under the License. 
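For orientation, a sketch of how the special statements above surface in TVM Script source. It assumes the `tvm.script.tir` decorator and the `ty` annotation stubs used with this parser (both assumptions of this sketch); all concrete names and shapes are illustrative:

.. code-block:: python

    @tvm.script.tir
    def func(a: ty.handle) -> None:
        A = tir.match_buffer(a, (16,), "float32")  # MatchBuffer binds a param
        tx = tir.env_thread("threadIdx.x")         # EnvThread defines the env var
        tir.launch_thread(tx, 16)                  # concise scope: the rest is its body
        A[tx] = 0.0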
"""Helper functions in TVM Script Parser""" +from typing import Callable, List, Any, Optional, Tuple, Union + import inspect -from ..ir import Span, SourceName +import synr + +from tvm.arith import Analyzer +from tvm.ir import Range, Span, SourceName +from tvm.tir import PrimExpr, BufferRegion +from tvm.error import DiagnosticError +from .node import BufferSlice -def get_param_list(func): +def get_param_list( + func: Callable, +) -> Tuple[List[str], List[Tuple[str, Tuple[Any, ...]]], Optional[str]]: """Get the parameter list from definition of function""" - full_arg_spec = inspect.getfullargspec(func) + full_arg_spec: inspect.FullArgSpec = inspect.getfullargspec(func) - args, defaults = full_arg_spec.args, full_arg_spec.defaults + args: List[str] + defaults: Optional[Tuple[Any, ...]] + kwonlyargs: List[str] + args, defaults, kwonlyargs = ( + full_arg_spec.args, + full_arg_spec.defaults, + full_arg_spec.kwonlyargs, + ) if defaults is None: defaults = tuple() @@ -33,14 +50,17 @@ def get_param_list(func): raise RuntimeError( "TVM Script register error : variable keyword argument is not supported now" ) - if not len(full_arg_spec.kwonlyargs) == 0: + + if len(kwonlyargs) == 1 and kwonlyargs[0] == "span": + pass + elif not len(kwonlyargs) == 0: raise RuntimeError("TVM Script register error : keyword only argument is not supported now") - pos_only = list() + pos_only: List[str] = list() for arg in args[: len(args) - len(defaults)]: if arg != "span": pos_only.append(arg) - kwargs = list() + kwargs: List[Tuple[str, Tuple[Any, ...]]] = list() for default, arg in zip(defaults, args[len(args) - len(defaults) :]): if arg != "span": kwargs.append((arg, default)) @@ -48,7 +68,37 @@ def get_param_list(func): return pos_only, kwargs, full_arg_spec.varargs -def from_synr_span(span): +def buffer_slice_to_region( + buffer_slice: BufferSlice, analyzer: Optional[Analyzer] = None +) -> BufferRegion: + """Construct BufferRegion from BufferSlice + + Parameters + ---------- + buffer_slice : BufferSlice + The input BufferSlice + + analyzer : Optional[tvm.arith.Analyzer] + The analyzer for simplifying. If not provided, the method will construct a new one + + Returns + ------- + buffer_region : BufferRegion + The constructed BufferRegion. + """ + region: List[Range] = [] + for s in buffer_slice.slices: + start: Union[PrimExpr, int] = s.start + extent: Union[PrimExpr, int] = 1 if s.stop is None else s.stop - s.start + if not analyzer: + analyzer = Analyzer() + if isinstance(extent, PrimExpr): + extent = analyzer.simplify(extent) + region.append(Range.from_min_extent(start, extent, span=s.span)) + return BufferRegion(buffer_slice.buffer, region) + + +def tvm_span_from_synr(span: synr.ast.Span) -> Span: """Convert a synr span to a TVM span""" return Span( SourceName(span.filename), @@ -57,3 +107,32 @@ def from_synr_span(span): span.start_column, span.end_column, ) + + +def synr_span_from_tvm(span: Span) -> synr.ast.Span: + """Convert a TVM span to a synr span""" + return synr.ast.Span( + span.source_name.name, + span.line, + span.column, + span.end_line, + span.end_column, + ) + + +def call_with_error_reporting( + report_error, + node_span, + func, + *args, + **kwargs, +): + """Call function with exception handling and report error using node_span""" + try: + return func(*args, **kwargs) + except DiagnosticError: + raise + except Exception as err: # pylint: disable=broad-except + # printing last non-empty row of error message. 
+ error_msg = list(filter(None, str(err).split("\n")))[-1] + report_error(error_msg, node_span) diff --git a/python/tvm/tir/analysis/analysis.py b/python/tvm/tir/analysis/analysis.py index 1a3eb48066773..829eb8bbdedba 100644 --- a/python/tvm/tir/analysis/analysis.py +++ b/python/tvm/tir/analysis/analysis.py @@ -106,3 +106,26 @@ def verify_gpu_code(func, constraints): The result of verification. """ return _ffi_api.verify_gpu_code(func, constraints) + + +def get_block_access_region(block, buffer_var_map): + """Detect which regions of tensors in this block are read or written to. + Regions are sorted by order of appearance in the AST. + + Parameters + ---------- + block: tvm.tir.Block + The block in which we are detecting read/write regions. + + buffer_var_map : Dict[Var, Buffer] + The outside buffers which may access the block. Mapping from buffer var to the buffer + + Returns + ------- + result : List[List[BufferRegion]] + Array of access regions. There are three arrays of BufferRegion: + - first: read regions + - second: write regions + - third: opaque regions + """ + return _ffi_api.get_block_access_region(block, buffer_var_map) diff --git a/src/printer/tir_text_printer.cc b/src/printer/tir_text_printer.cc index 8d5bba5e5bb09..788074073c082 100644 --- a/src/printer/tir_text_printer.cc +++ b/src/printer/tir_text_printer.cc @@ -476,8 +476,7 @@ inline const char* ForKind2String(ForKind t) { case ForKind::kUnrolled: return "unroll"; case ForKind::kThreadBinding: - LOG(FATAL) << "Loop ThreadBinding is reserved for future used and " - << "not yet supported in TIR"; + return "thread_binding"; } LOG(FATAL) << "Unknown ForKind"; return "Unknown"; diff --git a/src/printer/tvmscript_printer.cc b/src/printer/tvmscript_printer.cc index 86b175e1676c1..4380795023067 100644 --- a/src/printer/tvmscript_printer.cc +++ b/src/printer/tvmscript_printer.cc @@ -22,6 +22,7 @@ * \brief Printer class to print Tensor IR to python syntax script */ +#include #include #include #include @@ -66,7 +67,10 @@ class TVMScriptPrinter : public StmtFunctor, std::unordered_map func2var_; /*! \brief var collector (var defined by For/Loop/Block) */ std::unordered_set var_not_in_headers; - /*! \brief buffer collector (buffer defined in BufferMap and BufferAllocation)*/ + /*! + * \brief buffer collector + * (buffer defined in BufferMap, BufferAllocation and MatchBufferRegion) + */ std::unordered_set buf_not_in_headers; /*! \brief Map from Var to thread env name */ std::unordered_map var_env_map_; @@ -84,6 +88,8 @@ class TVMScriptPrinter : public StmtFunctor, int num_child_; /*! \brief the number of current node */ int current_num_; + /*! 
\brief loop stack without annotations */ + std::vector loop_stack_; Doc VisitExpr_(const CastNode* op) override; Doc VisitExpr_(const VarNode* op) override; @@ -131,6 +137,7 @@ class TVMScriptPrinter : public StmtFunctor, Doc VisitStmt_(const ForNode* op) override; Doc VisitStmt_(const PrefetchNode* op) override; Doc VisitStmt_(const EvaluateNode* op) override; + Doc VisitStmt_(const BlockRealizeNode* op) override; Doc VisitStmtDefault_(const Object* op) override; Doc VisitType_(const PrimTypeNode* node) override; @@ -145,12 +152,24 @@ class TVMScriptPrinter : public StmtFunctor, Doc PrintArray(const ArrayNode* op); Doc PrintBuffer(const BufferNode* op); Doc AllocBufferDeclaration(const Buffer& buf); + Doc PrintBufferRegion(const BufferRegionNode* op); + Doc PrintMatchBufferRegion(const MatchBufferRegionNode* op); + Doc PrintAnnotations(const Map& annotations); static Doc PrintString(const StringObj* op) { return Doc::StrLiteral(op->data); } Doc GetUniqueName(std::string prefix); Doc AllocVar(const Var& var); Doc AllocBuf(const Buffer& buffer); + /*! Helper functions for loop printing. */ + /*! + * \brief Print a single for loop + * \param loop The for loop to be printed + */ + Doc PrintLoop(const For& loop); + /*! \brief Print all simple loops in stack into one line using tir.grid(). */ + Doc PrintLoopStack(); + /*! * \brief Print additional info about expr in comment. * \param expr The expression. @@ -308,6 +327,36 @@ Doc TVMScriptPrinter::AllocBuf(const Buffer& buffer) { return val; } +Doc TVMScriptPrinter::PrintMatchBufferRegion(const MatchBufferRegionNode* op) { + const Buffer& buf = op->buffer; + buf_not_in_headers.insert(buf.get()); + + Doc doc = Print(op->buffer) << " = tir.match_buffer_region(" << Print(op->source); + if (!buf->strides.empty()) { + doc << ", strides=" << Print(buf->strides); + } + if (buf->offset_factor != 0 && buf->elem_offset->IsInstance()) { + Var elem_offset = Downcast(buf->elem_offset); + if (memo_var_.find(elem_offset) != memo_var_.end()) { + doc << ", elem_offset=" << Print(buf->elem_offset); + } else { + // implicitly define elem_offset + memo_var_[elem_offset] = Doc::Text(memo_buf_[buf].str() + ".elem_offset"); + var_not_in_headers.insert(elem_offset.get()); + } + } else { + doc << ", elem_offset=" << Print(buf->elem_offset); + } + if (buf->data_alignment != -1) { + doc << ", align=" << buf->data_alignment; + } + if (buf->offset_factor != 0) { + doc << ", offset_factor=" << buf->offset_factor; + } + doc << ")"; + return doc; +} + Doc TVMScriptPrinter::Print(const ObjectRef& node) { if (!node.defined()) return Doc::Text("None"); if (node->IsInstance()) { @@ -330,6 +379,10 @@ Doc TVMScriptPrinter::Print(const ObjectRef& node) { return PrintIterVar(node.as()); } else if (node->IsInstance()) { return PrintRange(node.as()); + } else if (node->IsInstance()) { + return PrintBufferRegion(node.as()); + } else if (node->IsInstance()) { + return PrintMatchBufferRegion(node.as()); } else { meta_collector_.Collect(node); return this->meta_.GetMetaNode(node); @@ -660,9 +713,7 @@ inline const char* ForKind2String(ForKind t) { case ForKind::kUnrolled: return "unroll"; case ForKind::kThreadBinding: - LOG(FATAL) << "Loop ThreadBinding is reserved for future used and " - << "not yet supported in TIR"; - return "threadbinding"; + return "thread_binding"; } LOG(FATAL) << "Unknown ForKind"; return "Unknown"; @@ -671,9 +722,27 @@ inline const char* ForKind2String(ForKind t) { Doc TVMScriptPrinter::VisitStmt_(const ForNode* op) { Doc doc; 
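+  // A loop is "simple" if it is serial, starts at zero and carries no
+  // annotations; consecutive simple loops are accumulated on loop_stack_ and
+  // emitted together by PrintLoopStack(), as a single `tir.grid(...)` line
+  // when more than one is pending.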
var_not_in_headers.insert(op->loop_var.get()); - doc << "for " << Print(op->loop_var) << " in tir." + std::string(ForKind2String(op->kind)) + "(" - << Print(op->min) << ", " << Print(op->min + op->extent) - << "):" << Doc::Indent(4, Doc::NewLine() << PrintBody(op->body)); + const auto* body = op->body.as(); + bool simple_loop = op->kind == ForKind::kSerial && op->annotations.empty() && is_zero(op->min); + if (simple_loop) loop_stack_.push_back(GetRef(op)); + // It is a loop that can be compressed, let the loops below print it out + if (simple_loop && body != nullptr) return Print(GetRef(body)); + // It is a loop that can not be compressed + bool print_above = !loop_stack_.empty(); + // print loops above if needed + if (print_above) { + doc << PrintLoopStack(); + loop_stack_.clear(); + } + if (!simple_loop) { + // print current loop if needed + Doc current_loop; + current_loop << PrintLoop(GetRef(op)); + current_loop << Doc::Indent(4, Doc::NewLine() << PrintBody(op->body)); + doc << (print_above ? Doc::Indent(4, Doc::NewLine() << current_loop) : current_loop); + } else { + doc << Doc::Indent(4, Doc::NewLine() << PrintBody(op->body)); + } return doc; } @@ -713,6 +782,88 @@ Doc TVMScriptPrinter::VisitStmt_(const BufferStoreNode* op) { return doc; } +Doc TVMScriptPrinter::VisitStmt_(const BlockRealizeNode* op) { + const auto* block_op = op->block.as(); + // print block name and block vars + Doc doc; + doc << "with tir.block(["; + std::vector block_var_docs; + for (const auto& iter_var : block_op->iter_vars) { + Doc block_var_doc; + if (is_zero(iter_var->dom->min) && iter_var->iter_type == kDataPar) { + block_var_doc << Print(iter_var->dom->extent); + } else { + block_var_doc << "tir."; + switch (iter_var->iter_type) { + case kDataPar: + block_var_doc << "range"; + break; + case kCommReduce: + block_var_doc << "reduce_axis"; + break; + case kOrdered: + block_var_doc << "scan_axis"; + break; + case kOpaque: + block_var_doc << "opaque_axis"; + break; + default: + LOG(FATAL) << "Unknown block var iter type: " << iter_var->iter_type; + break; + } + block_var_doc << "(" << Print(iter_var->dom->min) << ", " + << Print(iter_var->dom->min + iter_var->dom->extent) << ")"; + } + block_var_docs.push_back(block_var_doc); + } + doc << PrintSep(block_var_docs, Doc::Text(", ")) << "], "; + doc << Doc::StrLiteral(block_op->name_hint) << ")"; + std::vector block_var_names; + for (const auto& iter_var : block_op->iter_vars) { + var_not_in_headers.insert(iter_var->var.get()); + block_var_names.push_back(Print(iter_var->var)); + } + if (!block_var_names.empty()) { + doc << " as [" << PrintSep(block_var_names, Doc::Text(", ")) << "]"; + } + doc << ":"; + Doc block_attr_doc; + // print predicate, binding, read/write tensor region, annotations + if (!is_one(op->predicate)) { + block_attr_doc << Doc::NewLine() << "tir.where(" << Print(op->predicate) << ")"; + } + for (size_t i = 0; i < block_op->iter_vars.size(); ++i) + block_attr_doc << Doc::NewLine() << "tir.bind(" << Print(block_op->iter_vars[i]->var) << ", " + << Print(op->iter_values[i]) << ")"; + block_attr_doc << Doc::NewLine() << "tir.reads(" << Print(block_op->reads) << ")"; + block_attr_doc << Doc::NewLine() << "tir.writes(" << Print(block_op->writes) << ")"; + if (!block_op->annotations.empty()) { + block_attr_doc << Doc::NewLine() << "tir.block_attr({"; + block_attr_doc << PrintAnnotations(block_op->annotations); + block_attr_doc << "})"; + } + // print body + Doc body; + body << Doc::NewLine(); + for (const auto& alloc_buf : block_op->alloc_buffers) { + 
buf_not_in_headers.insert(alloc_buf.get()); + body << Print(alloc_buf) << " = tir.alloc_buffer(" << memo_buf_decl_[alloc_buf] << ")" + << Doc::NewLine(); + } + for (const auto& match_buf : block_op->match_buffers) { + body << Print(match_buf) << Doc::NewLine(); + } + if (block_op->init.defined()) { + Doc init_block; + init_block << "with tir.init():"; + init_block << Doc::Indent(4, Doc::NewLine() << PrintBody(block_op->init.value())); + body << init_block << Doc::NewLine(); + } + body << PrintBody(block_op->body); + doc << Doc::Indent(4, block_attr_doc << body); + return doc; +} + Doc TVMScriptPrinter::PrintBody(const Stmt& body) { int memo_num_child, memo_current_num; std::swap(memo_num_child, num_child_); @@ -890,6 +1041,73 @@ Doc TVMScriptPrinter::PrintBuffer(const BufferNode* op) { return meta_.InMeta(buffer) ? meta_.GetMetaNode(buffer) : AllocBuf(buffer); } +Doc TVMScriptPrinter::PrintBufferRegion(const BufferRegionNode* op) { + Doc doc; + doc << Print(op->buffer) << "["; + for (size_t i = 0; i < op->region.size(); ++i) { + if (i != 0) doc << ", "; + const auto& range = op->region[i]; + if (!is_one(range->extent)) { + doc << Print(range->min) << ":" << Print(range->min + range->extent); + } else { + doc << Print(range->min); + } + } + doc << "]"; + return doc; +} + +Doc TVMScriptPrinter::PrintAnnotations(const Map& annotations) { + Doc res; + std::vector> anno_list; + anno_list.reserve(annotations.size()); + for (const auto& pair : annotations) { + anno_list.emplace_back(pair); + } + sort(anno_list.begin(), anno_list.end()); + for (size_t i = 0; i < anno_list.size(); ++i) { + if (i != 0) { + res << ", "; + } + res << "\"" << anno_list[i].first << "\":" << Print(anno_list[i].second); + } + return res; +} + +Doc TVMScriptPrinter::PrintLoop(const For& loop) { + Doc res; + res << "for " << Print(loop->loop_var) + << " in tir." + std::string(ForKind2String(loop->kind)) + "(" << Print(loop->min) << ", " + << Print(loop->min + loop->extent); + if (loop->thread_binding.defined()) { + res << ", thread = "; + res << Print(loop->thread_binding.value()->thread_tag); + } + if (!loop->annotations.empty()) { + res << ", annotation = {"; + res << PrintAnnotations(loop->annotations); + res << "}"; + } + res << "):"; + return res; +} + +Doc TVMScriptPrinter::PrintLoopStack() { + Doc res; + if (loop_stack_.size() == 1) { + res << PrintLoop(loop_stack_[0]); + } else if (loop_stack_.size() > 1) { + std::vector vars, extents; + for (const auto& loop : loop_stack_) { + vars.push_back(Print(loop->loop_var)); + extents.push_back(Print(loop->extent)); + } + res << "for " << PrintSep(vars, Doc::Text(", ")) << " in tir.grid(" + << PrintSep(extents, Doc::Text(", ")) << "):"; + } + return res; +} + TVM_REGISTER_GLOBAL("script.AsTVMScript") .set_body_typed([](const ObjectRef& functions, bool show_meta) { diff --git a/src/tir/analysis/block_access_region_detector.cc b/src/tir/analysis/block_access_region_detector.cc new file mode 100644 index 0000000000000..b1da536f1dade --- /dev/null +++ b/src/tir/analysis/block_access_region_detector.cc @@ -0,0 +1,246 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file tir/analysis/block_region_detector.cc
+ * \brief Detect block read/write regions by visiting its body
+ */
+
+#include <tvm/arith/int_set.h>
+#include <tvm/tir/analysis.h>
+#include <tvm/tir/stmt_functor.h>
+
+namespace tvm {
+namespace tir {
+
+/*!
+ * \brief Detect which regions of tensors in this block are read or written to. Regions are sorted
+ * by order of appearance in the AST.
+ * \note This detector can only visit blocks and will not visit child blocks recursively.
+ */
+class BlockReadWriteDetector : public StmtExprVisitor {
+ public:
+  explicit BlockReadWriteDetector(const Map<Var, Buffer>& buffer_var_map)
+      : buffer_var_map_(buffer_var_map) {}
+
+  /*! \brief Return read regions of the block */
+  Array<BufferRegion> CollectReads();
+  /*! \brief Return write regions of the block */
+  Array<BufferRegion> CollectWrites();
+  /*!
+   * \brief Return opaque buffer regions of the block
+   * \note The buffer accessed by load/store or call with buffer.data will
+   *       be marked as opaque.
+   */
+  Array<BufferRegion> CollectOpaques();
+  /*! \brief Overload operator() to make sure it accepts a block node */
+  void operator()(const Stmt& stmt);
+
+ private:
+  /*! \brief Iteration range for loop_vars */
+  std::unordered_map<const VarNode*, arith::IntSet> dom_map_;
+  /*! \brief The buffers that the current block reads */
+  std::vector<Buffer> read_buffers_;
+  /*! \brief The buffers that the current block writes */
+  std::vector<Buffer> writes_buffers_;
+  /*! \brief The opaque buffers which are accessed via buffer.data */
+  std::vector<Buffer> opaque_buffers_;
+  /*! \brief The read regions of the current block */
+  std::vector<std::vector<arith::IntSet>> read_regions_;
+  /*! \brief The write regions of the current block */
+  std::vector<std::vector<arith::IntSet>> write_regions_;
+  /*! \brief The outside buffer data mapping to its buffer */
+  Map<Var, Buffer> buffer_var_map_;
+  /*! \brief The analyzer for simplifying */
+  arith::Analyzer analyzer_;
+
+  /*!
+   * \brief Update read/write buffers and regions with provided buffer and region
+   * \param buffers The buffers should be updated
+   * \param regions The access regions should be updated
+   * \param buffer The provided buffer
+   * \param region The provided region
+   */
+  void Update(std::vector<Buffer>* buffers, std::vector<std::vector<arith::IntSet>>* regions,
+              const Buffer& buffer, const std::vector<arith::IntSet>& region);
+
+  /*! \brief Helper function to collect access regions. */
+  Array<BufferRegion> CollectRegions(const std::vector<Buffer>& buffers,
+                                     const std::vector<std::vector<arith::IntSet>>& regions);
+
+  /*! \brief Helper function to add an opaque buffer. */
+  void AddOpaque(const Var& buffer_var);
+
+  void VisitStmt_(const ForNode* op) override;
+  void VisitStmt_(const BlockRealizeNode* op) override;
+  void VisitStmt_(const BufferStoreNode* op) override;
+  void VisitStmt_(const StoreNode* op) override;
+  void VisitExpr_(const BufferLoadNode* op) override;
+  void VisitExpr_(const LoadNode* op) override;
+  void VisitExpr_(const VarNode* op) override;
+};
+
+void BlockReadWriteDetector::operator()(const Stmt& stmt) {
+  ICHECK(stmt.as<BlockNode>() != nullptr)
+      << "Only visiting Blocks is allowed, but got " << stmt->GetTypeKey();
+  StmtExprVisitor::operator()(stmt);
+}
+
+Array<BufferRegion> BlockReadWriteDetector::CollectReads() {
+  return CollectRegions(read_buffers_, read_regions_);
+}
+
+Array<BufferRegion> BlockReadWriteDetector::CollectWrites() {
+  return CollectRegions(writes_buffers_, write_regions_);
+}
+
+Array<BufferRegion> BlockReadWriteDetector::CollectOpaques() {
+  Array<BufferRegion> res;
+  res.reserve(opaque_buffers_.size());
+  for (const Buffer& buffer : opaque_buffers_) {
+    res.push_back(BufferRegion::FullRegion(buffer));
+  }
+  return res;
+}
+
+void BlockReadWriteDetector::VisitExpr_(const VarNode* op) { AddOpaque(GetRef<Var>(op)); }
+
+void BlockReadWriteDetector::VisitExpr_(const LoadNode* op) {
+  AddOpaque(op->buffer_var);
+  ExprVisitor::VisitExpr_(op);
+}
+
+void BlockReadWriteDetector::VisitExpr_(const BufferLoadNode* op) {
+  std::vector<arith::IntSet> relaxed_region;
+  for (const PrimExpr& index : op->indices) {
+    relaxed_region.push_back(arith::EvalSet(index, dom_map_));
+  }
+  Update(&read_buffers_, &read_regions_, op->buffer, relaxed_region);
+  ExprVisitor::VisitExpr_(op);
+}
+
+void BlockReadWriteDetector::VisitStmt_(const ForNode* op) {
+  Range range = Range::FromMinExtent(op->min, op->extent);
+  dom_map_[op->loop_var.get()] = arith::IntSet::FromRange(range);
+  StmtVisitor::VisitStmt_(op);
+  dom_map_.erase(op->loop_var.get());
+}
+
+void BlockReadWriteDetector::VisitStmt_(const StoreNode* op) {
+  AddOpaque(op->buffer_var);
+  StmtVisitor::VisitStmt_(op);
+}
+
+void BlockReadWriteDetector::VisitStmt_(const BufferStoreNode* op) {
+  std::vector<arith::IntSet> relaxed_region;
+  for (const PrimExpr& index : op->indices) {
+    relaxed_region.push_back(arith::EvalSet(index, dom_map_));
+  }
+  Update(&writes_buffers_, &write_regions_, op->buffer, relaxed_region);
+  StmtVisitor::VisitStmt_(op);
+}
+
+void BlockReadWriteDetector::VisitStmt_(const BlockRealizeNode* op) {
+  /*! \note detector will not visit child block recursively, so it will stop here */
+   * \note The detector will not visit child blocks recursively, so it stops here.
+   */
+  std::unordered_map<const VarNode*, PrimExpr> vmap;
+  for (size_t i = 0; i < op->block->iter_vars.size(); ++i) {
+    vmap[op->block->iter_vars[i]->var.get()] = op->iter_values[i];
+  }
+  for (const auto& read : op->block->reads) {
+    std::vector<arith::IntSet> relaxed_region;
+    for (const auto& range : read->region) {
+      relaxed_region.push_back(
+          arith::EvalSet(arith::IntSet::FromRange(Range::FromMinExtent(
+                             Substitute(range->min, vmap), Substitute(range->extent, vmap))),
+                         dom_map_));
+    }
+    Update(&read_buffers_, &read_regions_, read->buffer, relaxed_region);
+  }
+  for (const auto& write : op->block->writes) {
+    std::vector<arith::IntSet> relaxed_region;
+    for (const auto& range : write->region) {
+      relaxed_region.push_back(
+          arith::EvalSet(arith::IntSet::FromRange(Range::FromMinExtent(
+                             Substitute(range->min, vmap), Substitute(range->extent, vmap))),
+                         dom_map_));
+    }
+    Update(&writes_buffers_, &write_regions_, write->buffer, relaxed_region);
+  }
+}
+
+void BlockReadWriteDetector::Update(std::vector<Buffer>* buffers,
+                                    std::vector<std::vector<arith::IntSet>>* regions,
+                                    const Buffer& buffer,
+                                    const std::vector<arith::IntSet>& region) {
+  if (buffer_var_map_.find(buffer->data) == buffer_var_map_.end()) return;
+  ICHECK_EQ(buffers->size(), regions->size())
+      << "Expected the buffers and regions to have the same size";
+  for (size_t i = 0; i < regions->size(); ++i) {
+    if ((*buffers)[i].same_as(buffer)) {
+      ICHECK_EQ((*regions)[i].size(), region.size()) << "Inconsistent buffer dimension";
+      for (size_t j = 0; j < region.size(); ++j) {
+        (*regions)[i][j] = arith::Union({(*regions)[i][j], region[j]});
+      }
+      return;
+    }
+  }
+  buffers->push_back(buffer);
+  regions->push_back(region);
+}
+
+Array<BufferRegion> BlockReadWriteDetector::CollectRegions(
+    const std::vector<Buffer>& buffers,
+    const std::vector<std::vector<arith::IntSet>>& regions) {
+  ICHECK_EQ(buffers.size(), regions.size());
+  Array<BufferRegion> res;
+  res.reserve(buffers.size());
+  for (size_t i = 0; i < regions.size(); ++i) {
+    Array<Range> region;
+    region.reserve(regions[i].size());
+    for (size_t j = 0; j < regions[i].size(); j++) {
+      tvm::arith::IntSet range = regions[i][j];
+      region.push_back(range.CoverRange(Range::FromMinExtent(0, buffers[i]->shape[j])));
+    }
+    res.push_back(BufferRegion(buffers[i], region));
+  }
+  return res;
+}
+
+void BlockReadWriteDetector::AddOpaque(const Var& buffer_var) {
+  auto it = buffer_var_map_.find(buffer_var);
+  if (it != buffer_var_map_.end()) {
+    const Buffer& buffer = (*it).second;
+    for (const Buffer& opaque_buffer : opaque_buffers_) {
+      if (buffer.same_as(opaque_buffer)) return;
+    }
+    opaque_buffers_.push_back(buffer);
+  }
+}
+
+Array<Array<BufferRegion>> GetBlockAccessRegion(const Block& block,
+                                                const Map<Var, Buffer>& buffer_var_map) {
+  BlockReadWriteDetector detector(buffer_var_map);
+  detector(block);
+  return {detector.CollectReads(), detector.CollectWrites(), detector.CollectOpaques()};
+}
+
+TVM_REGISTER_GLOBAL("tir.analysis.get_block_access_region").set_body_typed(GetBlockAccessRegion);
+
+}  // namespace tir
+}  // namespace tvm
diff --git a/src/tir/ir/script/script_complete.cc b/src/tir/ir/script/script_complete.cc
new file mode 100644
index 0000000000000..7c9fff724e333
--- /dev/null
+++ b/src/tir/ir/script/script_complete.cc
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file tir/ir/script/script_complete.cc
+ * \brief Used by TVM Script parser to expand incomplete TIR input
+ */
+
+#include <tvm/runtime/registry.h>
+#include <tvm/tir/analysis.h>
+#include <tvm/tir/function.h>
+#include <tvm/tir/op.h>
+#include <tvm/tir/stmt_functor.h>
+
+#include <utility>
+
+namespace tvm {
+namespace tir {
+
+/*! \brief Generate surrounding loops automatically */
+class ScriptCompleter : public StmtMutator {
+ public:
+  explicit ScriptCompleter(Map<Var, Buffer>* buffer_var_map) : buffer_var_map_(buffer_var_map) {}
+  /*! \brief Whether the stmt contains at least one block. */
+  bool contains_block = false;
+
+ private:
+  Map<Var, Buffer>* buffer_var_map_;
+  Stmt VisitStmt_(const BlockRealizeNode* op) override {
+    contains_block = true;
+    Stmt body = StmtMutator::VisitStmt_(op);
+    if (!op->iter_values.empty() && !op->iter_values[0].dtype().is_int()) {
+      auto block_with_binding = CopyOnWrite(Downcast<BlockRealize>(body).get());
+      std::vector<PrimExpr> bindings;
+      for (size_t i = 0; i < op->iter_values.size(); ++i) {
+        bindings.push_back(Var("i" + std::to_string(i)));
+      }
+      block_with_binding->iter_values = bindings;
+      body = BlockRealize(block_with_binding);
+      for (int i = op->iter_values.size() - 1; i >= 0; --i) {
+        body = For(Downcast<Var>(bindings[i]), op->block->iter_vars[i]->dom->min,
+                   op->block->iter_vars[i]->dom->extent, {}, body);
+      }
+    }
+    return body;
+  }
+
+  Stmt VisitStmt_(const BlockNode* op) override {
+    // Buffers allocated in the block can be accessed by its body.
+    for (const auto& alloc_buffer : op->alloc_buffers) {
+      buffer_var_map_->Set(alloc_buffer->data, alloc_buffer);
+    }
+    Block block = Downcast<Block>(StmtMutator::VisitStmt_(op));
+    // Remove buffers allocated inside block to detect its access region
+    for (const auto& alloc_buffer : op->alloc_buffers) {
+      buffer_var_map_->erase(alloc_buffer->data);
+    }
+    if (block->reads.empty() || block->writes.empty()) {
+      auto access_region = GetBlockAccessRegion(block, *buffer_var_map_);
+      const Array<BufferRegion>& reads = access_region[0];
+      const Array<BufferRegion>& writes = access_region[1];
+      const Array<BufferRegion>& opaque = access_region[2];
+      CHECK(opaque.empty())
+          << "ValueError: Can not auto detect buffer access region from tir.Load, tir.Store or "
+             "direct access by buffer data. Please annotate the access region manually";
+      auto n = CopyOnWrite(block.operator->());
+      if (!n->reads.defined()) n->reads = reads;
+      if (!n->writes.defined()) n->writes = writes;
+      return Block(n);
+    } else {
+      return std::move(block);
+    }
+  }
+};
+
+PrimFunc ScriptComplete(PrimFunc func, const Array<Buffer>& root_allocates) {
+  Map<Var, Buffer> buffer_var_map;
+  for (const auto& pair : func->buffer_map) {
+    const Buffer& buffer = pair.second;
+    buffer_var_map.Set(buffer->data, buffer);
+  }
+  for (const auto& alloc : root_allocates) {
+    buffer_var_map.Set(alloc->data, alloc);
+  }
+  ScriptCompleter script_completer(&buffer_var_map);
+  // generate surrounding loops automatically
+  Stmt res = script_completer(func->body);
+  // generate root block automatically
+  if (script_completer.contains_block &&
+      (!res->IsInstance<BlockRealizeNode>() || !root_allocates.empty())) {
+    res = Block({}, {}, {}, "root", res, NullOpt, root_allocates);
+    res = BlockRealize({}, Bool(true), Downcast<Block>(res));
+  }
+  if (func->body.same_as(res)) {
+    return func;
+  } else {
+    auto fptr = func.CopyOnWrite();
+    fptr->body = res;
+    return func;
+  }
+}
+
+TVM_REGISTER_GLOBAL("script.Complete").set_body_typed(ScriptComplete);
+
+}  // namespace tir
+}  // namespace tvm
diff --git a/tests/python/unittest/test_tir_analysis_get_block_access_region.py b/tests/python/unittest/test_tir_analysis_get_block_access_region.py
new file mode 100644
index 0000000000000..7e4d7d87c1e19
--- /dev/null
+++ b/tests/python/unittest/test_tir_analysis_get_block_access_region.py
@@ -0,0 +1,57 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
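+
+# What the detector returns (see src/tir/analysis/block_region_detector.cc):
+# tir.analysis.get_block_access_region(block, buffer_var_map) yields three
+# lists of BufferRegions: [reads, writes, opaque]. A buffer touched only
+# through its raw data pointer (here, `tir.evaluate(D.data)` below) cannot
+# have its indices inferred, so it is reported in the opaque list as a
+# conservative full-buffer region.
+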
+import tvm +from tvm import tir, script +from tvm.ir import Range + + +@tvm.script.tir +def func() -> None: + A = tir.alloc_buffer((128, 128), "float32") + B = tir.alloc_buffer((128, 128), "float32") + C = tir.alloc_buffer((128, 128), "float32") + D = tir.alloc_buffer((128, 128), "float32") + with tir.block([]): + # Need add read/write region manually to avoid triggering block access region detector + tir.reads([B[0, 0], C[0:16, 0:16], A[4:12, 4:12]]) + tir.writes([A[0:12, 0:12]]) + for i, j in tir.grid(8, 8): + A[i, j] = B[0, 0] + C[0, 0] + with tir.block([2, 2]) as [vi, vj]: + tir.reads([A[vi * 4 + 4 : vi * 4 + 8, vj * 4 + 4 : vj * 4 + 8], C[12:16, 12:16]]) + tir.writes([A[vi * 4 + 4 : vi * 4 + 8, vj * 4 + 4 : vj * 4 + 8]]) + for i, j in tir.grid(4, 4): + A[vi * 4 + 4 + i, vj * 4 + 4 + j] += C[i + 12, j + 12] + tir.evaluate(D.data) + + +def test_block_access_region_detector(): + block = func.body.block.body.block + alloc_buffers = func.body.block.alloc_buffers + buffer_var_map = {buf.data: buf for buf in alloc_buffers} + ret = tir.analysis.get_block_access_region(block, buffer_var_map) + + tvm.ir.assert_structural_equal(block.reads, ret[0]) + tvm.ir.assert_structural_equal(block.writes, ret[1]) + D = alloc_buffers[-1] + tvm.ir.assert_structural_equal( + [tvm.tir.BufferRegion(D, [Range(0, 128), Range(0, 128)])], ret[2] + ) + + +if __name__ == "__main__": + test_block_access_region_detector() diff --git a/tests/python/unittest/test_tvmscript_error_report.py b/tests/python/unittest/test_tvmscript_error_report.py index 048a9544d6df7..052217b32cb58 100644 --- a/tests/python/unittest/test_tvmscript_error_report.py +++ b/tests/python/unittest/test_tvmscript_error_report.py @@ -144,6 +144,197 @@ def test_no_body(): check_error(no_body, 3) +def allocate_with_buffers() -> None: + with tir.allocate([1], "float32", "") as [A, B]: # error + tir.evaluate(1.0) + + +def test_allocate_with_buffers(): + check_error(allocate_with_buffers, 2) + + +def inconsistent_binding() -> None: + with tir.block([128, 128]) as [vi]: # error + tir.evaluate(1.0) + + +def test_inconsistent_binding(): + check_error(inconsistent_binding, 2) + + +def invalid_block_axes(a: ty.handle) -> None: + A = tir.match_buffer(a, (16, 16), "float32") + with tir.block([A]) as [vi]: # error + tir.evaluate(1.0) + + +def test_invalid_block_axes(): + check_error(invalid_block_axes, 3) + + +def miss_block_bind() -> None: + with tir.block([16, 16]) as [vi, vj]: # error + tir.bind(vi, 1) + tir.evaluate(1.0) + + +def test_miss_block_bind(): + check_error(miss_block_bind, 2) + + +def invalid_loop_var() -> None: + for i, j in range(0, 16): # error + tir.evaluate(1.0) + + +def test_invalid_loop_var(): + check_error(invalid_loop_var, 2) + + +def inconsistent_grid() -> None: + for i in tir.grid(16, 16): # error + tir.evaluate(1.0) + + +def test_inconsistent_grid(): + check_error(inconsistent_grid, 2) + + +def invalid_match_buffer_region() -> None: + with tir.block([16, 16]) as [vi, vj]: + A = tir.match_buffer_region(vi) # error + tir.evaluate(1.0) + + +def test_invalid_match_buffer_region(): + check_error(invalid_match_buffer_region, 3) + + +def duplicate_buffer() -> None: + A = tir.alloc_buffer((128, 128), "float32") + with tir.block([16, 16]) as [vi, vj]: + A = tir.alloc_buffer((128, 128), "float32") # error + tir.evaluate(1.0) + + +def test_duplicate_buffer(): + check_error(duplicate_buffer, 4) + + +def duplicate_reads() -> None: + A = tir.alloc_buffer((128, 128), "float32") + with tir.block([16, 16]) as [vi, vj]: + tir.reads(A[0:8, 0:8]) + 
tir.reads(A[0:16, 0:16]) # error + tir.evaluate(1.0) + + +def duplicate_writes() -> None: + A = tir.alloc_buffer((128, 128), "float32") + with tir.block([16, 16]) as [vi, vj]: + tir.writes(A[0:8, 0:8]) + tir.writes(A[0:16, 0:16]) # error + tir.evaluate(1.0) + + +def duplicate_predicate() -> None: + with tir.block([16, 16]) as [vi, vj]: + tir.where(1) + tir.where(0) # error + + +def duplicate_annotations() -> None: + with tir.block([16, 16]) as [vi, vj]: + tir.block_attr({}) + tir.block_attr({}) # error + + +def duplicate_init() -> None: + with tir.block([16, 16]) as [vi, vj]: + with tir.init(): + tir.evaluate(1.0) + with tir.init(): # error + tir.evaluate(1.0) + + +def test_duplicate_block_signature(): + check_error(duplicate_reads, 5) + check_error(duplicate_writes, 5) + check_error(duplicate_predicate, 4) + check_error(duplicate_annotations, 4) + check_error(duplicate_init, 5) + + +def opaque_access_during_complete(a: ty.handle) -> None: # error + A = tir.match_buffer(a, (16, 16), "float32") + with tir.block([16, 16]) as [vi, vj]: + tir.evaluate(tir.load("float32", A.data, vi * 16 + vj)) + + +def test_opaque_access_during_complete(): + check_error(opaque_access_during_complete, 1) + + +def convert_slice_to_bufferload() -> None: + A = tir.alloc_buffer((128, 128), "float32") + with tir.block([16, 16]) as [vi, vj]: + A[vi, vj] = A[vi : vi + 2, vj] + 1 # error + + +def test_convert_slice_to_bufferload(): + check_error(convert_slice_to_bufferload, 4) + + +def error_index_type() -> None: + A = tir.alloc_buffer((128, 128), "float32") + with tir.block([16, 16]) as [vi, vj]: + A[vi, vj] = A[vi, 0.0] + 1 # error + + +def test_error_index_type(): + check_error(error_index_type, 4) + + +def mismatch_args() -> None: + A = tir.alloc_buffer((128, 128), "float32") + with tir.block([16, 16]) as [vi, vj]: + tir.reads(A[0, 0], A[1, 1]) # error + tir.evaluate(1.0) + + +def test_mismatch_args(): + check_error(mismatch_args, 4) + + +def special_stmt_except() -> None: + A = tir.alloc_buffer("(128, 128)", "float32") # error + with tir.block([16, 16]) as [vi, vj]: + tir.evaluate(1.0) + + +def scope_handler_except() -> None: + for i in tir.serial("1", "1"): # error + tir.evaluate(1) + + +def intrin_except_unassign(a: ty.handle) -> None: + A = tir.match_buffer(a, (16, 16), "float32") + tir.evaluate(A) # error + + +def intrin_except_assign(a: ty.handle) -> None: + A = tir.match_buffer(a, (16, 16), "float32") + A[0, 0] = tir.load(A, A, A) # error + + +def test_tvm_exception_catch(): + # test catching c++ side exception + check_error(special_stmt_except, 2) + check_error(scope_handler_except, 2) + check_error(intrin_except_unassign, 3) + check_error(intrin_except_assign, 3) + + def check_error(module, rel_lineno): # Override the default renderer to accumulate errors _, start_line = inspect.getsourcelines(module) @@ -180,3 +371,17 @@ def render(e): test_return_not_allowed() test_tir_assert() test_no_body() + test_allocate_with_buffers() + test_inconsistent_binding() + test_invalid_block_axes() + test_miss_block_bind() + test_invalid_loop_var() + test_inconsistent_grid() + test_invalid_match_buffer_region() + test_duplicate_buffer() + test_duplicate_block_signature() + test_opaque_access_during_complete() + test_convert_slice_to_bufferload() + test_error_index_type() + test_mismatch_args() + test_tvm_exception_catch() diff --git a/tests/python/unittest/test_tvmscript_roundtrip.py b/tests/python/unittest/test_tvmscript_roundtrip.py index c7a38cccda493..a295908afa6a8 100644 --- 
a/tests/python/unittest/test_tvmscript_roundtrip.py +++ b/tests/python/unittest/test_tvmscript_roundtrip.py @@ -2662,6 +2662,169 @@ def test_opt_conv_tensorcore_mod_host(): tvm.ir.assert_structural_equal(mod, rt_mod, True) +@tvm.script.tir +def matmul(a: ty.handle, b: ty.handle, c: ty.handle) -> None: + A = tir.match_buffer(a, [128, 128]) + B = tir.match_buffer(b, [128, 128]) + C = tir.match_buffer(c, [128, 128]) + + with tir.block([128, 128, tir.reduce_axis(0, 128)], "update") as [vi, vj, vk]: + with tir.init(): + C[vi, vj] = tir.float32(0) + C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk] + + +@tvm.script.tir +def matmul_original(a: ty.handle, b: ty.handle, c: ty.handle) -> None: + A = tir.match_buffer(a, [128, 128]) + B = tir.match_buffer(b, [128, 128]) + C = tir.match_buffer(c, [128, 128]) + + for i, j in tir.grid(128, 128): + with tir.block([128, 128], "init") as [vi, vj]: + C[vi, vj] = tir.float32(0) + + for k in range(0, 128): + with tir.block([128, 128, tir.reduce_axis(0, 128)], "update") as [vi, vj, vk]: + C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk] + + +@tvm.script.tir +def element_wise(a: ty.handle, c: ty.handle) -> None: + A = tir.match_buffer(a, (128, 128), "float32") + C = tir.match_buffer(c, (128, 128), "float32") + B = tir.alloc_buffer((128, 128), "float32") + + with tir.block([128, 128], "B") as [vi, vj]: + B[vi, vj] = A[vi, vj] * tir.float32(2) + + with tir.block([128, 128], "C") as [vi, vj]: + C[vi, vj] = B[vi, vj] + tir.float32(1) + + +@tvm.script.tir +def predicate(b: ty.handle, c: ty.handle) -> None: + B = tir.match_buffer(b, (16, 16), "float32") + C = tir.match_buffer(c, (16, 16), "float32") + + for i, jo, ji in tir.grid(16, 4, 5): + with tir.block([16, 16], "update") as [vi, vj]: + tir.bind(vi, i) + tir.bind(vj, jo * 4 + ji) + tir.where(jo * 4 + ji < 16) + C[vi, vj] = B[vi, vj] + tir.float32(1) + + +def test_module_define(): + func1 = tvm.script.create_module({"matmul": matmul})["matmul"] + func2 = tvm.script.create_module({"element_wise": element_wise})["element_wise"] + func3 = tvm.script.create_module({"predicate": predicate})["predicate"] + mod1 = tvm.script.create_module({"func1": func1, "func2": func2, "func3": func3}) + mod2 = tvm.script.create_module({"func1": matmul, "func2": element_wise, "func3": predicate}) + tvm.ir.assert_structural_equal(mod1, mod2) + + +def test_matmul(): + func = matmul + rt_func = tvm.script.from_source(tvm.script.asscript(func, True)) + tvm.ir.assert_structural_equal(func, rt_func) + + +def test_matmul_original(): + func = matmul_original + rt_func = tvm.script.from_source(tvm.script.asscript(func, True)) + tvm.ir.assert_structural_equal(func, rt_func) + + assert isinstance(rt_func.body.block, tir.stmt.Block) + assert isinstance(rt_func.body.block.body, tir.stmt.For) + assert isinstance(rt_func.body.block.body.body, tir.stmt.For) + assert isinstance(rt_func.body.block.body.body.body, tir.stmt.SeqStmt) + assert isinstance(rt_func.body.block.body.body.body[0].block, tir.stmt.Block) + assert isinstance(rt_func.body.block.body.body.body[1], tir.stmt.For) + assert isinstance(rt_func.body.block.body.body.body[1].body.block, tir.stmt.Block) + + +def test_element_wise(): + func = element_wise + rt_func = tvm.script.from_source(tvm.script.asscript(func, True)) + tvm.ir.assert_structural_equal(func, rt_func) + + assert isinstance(rt_func.body.block, tir.stmt.Block) + assert isinstance(rt_func.body.block.body, tir.stmt.SeqStmt) + assert isinstance(rt_func.body.block.body[0], tir.stmt.For) + assert isinstance(rt_func.body.block.body[0].body, 
tir.stmt.For) + assert isinstance(rt_func.body.block.body[0].body.body.block, tir.stmt.Block) + + assert isinstance(rt_func.body.block.body[1], tir.stmt.For) + assert isinstance(rt_func.body.block.body[1].body, tir.stmt.For) + assert isinstance(rt_func.body.block.body[1].body.body.block, tir.stmt.Block) + + +def test_predicate(): + func = predicate + rt_func = tvm.script.from_source(tvm.script.asscript(func, True)) + tvm.ir.assert_structural_equal(func, rt_func) + + assert isinstance(rt_func.body.block, tir.stmt.Block) + assert isinstance(rt_func.body.block.body, tir.stmt.For) + assert isinstance(rt_func.body.block.body.body, tir.stmt.For) + assert isinstance(rt_func.body.block.body.body.body, tir.stmt.For) + assert isinstance(rt_func.body.block.body.body.body.body.block, tir.stmt.Block) + + +@tvm.script.tir +def for_thread_binding(a: ty.handle, b: ty.handle) -> None: + A = tir.match_buffer(a, (16, 16), "float32") + B = tir.match_buffer(b, (16, 16), "float32") + + for i in tir.thread_binding(0, 16, thread="threadIdx.x"): + for j in tir.thread_binding(0, 16, thread="threadIdx.y"): + A[i, j] = B[i, j] + tir.float32(1) + + +def test_for_thread_binding(): + func = for_thread_binding + rt_func = tvm.script.from_source(tvm.script.asscript(func, True)) + tvm.ir.assert_structural_equal(func, rt_func) + + assert isinstance(rt_func.body, tir.stmt.For) + assert rt_func.body.kind == 4 + assert rt_func.body.thread_binding.thread_tag == "threadIdx.x" + assert isinstance(rt_func.body.body, tir.stmt.For) + assert rt_func.body.body.kind == 4 + assert rt_func.body.body.thread_binding.thread_tag == "threadIdx.y" + + +@tvm.script.tir +def block_elements(a: ty.handle, b: ty.handle) -> None: + A = tir.match_buffer(a, (16, 16), "float32") + B = tir.match_buffer(b, (1, 1), "float32") + + with tir.block([1], "update") as [vi]: + tir.bind(vi, 0) + tir.where(True) + tir.reads(A[0:16, 0:16]) + tir.writes(B[0, 0]) + tir.block_attr({"attr_key": "attr_value"}) + C = tir.alloc_buffer((4, 4), dtype="float32") + D = tir.match_buffer_region(A[0:4, 0]) + with tir.init(): + B[0, 0] = tir.float32(0) + B[0, 0] = A[0, 0] + B[0, 0] + C[1, 1] + D[2, 0] + + +def test_block_elements(): + func = block_elements + rt_func = tvm.script.from_source(tvm.script.asscript(func, True)) + tvm.ir.assert_structural_equal(func, rt_func) + + assert isinstance(rt_func.body.block, tir.stmt.Block) + assert isinstance(rt_func.body.block.body, tir.stmt.BufferStore) + assert isinstance(rt_func.body.block.init, tir.stmt.BufferStore) + assert len(rt_func.body.block.annotations) == 1 + assert rt_func.body.block.annotations["attr_key"] == "attr_value" + + if __name__ == "__main__": test_opt_gemm_normalize() test_opt_gemm_mod_host() @@ -2669,3 +2832,10 @@ def test_opt_conv_tensorcore_mod_host(): test_opt_conv_tensorcore_normalize() test_opt_conv_tensorcore_lower() test_opt_conv_tensorcore_mod_host() + test_module_define() + test_matmul() + test_matmul_original() + test_element_wise() + test_predicate() + test_for_thread_binding() + test_block_elements() diff --git a/tests/scripts/task_ci_python_setup.sh b/tests/scripts/task_ci_python_setup.sh index f48ed49a22665..b880cb9d64572 100755 --- a/tests/scripts/task_ci_python_setup.sh +++ b/tests/scripts/task_ci_python_setup.sh @@ -30,4 +30,4 @@ set -o pipefail # echo "Addtiional setup in" ${CI_IMAGE_NAME} -python3 -m pip install --user tlcpack-sphinx-addon==0.1.4 synr==0.2.1 +python3 -m pip install --user tlcpack-sphinx-addon==0.1.4 synr==0.3.0 diff --git a/tests/scripts/task_ci_setup.sh 
b/tests/scripts/task_ci_setup.sh index 17838c58a83c4..9dda54e105236 100755 --- a/tests/scripts/task_ci_setup.sh +++ b/tests/scripts/task_ci_setup.sh @@ -30,7 +30,7 @@ set -o pipefail # echo "Addtiional setup in" ${CI_IMAGE_NAME} -python3 -m pip install --user tlcpack-sphinx-addon==0.1.4 synr==0.2.1 +python3 -m pip install --user tlcpack-sphinx-addon==0.1.4 synr==0.3.0 # Rebuild standalone_crt in build/ tree. This file is not currently archived by pack_lib() in # Jenkinsfile. We expect config.cmake to be present from pack_lib(). From 4b528de3edb49c2e3c36ceba79d57a2493aec01b Mon Sep 17 00:00:00 2001 From: Siyuan Feng Date: Mon, 22 Mar 2021 20:30:21 +0800 Subject: [PATCH 51/84] [TensorIR] add TIRTextPrinter support for Block and BlockRealize (#7716) Co-authored-by: Junru Shao --- src/printer/text_printer.h | 2 + src/printer/tir_text_printer.cc | 106 ++++++++++++++++++++++++ tests/python/unittest/test_tir_nodes.py | 13 ++- 3 files changed, 120 insertions(+), 1 deletion(-) diff --git a/src/printer/text_printer.h b/src/printer/text_printer.h index 6ec32a9e104c8..90e46c5624fad 100644 --- a/src/printer/text_printer.h +++ b/src/printer/text_printer.h @@ -310,6 +310,7 @@ class TIRTextPrinter : public StmtFunctor, Doc VisitStmt_(const ForNode* op) override; Doc VisitStmt_(const WhileNode* op) override; Doc VisitStmt_(const PrefetchNode* op) override; + Doc VisitStmt_(const BlockRealizeNode* op) override; Doc VisitStmtDefault_(const Object* op) override; Doc VisitType_(const PrimTypeNode* node) override; @@ -324,6 +325,7 @@ class TIRTextPrinter : public StmtFunctor, Doc PrintBuffer(const BufferNode* op); Doc BufferNode2Doc(const BufferNode* op, Doc doc); Doc PrintString(const StringObj* op) { return Doc::StrLiteral(op->data); } + Doc PrintBufferRegion(const BufferRegionNode* op); /*! 
* \brief special method to print out data type diff --git a/src/printer/tir_text_printer.cc b/src/printer/tir_text_printer.cc index 788074073c082..2976f4871f640 100644 --- a/src/printer/tir_text_printer.cc +++ b/src/printer/tir_text_printer.cc @@ -66,6 +66,8 @@ Doc TIRTextPrinter::Print(const ObjectRef& node) { return PrintBuffer(node.as()); } else if (node->IsInstance()) { return PrintString(node.as()); + } else if (node->IsInstance()) { + return PrintBufferRegion(node.as()); } else { return this->meta_->GetMetaNode(node); } @@ -217,6 +219,24 @@ Doc TIRTextPrinter::BufferNode2Doc(const BufferNode* buf, Doc doc) { return doc << ")"; } +Doc TIRTextPrinter::PrintBufferRegion(const BufferRegionNode* op) { + Doc doc; + doc << Print(op->buffer) << "["; + for (size_t i = 0; i < op->region.size(); ++i) { + if (i != 0) { + doc << ", "; + } + const auto& range = op->region[i]; + if (!is_one(range->extent)) { + doc << Print(range->min) << ":" << Print(range->min + range->extent); + } else { + doc << Print(range->min); + } + } + doc << "]"; + return doc; +} + Doc TIRTextPrinter::VisitExprDefault_(const Object* op) { return this->meta_->GetMetaNode(GetRef(op)); } @@ -506,6 +526,92 @@ Doc TIRTextPrinter::VisitStmt_(const PrefetchNode* op) { return doc; } +Doc TIRTextPrinter::VisitStmt_(const BlockRealizeNode* op) { + const auto* block_op = op->block.as(); + // print block name and block vars + Doc doc; + doc << "block(["; + std::vector block_var_docs; + for (const auto& iter_var : block_op->iter_vars) { + Doc block_var_doc; + if (is_zero(iter_var->dom->min) && iter_var->iter_type == kDataPar) { + block_var_doc << Print(iter_var->dom->extent); + } else { + block_var_doc << "tir."; + switch (iter_var->iter_type) { + case kDataPar: + block_var_doc << "range"; + break; + case kCommReduce: + block_var_doc << "reduce_axis"; + break; + case kOrdered: + block_var_doc << "scan_axis"; + break; + case kOpaque: + block_var_doc << "opaque_axis"; + break; + default: + LOG(FATAL) << "Unknown block var iter type"; + break; + } + block_var_doc << "(" << Print(iter_var->dom->min) << ", " + << Print(iter_var->dom->min + iter_var->dom->extent) << ")"; + } + block_var_docs.push_back(block_var_doc); + } + doc << PrintSep(block_var_docs, Doc::Text(", ")) << "], "; + doc << Doc::StrLiteral(block_op->name_hint) << ")"; + std::vector block_var_names; + for (const auto& iter_var : block_op->iter_vars) { + Doc block_var_name; + AllocVar(iter_var->var); + block_var_names.push_back(Print(iter_var->var)); + } + if (!block_var_names.empty()) { + doc << " as [" << PrintSep(block_var_names, Doc::Text(", ")) << "]"; + } + doc << " {"; + Doc block_attr_doc; + // print predicate, binding, read/write tensor region, annotations + if (!is_one(op->predicate)) { + block_attr_doc << Doc::NewLine() << "where(" << Print(op->predicate) << ")"; + } + for (size_t i = 0; i < block_op->iter_vars.size(); ++i) + block_attr_doc << Doc::NewLine() << "bind(" << Print(block_op->iter_vars[i]->var) << ", " + << Print(op->iter_values[i]) << ")"; + block_attr_doc << Doc::NewLine() << "tir.reads(" << Print(block_op->reads) << ")"; + block_attr_doc << Doc::NewLine() << "tir.writes(" << Print(block_op->writes) << ")"; + if (!block_op->annotations.empty()) { + std::vector attr_docs; + for (const auto& it : block_op->annotations) { + attr_docs.push_back(Doc::StrLiteral(it.first) << ": " << Print(it.second)); + } + block_attr_doc << Doc::NewLine() << "tir.attrs({" << PrintSep(attr_docs, Doc::Text(", ")) + << "})"; + } + // print body + Doc body; + body << 
Doc::NewLine(); + for (const auto& alloc_buf : block_op->alloc_buffers) { + body << AllocBuf(alloc_buf) << " = alloc_buffer(" << PrintDType(alloc_buf->dtype) + << Print(alloc_buf->shape) << ")" << Doc::NewLine(); + } + for (const auto& match_buf : block_op->match_buffers) { + body << AllocBuf(match_buf->buffer) << " = match_buffer_region(" << Print(match_buf->source) + << ")" << Doc::NewLine(); + } + if (block_op->init.defined()) { + Doc init_block; + init_block << "with init()"; + init_block << PrintBody(block_op->init.value()); + body << init_block << Doc::NewLine(); + } + body << Print(block_op->body); + doc << Doc::Indent(2, block_attr_doc << body); + return doc; +} + Doc TIRTextPrinter::VisitType_(const PrimTypeNode* node) { Doc doc; doc << PrintDType(node->dtype); diff --git a/tests/python/unittest/test_tir_nodes.py b/tests/python/unittest/test_tir_nodes.py index 6e338d64a61c4..19e7bc89122e1 100644 --- a/tests/python/unittest/test_tir_nodes.py +++ b/tests/python/unittest/test_tir_nodes.py @@ -440,9 +440,20 @@ def test_block_blockrealize(): assert block_realize.predicate == tvm.tir.const(True, "bool") assert block_realize.block == block - # make sure we can print + # make sure we can print using ReprPrinter str(block) str(block_realize) + # make sure we can print using TIRTextPrinter + func = tvm.tir.PrimFunc([], block_realize) + output = func.astext() + assert output.find("meta[tir.BlockRealise]") == -1 + assert output.find("bind") != -1 + assert output.find("reads") != -1 + assert output.find("writes") != -1 + assert output.find("alloc_buffer") != -1 + assert output.find("match_buffer_region") != -1 + assert output.find("attr") != -1 + assert output.find("with init()") != -1 if __name__ == "__main__": From c4b89348cde17aed155c4831bae60b2ee4188796 Mon Sep 17 00:00:00 2001 From: tristan-arm Date: Mon, 22 Mar 2021 15:16:24 +0000 Subject: [PATCH 52/84] [ETHOSN] Add support for Ethos-N 21.02 driver stack release. 
(#7628) - Updated default Ethos-N driver stack to 21.02 - Fixed some test failures associated with this change --- .../ubuntu_install_ethosn_driver_stack.sh | 2 +- src/relay/backend/contrib/ethosn/codegen.cc | 30 +++++++++---------- .../contrib/ethosn/ethosn_api_version.h | 4 +++ .../contrib/test_ethosn/test_networks.py | 16 +++++++--- 4 files changed, 32 insertions(+), 20 deletions(-) diff --git a/docker/install/ubuntu_install_ethosn_driver_stack.sh b/docker/install/ubuntu_install_ethosn_driver_stack.sh index 15b93bbdf901e..e73d1c35e2d22 100755 --- a/docker/install/ubuntu_install_ethosn_driver_stack.sh +++ b/docker/install/ubuntu_install_ethosn_driver_stack.sh @@ -22,7 +22,7 @@ set -o pipefail repo_url="https://github.com/Arm-software/ethos-n-driver-stack" repo_dir="ethosn-driver" -repo_revision="20.08" +repo_revision="21.02" install_path="/opt/arm/$repo_dir" tmpdir=$(mktemp -d) diff --git a/src/relay/backend/contrib/ethosn/codegen.cc b/src/relay/backend/contrib/ethosn/codegen.cc index 5e052b3e4fd6f..dab0e6c42f804 100644 --- a/src/relay/backend/contrib/ethosn/codegen.cc +++ b/src/relay/backend/contrib/ethosn/codegen.cc @@ -198,7 +198,7 @@ sl::TensorsAndId MakeOps(const sl::TensorAndId& op) { NetworkWithIDs ConstructNetworkVisitor::Construct(const Function& func) { // Initialise everything -#if _ETHOSN_API_VERSION_ == 2011 +#if _ETHOSN_API_VERSION_ >= 2011 auto ctx = transform::PassContext::Current(); auto cfg = ctx->GetConfig("relay.ext.ethos-n.options"); if (!cfg.defined()) { @@ -206,7 +206,7 @@ NetworkWithIDs ConstructNetworkVisitor::Construct(const Function& func) { } #endif NetworkWithIDs network_with_ids; -#if _ETHOSN_API_VERSION_ == 2011 +#if _ETHOSN_API_VERSION_ >= 2011 network_ = sl::CreateNetwork(variants[cfg.value()->variant]); #else network_ = sl::CreateNetwork(); @@ -572,7 +572,7 @@ sl::CompilationOptions EthosnCompiler::CreateOptions() { cfg = AttrsWithDefaultValues(); } -#if _ETHOSN_API_VERSION_ == 2011 +#if _ETHOSN_API_VERSION_ >= 2011 sl::CompilationOptions options; #else sl::CompilationOptions options(variants[cfg.value()->variant]); @@ -619,7 +619,7 @@ std::pair, std::vector> EthosnCompiler::GetInput return std::make_pair(input_order, output_order); } -#if _ETHOSN_API_VERSION_ == 2011 +#if _ETHOSN_API_VERSION_ >= 2011 auto ctx = transform::PassContext::Current(); auto cfg = ctx -> GetConfig("relay.ext.ethos-n.options").defined() ? 
ctx -> GetConfig("relay.ext.ethos-n.options") @@ -632,7 +632,7 @@ TVM_REGISTER_GLOBAL("relay.ethos-n.support.conv2d") Call call = args[0]; ConvolutionParams params; auto err = EthosnAPI::QnnConv2d(call, ¶ms); -#if _ETHOSN_API_VERSION_ == 2011 +#if _ETHOSN_API_VERSION_ >= 2011 if (params.is_depthwise) { *rv = !err && m_Queries.IsDepthwiseConvolutionSupported(params.bias_info, params.weights_info, @@ -657,7 +657,7 @@ TVM_REGISTER_GLOBAL("relay.ethos-n.support.fc") Call call = args[0]; FullyConnectedParams params; auto err = EthosnAPI::QnnFullyConnected(call, ¶ms); -#if _ETHOSN_API_VERSION_ == 2011 +#if _ETHOSN_API_VERSION_ >= 2011 *rv = !err && m_Queries.IsFullyConnectedSupported(params.bias_info, params.weights_info, params.fc_info, params.input_info); #else @@ -671,7 +671,7 @@ TVM_REGISTER_GLOBAL("relay.ethos-n.support.max_pool2d") Call call = args[0]; MaxPool2DParams params; auto err = EthosnAPI::MaxPool2D(call, ¶ms); -#if _ETHOSN_API_VERSION_ == 2011 +#if _ETHOSN_API_VERSION_ >= 2011 *rv = !err && m_Queries.IsPoolingSupported(params.pool_info, params.input_info); #else *rv = !err && sl::IsPoolingSupported(params.pool_info, params.input_info); @@ -683,7 +683,7 @@ TVM_REGISTER_GLOBAL("relay.ethos-n.support.avg_pool2d") Call call = args[0]; AvgPool2DParams params; auto err = EthosnAPI::AvgPool2D(call, ¶ms); -#if _ETHOSN_API_VERSION_ == 2011 +#if _ETHOSN_API_VERSION_ >= 2011 *rv = !err && m_Queries.IsPoolingSupported(params.pool_info, params.input_info); #else *rv = !err && sl::IsPoolingSupported(params.pool_info, params.input_info); @@ -695,7 +695,7 @@ TVM_REGISTER_GLOBAL("relay.ethos-n.support.reshape") Call call = args[0]; ReshapeParams params; auto err = EthosnAPI::Reshape(call, ¶ms); -#if _ETHOSN_API_VERSION_ == 2011 +#if _ETHOSN_API_VERSION_ >= 2011 *rv = !err && m_Queries.IsReshapeSupported(params.new_shape, params.input_info); #else *rv = !err && sl::IsReshapeSupported(params.new_shape, params.input_info); @@ -707,7 +707,7 @@ TVM_REGISTER_GLOBAL("relay.ethos-n.support.addition") Call call = args[0]; AdditionParams params; auto err = EthosnAPI::Addition(call, ¶ms); -#if _ETHOSN_API_VERSION_ == 2011 +#if _ETHOSN_API_VERSION_ >= 2011 *rv = !err && m_Queries.IsAdditionSupported(params.lhs_info, params.rhs_info, params.output_quantization_info); #else @@ -721,7 +721,7 @@ TVM_REGISTER_GLOBAL("relay.ethos-n.support.sigmoid") Call call = args[0]; SigmoidParams params; auto err = EthosnAPI::Sigmoid(call, ¶ms); -#if _ETHOSN_API_VERSION_ == 2011 +#if _ETHOSN_API_VERSION_ >= 2011 *rv = !err && m_Queries.IsSigmoidSupported(params.input_info); #else *rv = !err && sl::IsSigmoidSupported(params.input_info); @@ -733,7 +733,7 @@ TVM_REGISTER_GLOBAL("relay.ethos-n.support.concatenate") Call call = args[0]; ConcatenateParams params; auto err = EthosnAPI::Concatenate(call, ¶ms); -#if _ETHOSN_API_VERSION_ == 2011 +#if _ETHOSN_API_VERSION_ >= 2011 *rv = !err && m_Queries.IsConcatenationSupported(params.input_infos, params.concat_info); #else *rv = !err && sl::IsConcatenationSupported(params.input_infos, params.concat_info); @@ -745,7 +745,7 @@ TVM_REGISTER_GLOBAL("relay.ethos-n.support.split") Call call = args[0]; SplitParams params; auto err = EthosnAPI::Split(call, ¶ms); -#if _ETHOSN_API_VERSION_ == 2011 +#if _ETHOSN_API_VERSION_ >= 2011 *rv = !err && m_Queries.IsSplitSupported(params.input_info, params.split_info); #else *rv = !err && sl::IsSplitSupported(params.input_info, params.split_info); @@ -757,7 +757,7 @@ TVM_REGISTER_GLOBAL("relay.ethos-n.support.depth_to_space") Call call = args[0]; 
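      // Note: each of these support hooks follows the same pattern: from driver
      // stack API version 2011 onward the query goes through the cached
      // m_Queries object, while older releases expose sl:: free functions.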
DepthToSpaceParams params; auto err = EthosnAPI::DepthToSpace(call, ¶ms); -#if _ETHOSN_API_VERSION_ == 2011 +#if _ETHOSN_API_VERSION_ >= 2011 *rv = !err && m_Queries.IsDepthToSpaceSupported(params.input_info, params.depth_info); #else *rv = !err && sl::IsDepthToSpaceSupported(params.input_info, params.depth_info); @@ -769,7 +769,7 @@ TVM_REGISTER_GLOBAL("relay.ethos-n.support.relu") Call call = args[0]; ReluParams params; auto err = EthosnAPI::Relu(call, ¶ms); -#if _ETHOSN_API_VERSION_ == 2011 +#if _ETHOSN_API_VERSION_ >= 2011 *rv = !err && m_Queries.IsReluSupported(params.relu_info, params.input_info); #else *rv = !err && sl::IsReluSupported(params.relu_info, params.input_info); diff --git a/src/relay/backend/contrib/ethosn/ethosn_api_version.h b/src/relay/backend/contrib/ethosn/ethosn_api_version.h index 78f08950bb489..c975ee6e8ae81 100644 --- a/src/relay/backend/contrib/ethosn/ethosn_api_version.h +++ b/src/relay/backend/contrib/ethosn/ethosn_api_version.h @@ -29,6 +29,9 @@ * along with associated compatibility measures when no * longer necessary. */ +#if ETHOSN_SUPPORT_LIBRARY_VERSION_PATCH == 2 +#define _ETHOSN_API_VERSION_ 2102 +#else #ifndef ETHOSN_API_VERSION #define _ETHOSN_API_VERSION_ 2008 #elif ~(~ETHOSN_API_VERSION + 0) == 0 && ~(~ETHOSN_API_VERSION + 1) == 1 @@ -36,5 +39,6 @@ #else #define _ETHOSN_API_VERSION_ ETHOSN_API_VERSION #endif +#endif #endif // TVM_RELAY_BACKEND_CONTRIB_ETHOSN_ETHOSN_API_VERSION_H_ diff --git a/tests/python/contrib/test_ethosn/test_networks.py b/tests/python/contrib/test_ethosn/test_networks.py index 06ce93b2aba5a..ce89c90d93796 100644 --- a/tests/python/contrib/test_ethosn/test_networks.py +++ b/tests/python/contrib/test_ethosn/test_networks.py @@ -122,7 +122,9 @@ def test_mobilenet_v1(): # codegen, which could come about from either a change in Support Library # version or a change in the Ethos-N codegen. To update this requires running # on hardware that isn't available in CI. - _compile_hash = {"81637c89339201a07dc96e3b5dbf836a"} + _compile_hash = {"bfb5a50607edb50009c58ae9d4287e4d"} + if tei.get_ethosn_variant() == 3: + _compile_hash = {"896c28b4f06341ea638ead3a593e1aed"} if tei.get_ethosn_api_version() == 2008: _compile_hash = {"47e216d8ab2bf491708ccf5620bc0d02"} if tei.get_ethosn_variant() == 3: @@ -150,7 +152,9 @@ def test_inception_v3(): # codegen, which could come about from either a change in Support Library # version or a change in the Ethos-N codegen. To update this requires running # on hardware that isn't available in CI. - _compile_hash = {"de0e175af610ebd45ccb03d170dc9664"} + _compile_hash = {"96116d7e6c7385de0688074a3f889983"} + if tei.get_ethosn_variant() == 3: + _compile_hash = {"551cde850c6ef960d19be4f317fb8e68"} if tei.get_ethosn_api_version() == 2008: _compile_hash = {"8c9d75659cd7bc9ff6dd6d490d28f9b2"} if tei.get_ethosn_variant() == 3: @@ -177,7 +181,9 @@ def test_inception_v4(): # codegen, which could come about from either a change in Support Library # version or a change in the Ethos-N codegen. To update this requires running # on hardware that isn't available in CI. - _compile_hash = {"06bf6cb56344f3904bcb108e54edfe87"} + _compile_hash = {"b34aec2a48c591818761ed6b42c133e5"} + if tei.get_ethosn_variant() == 3: + _compile_hash = {"30f078bd42757e8686eafa1f28d0d352"} if tei.get_ethosn_api_version() == 2008: if not tei.get_ethosn_variant() == 0: pytest.skip( @@ -206,7 +212,9 @@ def test_ssd_mobilenet_v1(): # codegen, which could come about from either a change in Support Library # version or a change in the Ethos-N codegen. 
To update this requires running # on hardware that isn't available in CI. - _compile_hash = {"29aec6b184b09454b4323271aadf89b1", "6211d96103880b016baa85e638abddef"} + _compile_hash = {"c312edfc9a946ed4dc7c049d472dae6e", "3183f0fa5eba8f6b9557d14eaf47842d"} + if tei.get_ethosn_variant() == 3: + _compile_hash = {"deee52e136327436411fc725624ae2ea", "6526509d3cbee014e38c79e22bb29d7f"} if tei.get_ethosn_api_version() == 2008: _compile_hash = {"5999f26e140dee0d7866491997ef78c5", "24e3a690a7e95780052792d5626c85be"} if tei.get_ethosn_variant() == 3: From 21fc3bb08e2cc4928e1cd06f2280fe83431c80f0 Mon Sep 17 00:00:00 2001 From: masahi Date: Tue, 23 Mar 2021 00:29:21 +0900 Subject: [PATCH 53/84] [TOPI] Use fixed thread block size in unique op for Vulkan (#7718) * [TOPI] Use fixed thread block size in unique op for Vulkan * forgot to add min for non vk backend --- python/tvm/topi/cuda/unique.py | 15 ++++++++-- .../unittest/test_target_codegen_spirv.py | 30 ++++++++++++++----- 2 files changed, 35 insertions(+), 10 deletions(-) diff --git a/python/tvm/topi/cuda/unique.py b/python/tvm/topi/cuda/unique.py index 02a5cf3bc592b..2bca3c447c4c4 100644 --- a/python/tvm/topi/cuda/unique.py +++ b/python/tvm/topi/cuda/unique.py @@ -24,6 +24,15 @@ from ..utils import ceil_div +def _get_max_threads(batch_size): + target = tvm.target.Target.current() + max_threads = tvm.target.Target.current(allow_none=False).max_num_threads + if "vulkan" in str(target) and not isinstance(batch_size, tvm.tir.IntImm): + # SPIR-V does not support dynamic thread group size + return max_threads + return tir.min(batch_size, max_threads) + + def _calc_adjacent_diff_ir(data, output, binop=tir.Sub): """Low level IR to calculate adjacent difference in an 1-D array. @@ -46,7 +55,7 @@ def _calc_adjacent_diff_ir(data, output, binop=tir.Sub): data_ptr = ib.buffer_ptr(data) output_ptr = ib.buffer_ptr(output) batch_size = data.shape[0] - max_threads = tir.min(batch_size, tvm.target.Target.current(allow_none=False).max_num_threads) + max_threads = _get_max_threads(batch_size) with ib.new_scope(): nthread_tx = max_threads nthread_bx = ceil_div(batch_size, max_threads) @@ -157,7 +166,7 @@ def _calc_unique_ir( unique_seq_indices_ptr = ib.buffer_ptr(indices) batch_size = data.shape[0] - max_threads = tir.min(batch_size, tvm.target.Target.current(allow_none=False).max_num_threads) + max_threads = _get_max_threads(batch_size) # if need to return counts if isinstance(counts, tir.Buffer): @@ -238,7 +247,7 @@ def _calc_first_occurence_ir(argsorted_indices, inc_scan, first_occurence): inc_scan_ptr = ib.buffer_ptr(inc_scan) first_occurence_ptr = ib.buffer_ptr(first_occurence) batch_size = argsorted_indices.shape[0] - max_threads = tir.min(batch_size, tvm.target.Target.current(allow_none=False).max_num_threads) + max_threads = _get_max_threads(batch_size) with ib.new_scope(): nthread_tx = max_threads nthread_bx = ceil_div(batch_size, max_threads) diff --git a/tests/python/unittest/test_target_codegen_spirv.py b/tests/python/unittest/test_target_codegen_spirv.py index 68be5c480358c..bf47bbe2af44f 100644 --- a/tests/python/unittest/test_target_codegen_spirv.py +++ b/tests/python/unittest/test_target_codegen_spirv.py @@ -72,17 +72,18 @@ def do_copy(A, B, n): tvm.testing.assert_allclose(b.asnumpy(), ref) +def check_mod(mod, x_np, res_np): + target = "vulkan" + ctx = tvm.context(target, 0) + ex = relay.create_executor("vm", mod=mod, ctx=ctx, target=target) + res = ex.evaluate()(x_np).asnumpy() + tvm.testing.assert_allclose(res, res_np, atol=1e-5) + + def 
test_pushconstants(): if not tvm.testing.device_enabled("vulkan"): return - def check_mod(mod, x_np, res_np): - target = "vulkan" - ctx = tvm.context(target, 0) - ex = relay.create_executor("vm", mod=mod, ctx=ctx, target=target) - res = ex.evaluate()(x_np).asnumpy() - tvm.testing.assert_allclose(res, res_np, atol=1e-5) - # Three 32 bit pushconstants: any_dim, stride, stride dtype = "float32" x = relay.var("x", shape=(relay.Any(),), dtype=dtype) @@ -104,6 +105,21 @@ def check_mod(mod, x_np, res_np): check_mod(mod, x_np, res_np) +def test_unique(): + if not tvm.testing.device_enabled("vulkan"): + return + + dtype = "int32" + x = relay.var("x", shape=(relay.Any(),), dtype=dtype) + mod = tvm.IRModule() + [unique, _, num_unique] = relay.unique(x, is_sorted=True) + mod["main"] = relay.Function([x], relay.op.strided_slice(unique, begin=[0], end=num_unique)) + x_np = np.random.randint(0, high=10, size=(10,)).astype(dtype) + res_np = np.unique(x_np) + check_mod(mod, x_np, res_np) + + if __name__ == "__main__": test_bool_load() test_pushconstants() + test_unique() From 318c65035b70e1a40204a4148e47eb226e0cb8de Mon Sep 17 00:00:00 2001 From: Egor Churaev Date: Mon, 22 Mar 2021 19:03:58 +0300 Subject: [PATCH 54/84] Fix auto scheduler crash when set with consumers is empty (#7708) Set with consumers is empty during preparing auto scheduler sketches for Metal device. Added check on the size of the set. In case when the set with consumers is empty we just skip this rule. --- src/auto_scheduler/search_policy/sketch_policy_rules.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/auto_scheduler/search_policy/sketch_policy_rules.cc b/src/auto_scheduler/search_policy/sketch_policy_rules.cc index 8eaf803214563..8df69fc7ce3b9 100644 --- a/src/auto_scheduler/search_policy/sketch_policy_rules.cc +++ b/src/auto_scheduler/search_policy/sketch_policy_rules.cc @@ -164,6 +164,7 @@ SketchGenerationRule::ConditionKind RuleAddCacheRead::MeetCondition(const Sketch // Don't cache_read a stage if it has multiple consumers const std::set& consumers = GetConsumers(task, state, stage_id); + if (consumers.size() == 0) return ConditionKind::kSkip; // Don't cache_read a stage if its consumer does not need multi-level tiling int target_stage_id = *consumers.begin(); if (!NeedsMultilevelTiling(task, state, target_stage_id)) { From e4b3e9050c889a6742dc72ee95f21436e6e4d1be Mon Sep 17 00:00:00 2001 From: Leandro Nunes Date: Mon, 22 Mar 2021 16:38:53 +0000 Subject: [PATCH 55/84] [CI] Improve docker/build.sh to accept a docker tag parameter. (#7707) * This adds a new '--tag' parameter so that we can build docker images on a particular tag, not only ':latest' as given by Docker * It opens up the possibility of generating "staging" images on a different tag, in the same servers as we keep the production images * By default it keeps previous behaviour of using ':latest' tag. --- docker/build.sh | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/docker/build.sh b/docker/build.sh index bd13937b25714..f966e22d035b9 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -20,11 +20,15 @@ # # Execute command within a docker container # -# Usage: build.sh [--dockerfile ] [-it] +# Usage: build.sh [--tag ] +# [--dockerfile ] [-it] # [--net=host] [--cache-from ] # -# CONTAINER_TYPE: Type of the docker container used the run the build: e.g., -# (cpu | gpu) +# CONTAINER_TYPE: Type of the docker container used the run the build, +# e.g. 
"ci_cpu", "ci_gpu" +# +# DOCKER_IMAGE_TAG: (Optional) Docker image tag to be built and used. +# Defaults to 'latest', as it is the default Docker tag. # # DOCKERFILE_PATH: (Optional) Path to the Dockerfile used for docker build. If # this optional value is not supplied (via the --dockerfile @@ -45,6 +49,12 @@ shift 1 DOCKERFILE_PATH="${SCRIPT_DIR}/Dockerfile.${CONTAINER_TYPE}" DOCKER_CONTEXT_PATH="${SCRIPT_DIR}" +if [[ "$1" == "--tag" ]]; then + DOCKER_IMAGE_TAG="$2" + echo "Using custom Docker tag: ${DOCKER_IMAGE_TAG}" + shift 2 +fi + if [[ "$1" == "--dockerfile" ]]; then DOCKERFILE_PATH="$2" DOCKER_CONTEXT_PATH=$(dirname "${DOCKERFILE_PATH}") @@ -114,6 +124,7 @@ function upsearch () { # reasonable defaults if you run it outside of Jenkins. WORKSPACE="${WORKSPACE:-${SCRIPT_DIR}/../}" BUILD_TAG="${BUILD_TAG:-tvm}" +DOCKER_IMAGE_TAG="${DOCKER_IMAGE_TAG:-latest}" # Determine the docker image name DOCKER_IMG_NAME="${BUILD_TAG}.${CONTAINER_TYPE}" @@ -125,6 +136,9 @@ DOCKER_IMG_NAME=$(echo "${DOCKER_IMG_NAME}" | sed -e 's/=/_/g' -e 's/,/-/g') # Convert to all lower-case, as per requirement of Docker image names DOCKER_IMG_NAME=$(echo "${DOCKER_IMG_NAME}" | tr '[:upper:]' '[:lower:]') +# Compose the full image spec with "name:tag" e.g. "tvm.ci_cpu:v0.03" +DOCKER_IMG_SPEC="${DOCKER_IMG_NAME}:${DOCKER_IMAGE_TAG}" + # Print arguments. echo "WORKSPACE: ${WORKSPACE}" echo "CI_DOCKER_EXTRA_PARAMS: ${CI_DOCKER_EXTRA_PARAMS[@]}" @@ -132,12 +146,14 @@ echo "COMMAND: ${COMMAND[@]}" echo "CONTAINER_TYPE: ${CONTAINER_TYPE}" echo "BUILD_TAG: ${BUILD_TAG}" echo "DOCKER CONTAINER NAME: ${DOCKER_IMG_NAME}" +echo "DOCKER_IMAGE_TAG: ${DOCKER_IMAGE_TAG}" +echo "DOCKER_IMG_SPEC: ${DOCKER_IMG_SPEC}" echo "" # Build the docker container. echo "Building container (${DOCKER_IMG_NAME})..." -docker build -t ${DOCKER_IMG_NAME} \ +docker build -t ${DOCKER_IMG_SPEC} \ -f "${DOCKERFILE_PATH}" \ ${CI_DOCKER_BUILD_EXTRA_PARAMS[@]} \ "${DOCKER_CONTEXT_PATH}" @@ -149,7 +165,7 @@ if [[ $? != "0" ]]; then fi # Run the command inside the container. -echo "Running '${COMMAND[@]}' inside ${DOCKER_IMG_NAME}..." +echo "Running '${COMMAND[@]}' inside ${DOCKER_IMG_SPEC}..." # By default we cleanup - remove the container once it finish running (--rm) # and share the PID namespace (--pid=host) so the process inside does not have @@ -167,6 +183,6 @@ ${DOCKER_BINARY} run --rm --pid=host \ -e "CI_IMAGE_NAME=${DOCKER_IMAGE_NAME}" \ ${CUDA_ENV}\ ${CI_DOCKER_EXTRA_PARAMS[@]} \ - ${DOCKER_IMG_NAME} \ + ${DOCKER_IMG_SPEC} \ bash --login docker/with_the_same_user \ ${COMMAND[@]} From 43ec869ec038fd918cb888a61837f898c254891b Mon Sep 17 00:00:00 2001 From: Ambroise Vincent Date: Mon, 22 Mar 2021 21:46:23 +0000 Subject: [PATCH 56/84] Fix graph_tuner ancestor duplication (#7704) A diamond dependency currently generates a duplication of ancestors: A / \ B C \ / D under some conditions, this scenario leads to the following dictionnary entry: "D":[A, A] instead of "D":[A]. This results in failures when subsequently trying to transpose node states based on this data. 
Change-Id: I72f9b19286bbab0581b851c228b9d0e79ead400f --- python/tvm/autotvm/graph_tuner/utils/traverse_graph.py | 3 ++- .../python/unittest/test_autotvm_graph_tuner_utils.py | 10 ++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py b/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py index 6e29474ab399b..fd2612f20371c 100644 --- a/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py +++ b/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py @@ -211,7 +211,8 @@ def get_direct_ancestor(node_list, visited_dict, target_ops, node_idx, input_nam else: tmp = get_direct_ancestor(node_list, visited_dict, target_ops, item_idx[0], input_names) for tmp_item in tmp: - node_direct_ancestor.append(tmp_item) + if tmp_item not in node_direct_ancestor: + node_direct_ancestor.append(tmp_item) visited_dict[node_idx] = node_direct_ancestor return node_direct_ancestor diff --git a/tests/python/unittest/test_autotvm_graph_tuner_utils.py b/tests/python/unittest/test_autotvm_graph_tuner_utils.py index 6ab194c10ea79..3f6d3980ee283 100644 --- a/tests/python/unittest/test_autotvm_graph_tuner_utils.py +++ b/tests/python/unittest/test_autotvm_graph_tuner_utils.py @@ -109,6 +109,16 @@ def test_get_direct_ancestor(): out = get_direct_ancestor(node_list, visited_dict, target_ops, 5, input_names) assert out == [0], "Output mismatch: expecting [0] but got %s." % str(out) + # non-regression test + out = relay.add(relay.log(data), relay.sqrt(data)) + net = relay.Function(relay.analysis.free_vars(out), out) + net = bind_inputs(net, {"data": (1, 16, 224, 224)}) + node_list = [] + node_dict = {} + expr2graph(net, target_ops, node_dict, node_list) + out = get_direct_ancestor(node_list, visited_dict, target_ops, 3, input_names) + assert out == [0], "Output mismatch: expecting [0] but got %s." 
% str(out) + def test_get_in_nodes(): data = relay.var("data") From 4c66fb2e4b99e376fbaec15d975e4e4d1d8321ab Mon Sep 17 00:00:00 2001 From: Jorn Tuyls Date: Tue, 23 Mar 2021 07:18:04 +0000 Subject: [PATCH 57/84] Fix GraphModule.load_params to allow passing parameters that are not an expected input (#7665) --- src/runtime/graph/graph_runtime.cc | 4 +- tests/python/relay/test_external_codegen.py | 59 +++++++++++++++------ tests/python/unittest/test_runtime_graph.py | 24 ++++++++- 3 files changed, 69 insertions(+), 18 deletions(-) diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index 5c7b756961684..b11a57352f547 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -201,7 +201,9 @@ void GraphRuntime::LoadParams(const std::string& param_blob) { void GraphRuntime::LoadParams(dmlc::Stream* strm) { Map params = ::tvm::runtime::LoadParams(strm); for (auto& p : params) { - uint32_t eid = this->entry_id(input_nodes_[GetInputIndex(p.first)], 0); + int in_idx = GetInputIndex(p.first); + if (in_idx < 0) continue; + uint32_t eid = this->entry_id(input_nodes_[in_idx], 0); data_entry_[eid].CopyFrom(p.second); } } diff --git a/tests/python/relay/test_external_codegen.py b/tests/python/relay/test_external_codegen.py index 0d729b7b1b94a..ab6695eef0fb3 100644 --- a/tests/python/relay/test_external_codegen.py +++ b/tests/python/relay/test_external_codegen.py @@ -23,9 +23,29 @@ from tvm import te import tvm.relay.testing import tvm.relay.transform + from tvm import relay from tvm import runtime +from tvm.relay import transform from tvm.contrib import utils +from tvm.relay.build_module import bind_params_by_name +from tvm.relay.op.annotation import compiler_begin, compiler_end + + +def update_lib(lib): + test_dir = os.path.dirname(os.path.realpath(os.path.expanduser(__file__))) + source_dir = os.path.join(test_dir, "..", "..", "..") + contrib_path = os.path.join(source_dir, "src", "runtime", "contrib") + + kwargs = {} + kwargs["options"] = ["-O2", "-std=c++14", "-I" + contrib_path] + tmp_path = utils.tempdir() + lib_name = "lib.so" + lib_path = tmp_path.relpath(lib_name) + lib.export_library(lib_path, fcompile=False, **kwargs) + lib = tvm.runtime.load_module(lib_path) + + return lib def check_result(mod, map_inputs, out_shape, result, tol=1e-5, target="llvm", ctx=tvm.cpu()): @@ -33,21 +53,6 @@ def check_result(mod, map_inputs, out_shape, result, tol=1e-5, target="llvm", ct print("Skip test on Windows for now") return - def update_lib(lib): - test_dir = os.path.dirname(os.path.realpath(os.path.expanduser(__file__))) - source_dir = os.path.join(test_dir, "..", "..", "..") - contrib_path = os.path.join(source_dir, "src", "runtime", "contrib") - - kwargs = {} - kwargs["options"] = ["-O2", "-std=c++14", "-I" + contrib_path] - tmp_path = utils.tempdir() - lib_name = "lib.so" - lib_path = tmp_path.relpath(lib_name) - lib.export_library(lib_path, fcompile=False, **kwargs) - lib = tvm.runtime.load_module(lib_path) - - return lib - def check_vm_result(): with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]): exe = relay.vm.compile(mod, target=target) @@ -329,6 +334,29 @@ def test_extern_dnnl_const(): check_result(mod, {"data0": i_data}, (1, 32, 14, 14), ref_res.asnumpy(), tol=1e-5) +def test_load_params_with_constants_in_ext_codegen(): + # After binding params and partitioning graph_module.get_params() + # might contain parameters that are not an graph runtime input but + # for example constants in external function. 
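+    # The graph runtime fix above makes LoadParams skip any such entry:
+    # GetInputIndex returns a negative index for names that are not graph
+    # inputs, and LoadParams now skips those entries.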
+ y_in = np.ones((1,)).astype("float32") + params = {"y": y_in} + mod = tvm.IRModule() + x = relay.var("x", shape=(1, 10)) + y = relay.var("y", shape=(1,)) + xcb = compiler_begin(x, "ccompiler") + ycb = compiler_begin(y, "ccompiler") + z = relay.add(xcb, ycb) + zce = compiler_end(z, "ccompiler") + mod["main"] = relay.Function([x, y], zce) + mod["main"] = bind_params_by_name(mod["main"], params) + mod = transform.PartitionGraph()(mod) + + graph_module = relay.build(mod, target="llvm", params=params) + lib = update_lib(graph_module.get_lib()) + rt_mod = tvm.contrib.graph_runtime.create(graph_module.get_json(), lib, tvm.cpu(0)) + rt_mod.load_params(runtime.save_param_dict(graph_module.get_params())) + + if __name__ == "__main__": test_multi_node_subgraph() test_extern_gcc_single_op() @@ -337,3 +365,4 @@ def test_extern_dnnl_const(): test_extern_gcc_consts() test_extern_dnnl() test_extern_dnnl_const() + test_load_params_with_constants_in_ext_codegen() diff --git a/tests/python/unittest/test_runtime_graph.py b/tests/python/unittest/test_runtime_graph.py index 16e9db42cba33..fe33c0f4f999e 100644 --- a/tests/python/unittest/test_runtime_graph.py +++ b/tests/python/unittest/test_runtime_graph.py @@ -20,6 +20,7 @@ import numpy as np import json from tvm import rpc +from tvm import relay from tvm.contrib import utils, graph_runtime @@ -82,8 +83,6 @@ def check_remote(): np.testing.assert_equal(out.asnumpy(), a + 1) def check_sharing(): - from tvm import relay - x = relay.var("x", shape=(1, 10)) y = relay.var("y", shape=(1, 10)) z = relay.add(x, y) @@ -120,5 +119,26 @@ def check_sharing(): check_sharing() +def test_load_unexpected_params(): + # Test whether graph_runtime.load_params works if parameters + # are provided that are not an expected input. + mod = tvm.IRModule() + params = {} + x = relay.var("x", shape=(1, 10)) + y = relay.var("y", shape=(1, 10)) + z = relay.add(x, y) + mod["main"] = relay.Function([x, y], z) + + graph_module = relay.build(mod, target="llvm", params=params) + rt_mod = tvm.contrib.graph_runtime.create( + graph_module.get_json(), graph_module.get_lib(), tvm.cpu(0) + ) + + new_params = graph_module.get_params() + new_params.update({"y_unknown": np.ones((1,)).astype("float32")}) + rt_mod.load_params(runtime.save_param_dict(new_params)) + + if __name__ == "__main__": test_graph_simple() + test_load_unexpected_params() From f09f02e575b2bd1d9187a4ff2eb178d49fd3dd22 Mon Sep 17 00:00:00 2001 From: Christoph Gerum Date: Tue, 23 Mar 2021 09:57:15 +0100 Subject: [PATCH 58/84] [TORCH] Implement avg_pool1d (#7694) * [TORCH] Implement avg_pool1d * [TORCH] Unify creation of avg_pooling operations * [TORCH] Add tests for avg pooling with padding * [TORCH] Make format checks happy with unified avg_pool --- python/tvm/relay/frontend/pytorch.py | 84 ++++++++++--------- tests/python/frontend/pytorch/test_forward.py | 28 ++++++- 2 files changed, 72 insertions(+), 40 deletions(-) diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py index 8ae1e862ffd52..cb9ea6a043f45 100644 --- a/python/tvm/relay/frontend/pytorch.py +++ b/python/tvm/relay/frontend/pytorch.py @@ -1353,47 +1353,54 @@ def softplus(self, inputs, input_types): beta = _expr.const(float(inputs[1]), dtype=dtype) return _op.log(_op.exp(inputs[0] * beta) + _expr.const(1.0, dtype=dtype)) / beta - def avg_pool2d(self, inputs, input_types): - data = inputs[0] - - pool_size = self.convert_const_list(inputs[1]) - strides = self.convert_const_list(inputs[2] if inputs[2] else pool_size) - padding = inputs[3] - 
ceil_mode = int(inputs[4]) - count_include_pad = int(inputs[5]) - - def func(x): - return _op.nn.avg_pool2d( - x, - pool_size=pool_size, - strides=strides, - padding=padding, - ceil_mode=ceil_mode, - count_include_pad=count_include_pad, - ) + def make_avg_pool(self, dim): + def avg_pool(inputs, input_types): + data = inputs[0] - if self.is_quantized_tensor(data): - return qnn_torch.apply_with_upcast(data, func) + pool_size = self.convert_const_list(inputs[1]) + strides = self.convert_const_list(inputs[2] if inputs[2] else pool_size) + padding = inputs[3] + ceil_mode = int(inputs[4]) + count_include_pad = int(inputs[5]) - return func(data) + def func(x): + if dim == 1: + return _op.nn.avg_pool1d( + x, + pool_size=pool_size, + strides=strides, + padding=padding, + ceil_mode=ceil_mode, + count_include_pad=count_include_pad, + ) + elif dim == 2: + return _op.nn.avg_pool2d( + x, + pool_size=pool_size, + strides=strides, + padding=padding, + ceil_mode=ceil_mode, + count_include_pad=count_include_pad, + ) + elif dim == 3: + return _op.nn.avg_pool3d( + x, + pool_size=pool_size, + strides=strides, + padding=padding, + ceil_mode=ceil_mode, + count_include_pad=count_include_pad, + ) + else: + msg = "Average Pooling dimension should be between 1 and 3" + raise RuntimeError(msg) - def avg_pool3d(self, inputs, input_types): - data = inputs[0] + if self.is_quantized_tensor(data): + return qnn_torch.apply_with_upcast(data, func) - pool_size = inputs[1] - strides = inputs[2] if inputs[2] else pool_size - padding = inputs[3] - ceil_mode = int(inputs[4]) - count_include_pad = int(inputs[5]) + return func(data) - return _op.nn.avg_pool3d( - data, - pool_size=pool_size, - strides=strides, - padding=padding, - ceil_mode=ceil_mode, - count_include_pad=count_include_pad, - ) + return avg_pool def linear(self, inputs, input_types): # https://pytorch.org/docs/stable/nn.functional.html#linear @@ -2350,8 +2357,9 @@ def create_convert_map(self): "aten::log_softmax": self.log_softmax, "aten::sigmoid": self.sigmoid, "aten::softplus": self.softplus, - "aten::avg_pool2d": self.avg_pool2d, - "aten::avg_pool3d": self.avg_pool3d, + "aten::avg_pool1d": self.make_avg_pool(1), + "aten::avg_pool2d": self.make_avg_pool(2), + "aten::avg_pool3d": self.make_avg_pool(3), "aten::linear": self.linear, "aten::dropout": self.dropout, "aten::dropout_": self.dropout, diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py index d0edfd9c80364..572aa472c5400 100644 --- a/tests/python/frontend/pytorch/test_forward.py +++ b/tests/python/frontend/pytorch/test_forward.py @@ -809,7 +809,24 @@ def forward(self, *args): @tvm.testing.uses_gpu -def test_forward_avgpool(): +def test_forward_avgpool1d(): + torch.set_grad_enabled(False) + input_shape = [1, 3, 10] + + class AvgPool1D2(Module): + def forward(self, *args): + return torch.nn.functional.avg_pool1d(args[0], kernel_size=[10]) + + input_data = torch.rand(input_shape).float() + verify_model(torch.nn.AvgPool1d(kernel_size=[10]).eval(), input_data=input_data) + verify_model(AvgPool1D2().float().eval(), input_data=input_data) + verify_model( + torch.nn.AvgPool1d(kernel_size=[5], stride=2, padding=2).eval(), input_data=input_data + ) + + +@tvm.testing.uses_gpu +def test_forward_avgpool2d(): torch.set_grad_enabled(False) input_shape = [1, 3, 10, 10] @@ -820,6 +837,9 @@ def forward(self, *args): input_data = torch.rand(input_shape).float() verify_model(torch.nn.AvgPool2d(kernel_size=[10, 10]).eval(), input_data=input_data) 
 verify_model(AvgPool2D2().float().eval(), input_data=input_data)
+    verify_model(
+        torch.nn.AvgPool2d(kernel_size=5, stride=2, padding=2).eval(), input_data=input_data
+    )
 
 
 @tvm.testing.uses_gpu
@@ -834,6 +854,9 @@ def forward(self, *args):
     input_data = torch.rand(input_shape).float()
     verify_model(torch.nn.AvgPool3d(kernel_size=[10, 10, 10]).eval(), input_data=input_data)
     verify_model(AvgPool3D1().float().eval(), input_data=input_data)
+    verify_model(
+        torch.nn.AvgPool3d(kernel_size=5, stride=2, padding=2).eval(), input_data=input_data
+    )
 
 
 @tvm.testing.uses_gpu
@@ -3838,7 +3861,8 @@ def test_fn(is_sorted, return_inverse, return_counts):
     test_forward_logsoftmax()
     test_forward_sigmoid()
     test_forward_dense()
-    test_forward_avgpool()
+    test_forward_avgpool1d()
+    test_forward_avgpool2d()
     test_forward_avgpool3d()
     test_forward_dropout()
     test_forward_slice()

From 37e6df1a2654c3a06f3bdfb36fb107fa7a8265eb Mon Sep 17 00:00:00 2001
From: Egor Churaev 
Date: Tue, 23 Mar 2021 16:39:10 +0300
Subject: [PATCH 59/84] [METAL] Fix memory leaks in Metal runtime (#7714)

* [METAL] Fix memory leaks in Metal runtime

1. When we build the runtime without ARC, we can have problems with memory
   releasing. Because some Objective-C methods return autoreleased pointers,
   we should use `autoreleasepool` blocks to bound the life cycle of these
   pointers.
2. Added a workaround for a problem with the work group size. Sometimes the
   auto scheduler generates parameters whose work group size is larger than
   the device allows, and in this case we got an assert from the Metal
   library. Added a check for this situation, which avoids the assert.
3. Fixed a memory leak when filling a tensor with random data. The
   DLManagedTensor increases the reference counter of the NDArray, but nobody
   deleted the DLManagedTensor properly, so the memory allocated by the
   NDArray was never released.
4. Removed unnecessary retains. It is not necessary to use retain in some
   places where it was used, because we build the Metal runtime without ARC.

* Use const_cast instead of creating a DLManagedTensor
---
 .../contrib/random/mt_random_engine.cc        |   5 +-
 src/runtime/metal/metal_device_api.mm         | 258 ++++++++++--------
 src/runtime/metal/metal_module.mm             |  88 +++---
 3 files changed, 189 insertions(+), 162 deletions(-)

diff --git a/src/runtime/contrib/random/mt_random_engine.cc b/src/runtime/contrib/random/mt_random_engine.cc
index 699f6bbcf3762..81f46b2dd5d5f 100644
--- a/src/runtime/contrib/random/mt_random_engine.cc
+++ b/src/runtime/contrib/random/mt_random_engine.cc
@@ -126,8 +126,9 @@ class RandomEngine {
     } else {
       runtime::NDArray local = runtime::NDArray::Empty(
           std::vector<int64_t>{data->shape, data->shape + data->ndim}, data->dtype, {kDLCPU, 0});
-      FillData(&local.ToDLPack()->dl_tensor, size);
-      runtime::NDArray::CopyFromTo(&local.ToDLPack()->dl_tensor, data);
+      DLTensor* tensor = const_cast<DLTensor*>(local.operator->());
+      FillData(tensor, size);
+      runtime::NDArray::CopyFromTo(tensor, data);
     }
   }
 
diff --git a/src/runtime/metal/metal_device_api.mm b/src/runtime/metal/metal_device_api.mm
index 0169a4c2ec285..3d7abd1340353 100644
--- a/src/runtime/metal/metal_device_api.mm
+++ b/src/runtime/metal/metal_device_api.mm
@@ -30,50 +30,54 @@ namespace metal {
 
 MetalWorkspace* MetalWorkspace::Global() {
-  // NOTE: explicitly use new to avoid exit-time destruction of global state
-  // Global state will be recycled by OS as the process exits.
-  static MetalWorkspace* inst = new MetalWorkspace();
-  return inst;
+  @autoreleasepool {
+    // NOTE: explicitly use new to avoid exit-time destruction of global state
+    // Global state will be recycled by OS as the process exits.
+    static MetalWorkspace* inst = new MetalWorkspace();
+    return inst;
+  }
 }
 
 void MetalWorkspace::GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) {
-  this->Init();
-  size_t index = static_cast<size_t>(ctx.device_id);
-  if (kind == kExist) {
-    *rv = int(index < devices.size());
-    return;
-  }
-  ICHECK_LT(index, devices.size()) << "Invalid device id " << index;
-  switch (kind) {
-    case kMaxThreadsPerBlock: {
-      *rv = static_cast<int>([devices[ctx.device_id] maxThreadsPerThreadgroup].width);
-      break;
+  @autoreleasepool {
+    this->Init();
+    size_t index = static_cast<size_t>(ctx.device_id);
+    if (kind == kExist) {
+      *rv = int(index < devices.size());
+      return;
     }
-    case kWarpSize: {
-      // Set warp size to be 1 for safty reason.
-      *rv = 1;
-      break;
+    ICHECK_LT(index, devices.size()) << "Invalid device id " << index;
+    switch (kind) {
+      case kMaxThreadsPerBlock: {
+        *rv = static_cast<int>([devices[ctx.device_id] maxThreadsPerThreadgroup].width);
+        break;
+      }
+      case kWarpSize: {
+        // Set warp size to be 1 for safty reason.
+        *rv = 1;
+        break;
+      }
+      case kMaxSharedMemoryPerBlock:
+        return;
+      case kComputeVersion:
+        return;
+      case kDeviceName:
+        return;
+      case kMaxClockRate:
+        return;
+      case kMultiProcessorCount:
+        return;
+      case kMaxThreadDimensions:
+        return;
+      case kExist:
+        return;
+      case kMaxRegistersPerBlock:
+        return;
+      case kGcnArch:
+        return;
+      case kApiVersion:
+        return;
     }
-    case kMaxSharedMemoryPerBlock:
-      return;
-    case kComputeVersion:
-      return;
-    case kDeviceName:
-      return;
-    case kMaxClockRate:
-      return;
-    case kMultiProcessorCount:
-      return;
-    case kMaxThreadDimensions:
-      return;
-    case kExist:
-      return;
-    case kMaxRegistersPerBlock:
-      return;
-    case kGcnArch:
-      return;
-    case kApiVersion:
-      return;
   }
 }
 
@@ -106,7 +110,11 @@ int GetWarpSize(id<MTLDevice> dev) {
   ICHECK(f != nil);
   id<MTLComputePipelineState> state = [dev newComputePipelineStateWithFunction:f error:&error_msg];
   ICHECK(state != nil) << [[error_msg localizedDescription] UTF8String];
-  return static_cast<int>(state.threadExecutionWidth);
+  int size = static_cast<int>(state.threadExecutionWidth);
+  [state release];
+  [f release];
+  [lib release];
+  return size;
 }
 
 MetalWorkspace::~MetalWorkspace() {
@@ -127,14 +135,14 @@ int GetWarpSize(id<MTLDevice> dev) {
 #if TARGET_OS_IPHONE
   // on iPhone
   id<MTLDevice> d = MTLCreateSystemDefaultDevice();
-  devices.push_back([d retain]);
-  queues.push_back([[d newCommandQueue] retain]);
+  devices.push_back(d);
+  queues.push_back([d newCommandQueue]);
 #else
   NSArray<id<MTLDevice>>* devs = MTLCopyAllDevices();
   for (size_t i = 0; i < devs.count; ++i) {
     id<MTLDevice> d = [devs objectAtIndex:i];
-    devices.push_back([d retain]);
-    queues.push_back([[d newCommandQueue] retain]);
+    devices.push_back(d);
+    queues.push_back([d newCommandQueue]);
     LOG(INFO) << "Intializing Metal device " << i << ", name=" << [d.name UTF8String];
     warp_size.push_back(GetWarpSize(d));
   }
@@ -147,102 +155,110 @@ int GetWarpSize(id<MTLDevice> dev) {
 
 void* MetalWorkspace::AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment,
                                      DLDataType type_hint) {
-  this->Init();
-  id<MTLDevice> dev = GetDevice(ctx);
-  // GPU memory only
-  MTLResourceOptions storage_mode = MTLResourceStorageModePrivate;
-  /*
-  #if TARGET_OS_IPHONE
-  storage_mode = MTLResourceStorageModeShared;
-  #else
-  storage_mode = MTLResourceStorageModeManaged;
-  #endif
-  */
-  id<MTLBuffer> buf = [dev newBufferWithLength:nbytes options:storage_mode];
-  ICHECK(buf != nil);
-  return (void*)(CFBridgingRetain(buf));
+  @autoreleasepool {
+    this->Init();
+    id<MTLDevice> dev = GetDevice(ctx);
+    // GPU memory only
+    MTLResourceOptions storage_mode = MTLResourceStorageModePrivate;
+    /*
+    #if TARGET_OS_IPHONE
+    storage_mode = MTLResourceStorageModeShared;
+    #else
+    storage_mode = MTLResourceStorageModeManaged;
+    #endif
+    */
+    id<MTLBuffer> buf = [dev newBufferWithLength:nbytes options:storage_mode];
+    ICHECK(buf != nil);
+    return (void*)(buf);
+  }
 }
 
 void MetalWorkspace::FreeDataSpace(TVMContext ctx, void* ptr) {
-  // MTLBuffer PurgeableState should be set to empty before manual
-  // release in order to prevent memory leak
-  [(id<MTLBuffer>)ptr setPurgeableState:MTLPurgeableStateEmpty];
-  // release the ptr.
-  CFRelease(ptr);
+  @autoreleasepool {
+    // MTLBuffer PurgeableState should be set to empty before manual
+    // release in order to prevent memory leak
+    [(id<MTLBuffer>)ptr setPurgeableState:MTLPurgeableStateEmpty];
+    // release the ptr.
+    CFRelease(ptr);
+  }
 }
 
 void MetalWorkspace::CopyDataFromTo(const void* from, size_t from_offset, void* to,
                                     size_t to_offset, size_t size, TVMContext ctx_from,
                                     TVMContext ctx_to, DLDataType type_hint,
                                     TVMStreamHandle stream) {
-  this->Init();
-  ICHECK(stream == nullptr);
-  TVMContext ctx = ctx_from;
-  if (ctx_from.device_type == kDLCPU) ctx = ctx_to;
-  id<MTLCommandQueue> queue = GetCommandQueue(ctx);
-  id<MTLCommandBuffer> cb = [queue commandBuffer];
-  int from_dev_type = static_cast<int>(ctx_from.device_type);
-  int to_dev_type = static_cast<int>(ctx_to.device_type);
+  @autoreleasepool {
+    this->Init();
+    ICHECK(stream == nullptr);
+    TVMContext ctx = ctx_from;
+    if (ctx_from.device_type == kDLCPU) ctx = ctx_to;
+    id<MTLCommandQueue> queue = GetCommandQueue(ctx);
+    id<MTLCommandBuffer> cb = [queue commandBuffer];
+    int from_dev_type = static_cast<int>(ctx_from.device_type);
+    int to_dev_type = static_cast<int>(ctx_to.device_type);
 
-  if (from_dev_type == kDLMetal && to_dev_type == kDLMetal) {
-    ICHECK_EQ(ctx_from.device_id, ctx_to.device_id) << "Metal disallow cross device copy.";
-    id<MTLBlitCommandEncoder> encoder = [cb blitCommandEncoder];
-    [encoder copyFromBuffer:(__bridge id<MTLBuffer>)(from)
-               sourceOffset:from_offset
-                   toBuffer:(__bridge id<MTLBuffer>)(to)destinationOffset:to_offset
-                       size:size];
-    [encoder endEncoding];
-    [cb commit];
-  } else if (from_dev_type == kDLMetal && to_dev_type == kDLCPU) {
-    // copy to a local buffer before get into global buffer.
-    id<MTLBuffer> from_buf = (__bridge id<MTLBuffer>)(from);
-    if (from_buf.storageMode != MTLStorageModeShared) {
-      id<MTLBuffer> temp = MetalThreadEntry::ThreadLocal()->GetTempBuffer(ctx_from, size);
+    if (from_dev_type == kDLMetal && to_dev_type == kDLMetal) {
+      ICHECK_EQ(ctx_from.device_id, ctx_to.device_id) << "Metal disallow cross device copy.";
       id<MTLBlitCommandEncoder> encoder = [cb blitCommandEncoder];
-      [encoder copyFromBuffer:from_buf
+      [encoder copyFromBuffer:(id<MTLBuffer>)(from)
                  sourceOffset:from_offset
-                     toBuffer:temp
-            destinationOffset:0
-                         size:size];
-      [encoder endEncoding];
-      [cb commit];
-      [cb waitUntilCompleted];
-      memcpy(static_cast<char*>(to) + to_offset, static_cast<char*>([temp contents]), size);
-    } else {
-      memcpy(static_cast<char*>(to) + to_offset,
-             static_cast<char*>([from_buf contents]) + from_offset, size);
-    }
-  } else if (from_dev_type == kDLCPU && to_dev_type == kDLMetal) {
-    id<MTLBuffer> to_buf = (__bridge id<MTLBuffer>)(to);
-    if (to_buf.storageMode != MTLStorageModeShared) {
-      id<MTLBuffer> temp = MetalThreadEntry::ThreadLocal()->GetTempBuffer(ctx_to, size);
-      memcpy([temp contents], static_cast<const char*>(from) + from_offset, size);
-      id<MTLBlitCommandEncoder> encoder = [cb blitCommandEncoder];
-      [encoder copyFromBuffer:temp
-                 sourceOffset:0
-                     toBuffer:to_buf
-          destinationOffset:to_offset
+                     toBuffer:(id<MTLBuffer>)(to)destinationOffset:to_offset
                          size:size];
       [encoder endEncoding];
       [cb commit];
-      [cb waitUntilCompleted];
+    } else if (from_dev_type == kDLMetal && to_dev_type == kDLCPU) {
+      // copy to a local buffer before get into global buffer.
+      id<MTLBuffer> from_buf = (id<MTLBuffer>)(from);
+      if (from_buf.storageMode != MTLStorageModeShared) {
+        id<MTLBuffer> temp = MetalThreadEntry::ThreadLocal()->GetTempBuffer(ctx_from, size);
+        id<MTLBlitCommandEncoder> encoder = [cb blitCommandEncoder];
+        [encoder copyFromBuffer:from_buf
+                   sourceOffset:from_offset
+                       toBuffer:temp
+              destinationOffset:0
+                           size:size];
+        [encoder endEncoding];
+        [cb commit];
+        [cb waitUntilCompleted];
+        memcpy(static_cast<char*>(to) + to_offset, static_cast<char*>([temp contents]), size);
+      } else {
+        memcpy(static_cast<char*>(to) + to_offset,
+               static_cast<char*>([from_buf contents]) + from_offset, size);
+      }
+    } else if (from_dev_type == kDLCPU && to_dev_type == kDLMetal) {
+      id<MTLBuffer> to_buf = (id<MTLBuffer>)(to);
+      if (to_buf.storageMode != MTLStorageModeShared) {
+        id<MTLBuffer> temp = MetalThreadEntry::ThreadLocal()->GetTempBuffer(ctx_to, size);
+        memcpy([temp contents], static_cast<const char*>(from) + from_offset, size);
+        id<MTLBlitCommandEncoder> encoder = [cb blitCommandEncoder];
+        [encoder copyFromBuffer:temp
+                   sourceOffset:0
+                       toBuffer:to_buf
+            destinationOffset:to_offset
+                           size:size];
+        [encoder endEncoding];
+        [cb commit];
+        [cb waitUntilCompleted];
+      } else {
+        memcpy(static_cast<char*>([to_buf contents]) + to_offset,
+               static_cast<const char*>(from) + from_offset, size);
+      }
     } else {
-      memcpy(static_cast<char*>([to_buf contents]) + to_offset,
-             static_cast<const char*>(from) + from_offset, size);
+      LOG(FATAL) << "Expect copy from/to Metal or between Metal"
+                 << ", from=" << from_dev_type << ", to=" << to_dev_type;
     }
-  } else {
-    LOG(FATAL) << "Expect copy from/to Metal or between Metal"
-               << ", from=" << from_dev_type << ", to=" << to_dev_type;
   }
 }
 
 void MetalWorkspace::StreamSync(TVMContext ctx, TVMStreamHandle stream) {
-  ICHECK(stream == nullptr);
-  // commit an empty command buffer and wait until it completes.
-  id<MTLCommandQueue> queue = GetCommandQueue(ctx);
-  id<MTLCommandBuffer> cb = [queue commandBuffer];
-  [cb commit];
-  [cb waitUntilCompleted];
+  @autoreleasepool {
+    ICHECK(stream == nullptr);
+    // commit an empty command buffer and wait until it completes.
+    id<MTLCommandQueue> queue = GetCommandQueue(ctx);
+    id<MTLCommandBuffer> cb = [queue commandBuffer];
+    [cb commit];
+    [cb waitUntilCompleted];
+  }
 }
 
 void* MetalWorkspace::AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) {
@@ -269,10 +285,10 @@ int GetWarpSize(id<MTLDevice> dev) {
   if (temp_buffer_[ctx.device_id] == nil || temp_buffer_[ctx.device_id].length < size) {
     id<MTLDevice> dev = MetalWorkspace::Global()->GetDevice(ctx);
     if (temp_buffer_[ctx.device_id] != nil) {
+      [temp_buffer_[ctx.device_id] setPurgeableState:MTLPurgeableStateEmpty];
       [temp_buffer_[ctx.device_id] release];
     }
-    temp_buffer_[ctx.device_id] = [[dev newBufferWithLength:size
-                                                    options:MTLStorageModeShared] retain];
+    temp_buffer_[ctx.device_id] = [dev newBufferWithLength:size options:MTLStorageModeShared];
   }
   return temp_buffer_[ctx.device_id];
 }
diff --git a/src/runtime/metal/metal_module.mm b/src/runtime/metal/metal_module.mm
index 8f1fde86f0749..c7e2d8b7b4bc7 100644
--- a/src/runtime/metal/metal_module.mm
+++ b/src/runtime/metal/metal_module.mm
@@ -113,7 +113,6 @@ void SaveToBinary(dmlc::Stream* stream) final {
         LOG(FATAL) << "Fail to compile metal lib:" << [[err_msg localizedDescription] UTF8String];
       }
     }
-    [e.lib retain];
   }
 
   id<MTLFunction> f = [e.lib newFunctionWithName:[NSString stringWithUTF8String:func_name.c_str()]];
@@ -123,11 +122,13 @@ void SaveToBinary(dmlc::Stream* stream) final {
   ICHECK(state != nil) << "cannot get state:"
                        << " for function " << func_name
                        << [[error_msg localizedDescription] UTF8String];
+  [f release];
   // The state.threadExecutionWidth can change dynamically according
   // to the resource constraint in kernel, so it is not strictly hold
   // Turn of warp aware optimziation for now.
   // ICHECK_EQ(state.threadExecutionWidth, w->warp_size[device_id]);
-  e.smap[func_name] = [state retain];
+  if (e.smap[func_name] != nil) [e.smap[func_name] release];
+  e.smap[func_name] = state;
   return state;
 }
 
@@ -181,31 +182,36 @@ void Init(MetalModuleNode* m, ObjectPtr<Object> sptr, const std::string& func_na
   }
   // invoke the function with void arguments
   void operator()(TVMArgs args, TVMRetValue* rv, const ArgUnion64* pack_args) const {
-    metal::MetalThreadEntry* t = metal::MetalThreadEntry::ThreadLocal();
-    int device_id = t->context.device_id;
-    if (scache_[device_id] == nil) {
-      scache_[device_id] = m_->GetPipelineState(device_id, func_name_);
-    }
-    ThreadWorkLoad wl = thread_axis_cfg_.Extract(args);
-    id<MTLCommandQueue> queue = w_->GetCommandQueue(t->context);
-    id<MTLCommandBuffer> cb = [queue commandBuffer];
-    id<MTLComputeCommandEncoder> encoder = [cb computeCommandEncoder];
-    [encoder setComputePipelineState:scache_[device_id]];
-    for (size_t i = 0; i < num_buffer_args_; ++i) {
-      void* buf = args[static_cast<int>(i)];
-      [encoder setBuffer:(__bridge id<MTLBuffer>)(buf) offset:0 atIndex:i];
-    }
-    if (num_pack_args_ != 0) {
-      [encoder setBytes:pack_args
-                 length:num_pack_args_ * sizeof(ArgUnion64)
-                atIndex:num_buffer_args_];
+    @autoreleasepool {
+      metal::MetalThreadEntry* t = metal::MetalThreadEntry::ThreadLocal();
+      int device_id = t->context.device_id;
+      if (scache_[device_id] == nil) {
+        scache_[device_id] = m_->GetPipelineState(device_id, func_name_);
+      }
+      ThreadWorkLoad wl = thread_axis_cfg_.Extract(args);
+      int blockSize = wl.block_dim(0) * wl.block_dim(1) * wl.block_dim(2);
+      auto maxTotalThreadsPerThreadgroup = scache_[device_id].maxTotalThreadsPerThreadgroup;
+      CHECK_LE(blockSize, maxTotalThreadsPerThreadgroup);
+      id<MTLCommandQueue> queue = w_->GetCommandQueue(t->context);
+      id<MTLCommandBuffer> cb = [queue commandBuffer];
+      id<MTLComputeCommandEncoder> encoder = [cb computeCommandEncoder];
+      [encoder setComputePipelineState:scache_[device_id]];
+      for (size_t i = 0; i < num_buffer_args_; ++i) {
+        void* buf = args[static_cast<int>(i)];
+        [encoder setBuffer:(id<MTLBuffer>)(buf) offset:0 atIndex:i];
+      }
+      if (num_pack_args_ != 0) {
+        [encoder setBytes:pack_args
+                   length:num_pack_args_ * sizeof(ArgUnion64)
+                  atIndex:num_buffer_args_];
+      }
+      // launch
+      MTLSize dimGrid = MTLSizeMake(wl.grid_dim(0), wl.grid_dim(1), wl.grid_dim(2));
+      MTLSize dimBlock = MTLSizeMake(wl.block_dim(0), wl.block_dim(1), wl.block_dim(2));
+      [encoder dispatchThreadgroups:dimGrid threadsPerThreadgroup:dimBlock];
+      [encoder endEncoding];
+      [cb commit];
     }
-    // launch
-    MTLSize dimGrid = MTLSizeMake(wl.grid_dim(0), wl.grid_dim(1), wl.grid_dim(2));
-    MTLSize dimBlock = MTLSizeMake(wl.block_dim(0), wl.block_dim(1), wl.block_dim(2));
-    [encoder dispatchThreadgroups:dimGrid threadsPerThreadgroup:dimBlock];
-    [encoder endEncoding];
-    [cb commit];
   }
 
  private:
@@ -230,23 +236,27 @@ void operator()(TVMArgs args, TVMRetValue* rv, const ArgUnion64* pack_args) cons
 
 PackedFunc MetalModuleNode::GetFunction(const std::string& name,
                                         const ObjectPtr<Object>& sptr_to_self) {
-  ICHECK_EQ(sptr_to_self.get(), this);
-  ICHECK_NE(name, symbol::tvm_module_main) << "Device function do not have main";
-  auto it = fmap_.find(name);
-  if (it == fmap_.end()) return PackedFunc();
-  const FunctionInfo& info = it->second;
-  MetalWrappedFunc f;
-  size_t num_buffer_args = NumBufferArgs(info.arg_types);
-  f.Init(this, sptr_to_self, name, num_buffer_args, info.arg_types.size() - num_buffer_args,
-         info.thread_axis_tags);
-  return PackFuncNonBufferArg(f, info.arg_types);
+  @autoreleasepool {
+    ICHECK_EQ(sptr_to_self.get(), this);
+    ICHECK_NE(name, symbol::tvm_module_main) << "Device function do not have main";
+    auto it = fmap_.find(name);
+    if (it == fmap_.end()) return PackedFunc();
+    const FunctionInfo& info = it->second;
+    MetalWrappedFunc f;
+    size_t num_buffer_args = NumBufferArgs(info.arg_types);
+    f.Init(this, sptr_to_self, name, num_buffer_args, info.arg_types.size() - num_buffer_args,
+           info.thread_axis_tags);
+    return PackFuncNonBufferArg(f, info.arg_types);
+  }
 }
 
 Module MetalModuleCreate(std::string data, std::string fmt,
                          std::unordered_map<std::string, FunctionInfo> fmap, std::string source) {
-  metal::MetalWorkspace::Global()->Init();
-  auto n = make_object<MetalModuleNode>(data, fmt, fmap, source);
-  return Module(n);
+  @autoreleasepool {
+    metal::MetalWorkspace::Global()->Init();
+    auto n = make_object<MetalModuleNode>(data, fmt, fmap, source);
+    return Module(n);
+  }
 }
 
 // Load module from module.

From f88c2be21e3c268713f0772274ca206ed35da784 Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar 
Date: Tue, 23 Mar 2021 09:50:37 -0700
Subject: [PATCH 60/84] [microTVM] Update nrfjprog on reference virtual
 machine (#7723)

* update nrfjprog and integration test
* merge
* Revert "merge"

This reverts commit 58d5d9187448e6580b6b780821eb2ea42ec34e8e.

* fix comments
* fix clang
* revert format
* new line
* format
---
 apps/microtvm/reference-vm/base-box-tool.py   | 28 +++++++++++++++----
 .../reference-vm/zephyr/base-box/setup.sh     | 15 ++++++----
 .../zephyr/base-box/test-config.json          | 14 ++++++++--
 3 files changed, 44 insertions(+), 13 deletions(-)
 mode change 100755 => 100644 apps/microtvm/reference-vm/base-box-tool.py

diff --git a/apps/microtvm/reference-vm/base-box-tool.py b/apps/microtvm/reference-vm/base-box-tool.py
old mode 100755
new mode 100644
index 0e82dc2e9c0ee..dbf05f016f67a
--- a/apps/microtvm/reference-vm/base-box-tool.py
+++ b/apps/microtvm/reference-vm/base-box-tool.py
@@ -42,6 +42,12 @@
     "vmware_desktop",
 )
 
+# List of microTVM platforms for testing.
+ALL_MICROTVM_PLATFORMS = (
+    "stm32f746xx",
+    "nrf5340dk",
+)
+
 
 def parse_virtualbox_devices():
     output = subprocess.check_output(["VBoxManage", "list", "usbhost"], encoding="utf-8")
@@ -109,6 +115,7 @@ def attach_virtualbox(uuid, vid_hex=None, pid_hex=None, serial=None):
                 if serial is not None:
                     rule_args.extend(["--serialnumber", serial])
                 subprocess.check_call(rule_args)
+                # TODO(mehrdadh): skip usb attach if it's already attached
                 subprocess.check_call(["VBoxManage", "controlvm", uuid, "usbattach", dev["UUID"]])
                 return
 
@@ -308,13 +315,17 @@ def test_command(args):
     test_config_file = os.path.join(base_box_dir, "test-config.json")
     with open(test_config_file) as f:
         test_config = json.load(f)
+
+    # select microTVM test platform
+    microtvm_test_platform = test_config[args.microtvm_platform]
+
     for key, expected_type in REQUIRED_TEST_CONFIG_KEYS.items():
-        assert key in test_config and isinstance(
-            test_config[key], expected_type
+        assert key in microtvm_test_platform and isinstance(
+            microtvm_test_platform[key], expected_type
         ), f"Expected key {key} of type {expected_type} in {test_config_file}: {test_config!r}"
 
-    test_config["vid_hex"] = test_config["vid_hex"].lower()
-    test_config["pid_hex"] = test_config["pid_hex"].lower()
+    microtvm_test_platform["vid_hex"] = microtvm_test_platform["vid_hex"].lower()
+    microtvm_test_platform["pid_hex"] = microtvm_test_platform["pid_hex"].lower()
 
     providers = args.provider
     provider_passed = {p: False for p in providers}
@@ -331,7 +342,7 @@ def test_command(args):
             release_test_dir, user_box_dir, base_box_dir, provider_name
         )
         do_run_release_test(
-            release_test_dir, provider_name, test_config, args.test_device_serial
+            release_test_dir, provider_name, microtvm_test_platform, args.test_device_serial
        )
         provider_passed[provider_name] = True
 
@@ -444,6 +455,13 @@ def parse_args():
         ),
     )
 
+    parser.add_argument(
+        "--microtvm-platform",
+        default="stm32f746xx",
+        choices=ALL_MICROTVM_PLATFORMS,
+        help="For use with 'test' command. MicroTVM platform that is used for testing.",
+    )
+
     return parser.parse_args()
 
 
diff --git a/apps/microtvm/reference-vm/zephyr/base-box/setup.sh b/apps/microtvm/reference-vm/zephyr/base-box/setup.sh
index 52af947c3e89e..7299ceae6ad81 100644
--- a/apps/microtvm/reference-vm/zephyr/base-box/setup.sh
+++ b/apps/microtvm/reference-vm/zephyr/base-box/setup.sh
@@ -59,17 +59,22 @@ sudo apt install -y llvm
 sudo apt install -y protobuf-compiler libprotoc-dev
 
 # nrfjprog
+NRF_COMMANDLINE_TOOLS_FILE=nRFCommandLineToolsLinuxamd64.tar.gz
+NRF_COMMANDLINE_TOOLS_URL=https://www.nordicsemi.com/-/media/Software-and-other-downloads/Desktop-software/nRF-command-line-tools/sw/Versions-10-x-x/10-12-1/nRFCommandLineTools10121Linuxamd64.tar.gz
+NRF_COMMANDLINE_TOOLS_INSTALLER=nRF-Command-Line-Tools_10_12_1_Linux-amd64.deb
+JLINK_LINUX_INSTALLER=JLink_Linux_V688a_x86_64.deb
+
 cd ~
 mkdir -p nrfjprog
-wget --no-verbose -O nRFCommandLineTools1090Linuxamd64.tar.gz https://www.nordicsemi.com/-/media/Software-and-other-downloads/Desktop-software/nRF-command-line-tools/sw/Versions-10-x-x/10-9-0/nRFCommandLineTools1090Linuxamd64tar.gz
+wget --no-verbose -O $NRF_COMMANDLINE_TOOLS_FILE $NRF_COMMANDLINE_TOOLS_URL
 cd nrfjprog
-tar -xzvf ../nRFCommandLineTools1090Linuxamd64.tar.gz
-sudo apt install -y ./JLink_Linux_V680a_x86_64.deb
-sudo apt install -y ./nRF-Command-Line-Tools_10_9_0_Linux-amd64.deb
+tar -xzvf "../${NRF_COMMANDLINE_TOOLS_FILE}"
+sudo apt install -y "./${JLINK_LINUX_INSTALLER}"
+sudo apt install -y "./${NRF_COMMANDLINE_TOOLS_INSTALLER}"
 source ~/.profile
 nrfjprog --help
 cd ..
-rm -rf nrfjprog nRFCommandLineTools1090Linuxamd64.tar.gz
+rm -rf nrfjprog "${NRF_COMMANDLINE_TOOLS_FILE}"
 
 # Zephyr
 pip3 install --user -U west
diff --git a/apps/microtvm/reference-vm/zephyr/base-box/test-config.json b/apps/microtvm/reference-vm/zephyr/base-box/test-config.json
index 78a6bd216e656..0d266797f04d7 100644
--- a/apps/microtvm/reference-vm/zephyr/base-box/test-config.json
+++ b/apps/microtvm/reference-vm/zephyr/base-box/test-config.json
@@ -1,4 +1,12 @@
-{"vid_hex": "0483",
- "pid_hex": "374b",
- "test_cmd": ["pytest", "tests/micro/qemu/test_zephyr.py", "--microtvm-platforms=stm32f746xx"]
+{
+  "stm32f746xx": {
+    "vid_hex": "0483",
+    "pid_hex": "374b",
+    "test_cmd": ["pytest", "tests/micro/qemu/test_zephyr.py", "--microtvm-platforms=stm32f746xx"]
+  },
+  "nrf5340dk": {
+    "vid_hex": "1366",
+    "pid_hex": "1055",
+    "test_cmd": ["pytest", "tests/micro/qemu/test_zephyr.py", "--microtvm-platforms=nrf5340dk"]
+  }
 }

From 6f0a6561593898053cde051fbb4687eef3adec39 Mon Sep 17 00:00:00 2001
From: Tristan Konolige 
Date: Tue, 23 Mar 2021 13:47:53 -0700
Subject: [PATCH 61/84] [FIX] Fix temporary allocation size in threefry (#7709)

* [FIX] Fix temporary allocation size in threefry

* bump sizes
---
 python/tvm/topi/random/kernel.py           |  2 +-
 tests/python/topi/python/test_topi_prng.py | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/python/tvm/topi/random/kernel.py b/python/tvm/topi/random/kernel.py
index 728cd682fa426..a09a5f3f4ae32 100644
--- a/python/tvm/topi/random/kernel.py
+++ b/python/tvm/topi/random/kernel.py
@@ -141,7 +141,7 @@ def mix(a, b, rotation):
         return [x, y]
 
     # temporary buffer for holding the results of _PERMUTATIONS
-    tmp = irb.allocate(out_buf.dtype, out_shape, name="tmp", scope="global")
+    tmp = irb.allocate(out_buf.dtype, out_shape * nwords, name="tmp", scope="global")
     tmp_offset = 0
 
     # Initialize entire key.
It is composed of the original key with one diff --git a/tests/python/topi/python/test_topi_prng.py b/tests/python/topi/python/test_topi_prng.py index 649e5410c147c..102e93f3b245c 100644 --- a/tests/python/topi/python/test_topi_prng.py +++ b/tests/python/topi/python/test_topi_prng.py @@ -87,9 +87,9 @@ def test_threefry_generate(target, ctx): gen = tvm.relay.random.threefry_key(0).data.asnumpy() # check that we can generate some data - a, rands = threefry_generate(target, ctx, gen, (100,)) + a, rands = threefry_generate(target, ctx, gen, (2048,)) assert ( - rands.shape[0] == 100 and len(rands.shape) == 1 + rands.shape[0] == 2048 and len(rands.shape) == 1 ), "Output shape should match requested shape" # check that gen out does not equal input @@ -99,13 +99,13 @@ def test_threefry_generate(target, ctx): gen = np.array( [0, 0, 0, 0, 0, 0, 0, 2 ** 64 - 2, 1 << 63, 0], dtype="uint64" ) # make counter large - a, rands = threefry_generate(target, ctx, gen, (100,)) + a, rands = threefry_generate(target, ctx, gen, (2048,)) assert gen[4] != a[4], "Overflow of counter should trigger path change" - assert a[7] == 100, "Overflow of counter should still update counter" + assert a[7] == 2048, "Overflow of counter should still update counter" # check generate with path at length limit gen = np.array([0, 0, 0, 0, 0, 0, 0, 2 ** 64 - 2, 0, 0], dtype="uint64") # make counter large - a, rands = threefry_generate(target, ctx, gen, (100,)) + a, rands = threefry_generate(target, ctx, gen, (2048,)) assert ( gen[0:4] != a[0:4] ).any(), "Overflowing counter with no space left in path should change state" From 813136401a11a49d6c15e6013c34dd822a5c4ff6 Mon Sep 17 00:00:00 2001 From: Matthew Brookhart Date: Tue, 23 Mar 2021 20:40:32 -0600 Subject: [PATCH 62/84] [ONNX] Onnx node tests (#7720) * WIP * some fixes * more fixes * fix some conv_transpose tests * fix out of bounds slice * fix flatten import * fix logsoftmax and softmax tests * fix Error in Upsample * fix onehot * normalize errors * fix gather with negative indices * parameterize test * skip unsupported tests * clean up * fix rebase * fix lint * add an error message when we find an un-identified tensor --- python/tvm/relay/frontend/onnx.py | 133 +++++++++++++---- python/tvm/relay/op/transform.py | 7 +- tests/python/frontend/onnx/test_forward.py | 163 +++++++++++++++++++++ 3 files changed, 269 insertions(+), 34 deletions(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index fab4ae889dd7e..d9fc2ff99a765 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -103,10 +103,11 @@ def get_numpy(tensor_proto): def get_type(elem_type): """Converts onnx integer datatype to numpy datatype""" try: - from onnx import TensorProto + from onnx.mapping import TENSOR_TYPE_TO_NP_TYPE except ImportError as e: raise ImportError("Unable to import onnx which is required {}".format(e)) - return TensorProto.DataType.Name(elem_type).lower() + + return str(TENSOR_TYPE_TO_NP_TYPE[elem_type]) def get_info(info_proto): @@ -157,7 +158,7 @@ def revert_caffe2_pad(pads): return pads -def get_pad_pair(input1d, kernel1d, stride1d): +def get_pad_pair(input1d, kernel1d, stride1d, mode): """infer pad size""" if input1d % stride1d == 0: pad = max(kernel1d - stride1d, 0) @@ -165,6 +166,8 @@ def get_pad_pair(input1d, kernel1d, stride1d): pad = max(kernel1d - (input1d % stride1d), 0) pad_before = pad // 2 pad_after = pad - pad_before + if "LOWER" in mode: + return [pad_after, pad_before] return [pad_before, pad_after] @@ -280,9 
+283,9 @@ def _impl_v1(cls, inputs, attr, params): pad_tuple = [] for axis in range(len(input_shape) - 2): axis_shape = input_shape[2 + axis] - stride = attr["strides"][axis] + stride = attr.get("strides", [1] * ndim)[axis] kernel = attr["kernel_shape"][axis] - pad = get_pad_pair(axis_shape, kernel, stride) + pad = get_pad_pair(axis_shape, kernel, stride, attr["auto_pad"]) pad_tuple.append(pad) pad_tuple = tuple([val for pair in zip(*pad_tuple) for val in pair]) attr["pads"] = pad_tuple @@ -444,9 +447,15 @@ class ConvTranspose(OnnxOpConverter): @classmethod def _impl_v1(cls, inputs, attr, params): # get number of channels - channels = infer_channels(inputs[1], True) + out_type = infer_type(inputs[1]) + out_shapes = [get_const_tuple(out_type.checked_type.shape)] + channels = out_shapes[0][1] attr["channels"] = channels groups = attr.get("group", 1) + + if "kernel_shape" not in attr: + attr["kernel_shape"] = out_shapes[0][2:] + attr["groups"] = groups # infer pads for auto_pad data = inputs[0] @@ -528,13 +537,11 @@ def _impl_v1(cls, inputs, attr, params): if not transB: inputs[1] = _op.transpose(inputs[1], axes=(1, 0)) inputs[0] = _op.nn.batch_flatten(inputs[0]) - if alpha != 1.0: inputs[0] *= _expr.const(alpha) out = _op.nn.dense(inputs[0], inputs[1], units=channels) - if len(inputs) == 3: - return _op.nn.bias_add(out, _expr.const(beta) * inputs[2]) + out = out + _expr.const(beta) * inputs[2] return out @@ -618,7 +625,7 @@ def _impl_v1(cls, inputs, attr, params): # Note: attr['fmod'] determines whether the operator should behave like np.fmod or np.mod. # attr['fmod'] == 0 will behave as np.mod and attr['fmod'] == 1 will force fmod treatment. # The relay equivalent of np.fmod is relay.mod and np.mod is relay.floor_mod - if attr["fmod"] == 0: + if attr.get("fmod", 0) == 0: op_name = "floor_mod" else: op_name = "mod" @@ -849,12 +856,18 @@ class Flatten(OnnxOpConverter): @classmethod def _impl_v1(cls, inputs, attr, params): axis = attr.get("axis", 1) + ishape = _op.shape_of(inputs[0]) + ndim = infer_shape(ishape)[0] + if axis < 0: + axis = axis + ndim + if axis == 1: out = _op.nn.batch_flatten(inputs[0]) else: - newshape = [0] * (axis + 1) - newshape[axis] = -1 - out = _op.reshape(inputs[0], list(newshape)) + pre_shape = _op.prod(_op.strided_slice(ishape, [0], [axis], [1]), keepdims=True) + post_shape = _op.prod(_op.strided_slice(ishape, [axis], [ndim], [1]), keepdims=True) + newshape = _op.concatenate([pre_shape, post_shape], axis=0) + out = _op.reshape(inputs[0], newshape) return out @@ -1036,7 +1049,7 @@ def _impl_v9(cls, inputs, attr, params): # in 3d case, we use the purely static op if dims == 5: - if isinstance(scales, _expr.Call): + if isinstance(scales, _expr.Expr): scale_h = _op.take(scales, _op.const(3)) scale_w = _op.take(scales, _op.const(4)) scale_d = _op.take(scales, _op.const(1)) @@ -1052,7 +1065,7 @@ def _impl_v9(cls, inputs, attr, params): ) # in 2d case, use dynamic op else: - if isinstance(scales, _expr.Call): + if isinstance(scales, _expr.Expr): scale_h = _op.take(scales, _op.const(3)) scale_w = _op.take(scales, _op.const(4)) else: @@ -1247,7 +1260,13 @@ class Gather(OnnxOpConverter): @classmethod def _impl_v1(cls, inputs, attr, params): axis = attr.get("axis", 0) - return AttrCvt("take", extras={"axis": axis})(inputs, {}) + data = inputs[0] + indices = inputs[1] + ind_dtype = infer_type(indices).checked_type.dtype + # Normalize the indices to a positive range + s = _op.take(_op.shape_of(data, dtype=ind_dtype), _op.const(axis)) + indices = _op.where(indices < _op.const(0, 
ind_dtype), indices + s, indices) + return _op.take(data, indices, axis) class GatherElements(OnnxOpConverter): @@ -1258,6 +1277,10 @@ def _impl_v1(cls, inputs, attr, params): data = inputs[0] indices = inputs[1] axis = attr.get("axis", 0) + ind_dtype = infer_type(indices).checked_type.dtype + # Normalize the indices to a positive range + s = _op.take(_op.shape_of(data, dtype=ind_dtype), _op.const(axis)) + indices = _op.where(indices < _op.const(0, ind_dtype), indices + s, indices) return _op.gather(data, axis, indices) @@ -1318,8 +1341,8 @@ class Maximum(OnnxOpConverter): @classmethod def _impl_v1(cls, inputs, attr, params): - if not isinstance(inputs, (list, onnx_input)) or len(inputs) < 2: - raise ValueError("Expect minimum 2 inputs") + if len(inputs) == 1: + return inputs[0] _max = inputs[0] for i in range(1, len(inputs)): _max = AttrCvt("maximum")([_max, inputs[i]], {}) @@ -1331,8 +1354,8 @@ class Minimum(OnnxOpConverter): @classmethod def _impl_v1(cls, inputs, attr, params): - if not isinstance(inputs, (list, onnx_input)) or len(inputs) < 2: - raise ValueError("Expect minimum 2 inputs") + if len(inputs) == 1: + return inputs[0] _min = inputs[0] for i in range(1, len(inputs)): _min = AttrCvt("minimum")([_min, inputs[i]], {}) @@ -1344,8 +1367,8 @@ class Mean(OnnxOpConverter): @classmethod def _impl_v1(cls, inputs, attr, params): - if not isinstance(inputs, (list, onnx_input)) or len(inputs) < 2: - raise ValueError("Expect minimum 2 inputs") + if len(inputs) == 1: + return inputs[0] # avoid overflow concat = _op.concatenate([_op.expand_dims(x, axis=0) for x in inputs], axis=0) return _op.mean(concat, axis=0, keepdims=False) @@ -1485,6 +1508,8 @@ class ArgMax(OnnxOpConverter): @classmethod def _impl_v1(cls, inputs, attr, params): + if "select_last_index" in attr: + raise NotImplementedError("select_last_index not supported in ArgMax") axis = attr.get("axis", 0) keepdims = attr.get("keepdims", True) attr = {"axis": axis, "keepdims": keepdims} @@ -1496,6 +1521,8 @@ class ArgMin(OnnxOpConverter): @classmethod def _impl_v1(cls, inputs, attr, params): + if "select_last_index" in attr: + raise NotImplementedError("select_last_index not supported in ArgMin") axis = attr.get("axis", 0) keepdims = attr.get("keepdims", True) attr = {"axis": axis, "keepdims": keepdims} @@ -1510,7 +1537,35 @@ def _impl_v1(cls, inputs, attr, params): # set default value when axis is not set in the model if "axis" not in attr: attr["axis"] = 1 - return AttrCvt("softmax", transforms={"axis": ("axis", 1)})(inputs, attr, params) + axis = attr["axis"] + ndim = len(infer_shape(inputs[0])) + if axis < 0: + axis += ndim + axes = list(range(axis, ndim)) + x = inputs[0] + m = _op.max(x, axes, keepdims=True) + e = _op.exp(x - m) + return e / _op.sum(e, axes, keepdims=True) + + +class LogSoftmax(OnnxOpConverter): + """Operator converter for Softmax.""" + + @classmethod + def _impl_v1(cls, inputs, attr, params): + # set default value when axis is not set in the model + if "axis" not in attr: + attr["axis"] = 1 + axis = attr["axis"] + ndim = len(infer_shape(inputs[0])) + if axis < 0: + axis += ndim + axes = list(range(axis, ndim)) + x = inputs[0] + m = _op.max(x, axes, keepdims=True) + e = _op.exp(x - m) + s = _op.sum(e, axes, keepdims=True) + return x - m - _op.log(s) class OneHot(OnnxOpConverter): @@ -1520,14 +1575,24 @@ class OneHot(OnnxOpConverter): def _impl_v9(cls, inputs, attr, params): # Extract relay one_hot inputs. 
indices, depth, values = inputs + ndim = len(infer_shape(indices)) # Split onnx on off values into two separate expressions. off_value, on_value = _op.take(values, _op.const(0)), _op.take(values, _op.const(1)) # Extract the datatype of the output from on_value. dtype = infer_type(on_value).checked_type.dtype + ind_dtype = infer_type(indices).checked_type.dtype + # Normalize the indices to a positive range + indices = _op.where( + indices < _op.const(0, ind_dtype), indices + _op.cast(depth, ind_dtype), indices + ) # set default value when axis is not set in the model if "axis" not in attr: attr["axis"] = -1 - return _op.one_hot(indices, on_value, off_value, depth, int(attr["axis"]), dtype=dtype) + axis = attr["axis"] + if axis < 0: + axis += ndim + 1 + + return _op.one_hot(indices, on_value, off_value, depth, axis, dtype=dtype) class ConstantOfShape(OnnxOpConverter): @@ -1552,7 +1617,7 @@ class Constant(OnnxOpConverter): @classmethod def _impl_v9(cls, inputs, attr, params): if "value" not in attr: - raise "No Value in Constant" + raise tvm.errors.OpAttributeRequired("no value in Constant") np_value = get_numpy(attr.pop("value")) dtype = np_value.dtype.name value = _expr.const(np_value, dtype) @@ -2042,7 +2107,7 @@ def _impl_v1(cls, inputs, attr, params): largest = attr.get("largest", 1) if largest == 0: - raise ValueError("TVM only supports finding TopK largest elements") + raise NotImplementedError("TVM only supports finding TopK largest elements") return _op.topk(inputs[0], inputs[1], axis=axis, dtype="int64") @@ -2087,7 +2152,7 @@ def _impl_v1(cls, inputs, attr, params): batch_indices = inputs[2] mode = attr.get("mode", b"avg") if mode not in (b"avg", b"max"): - raise ValueError("RoiAlign in Relay only uses avg and max modes") + raise NotImplementedError("RoiAlign in Relay only uses avg and max modes") output_height = attr.get("output_height", 1) output_width = attr.get("output_width", 1) @@ -2128,7 +2193,8 @@ def _impl_v11(cls, inputs, attr, params): result = inputs[0] for i, op in enumerate([_op.tensor.maximum, _op.tensor.minimum]): if i < len(inputs) - 1: - result = op(result, inputs[i + 1]) + if inputs[i + 1] is not None: + result = op(result, inputs[i + 1]) return result @@ -2393,9 +2459,10 @@ def _impl_v10(cls, inputs, attr, params): dtype = infer_type(boxes).checked_type.dtype if "center_point_box" in attr: - assert ( - attr["center_point_box"] == 0 - ), "Only support center_point_box = 0 in onnx importer right now" + if attr["center_point_box"] != 0: + raise NotImplementedError( + "Only support center_point_box = 0 in ONNX NonMaxSuprresion" + ) if iou_threshold is None: iou_threshold = _expr.const(0.0, dtype="float32") @@ -2718,7 +2785,7 @@ def _get_convert_map(opset): "Softplus": Softplus.get_converter(opset), # softmax default axis is different in onnx "Softmax": Softmax.get_converter(opset), - "LogSoftmax": AttrCvt("log_softmax", {"axis": ("axis", 1)}), + "LogSoftmax": LogSoftmax.get_converter(opset), "OneHot": OneHot.get_converter(opset), # 'Hardmax' "Softsign": Softsign.get_converter(opset), @@ -2958,6 +3025,8 @@ def from_onnx(self, graph, opset, get_output_expr=False): for i in node.input: if i != "": inputs[i] = self._nodes[self._renames.get(i, i)] + else: + inputs[i] = None i_name = self._parse_value_proto(node) node_output = self._fix_outputs(op_name, node.output) attr["tvm_custom"] = {} diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py index 4129b610cb7c3..df0ae767460a9 100644 --- a/python/tvm/relay/op/transform.py +++ 
b/python/tvm/relay/op/transform.py @@ -905,10 +905,13 @@ def strided_slice(data, begin, end, strides=None, slice_mode="end"): end = const(list(end)) if isinstance(strides, (tuple, list)): strides = const(list(strides)) - normalized_begin = _make.where( + begin = _make.where( begin < cast_like(const(0), begin), begin + cast_like(shape_of(data), begin), begin ) - return _dyn_make.strided_slice(data, normalized_begin, end, strides, slice_mode) + begin = _make.where( + begin >= cast_like(shape_of(data), begin), cast_like(shape_of(data), begin), begin + ) + return _dyn_make.strided_slice(data, begin, end, strides, slice_mode) return _make.strided_slice(data, begin, end, strides, slice_mode) diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index 5a6216ac705de..ec89a3d844d10 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -4090,6 +4090,169 @@ def verify_cumsum(indata, axis, exclusive=0, reverse=0, type="float32"): verify_cumsum(data, 1, 1, 1, type="int32") +from onnx import numpy_helper + +f = onnx.__file__ +import glob + +onnx_test_folders = sorted(glob.glob("/".join(f.split("/")[0:-1]) + "/backend/test/data/node/*/")) + +unsupported_onnx_tests = [ + "test_basic_convinteger/", + "test_bitshift_left_uint16/", + "test_bitshift_left_uint32/", + "test_bitshift_left_uint64/", + "test_bitshift_left_uint8/", + "test_bitshift_right_uint16/", + "test_bitshift_right_uint32/", + "test_bitshift_right_uint64/", + "test_bitshift_right_uint8/", + "test_cast_DOUBLE_to_FLOAT16/", + "test_cast_FLOAT16_to_DOUBLE/", + "test_cast_FLOAT16_to_FLOAT/", + "test_cast_FLOAT_to_FLOAT16/", + "test_cast_FLOAT_to_STRING/", + "test_cast_STRING_to_FLOAT/", + "test_compress_0/", + "test_compress_1/", + "test_compress_default_axis/", + "test_compress_negative_axis/", + "test_convinteger_with_padding/", + "test_convtranspose_dilations/", + "test_convtranspose_output_shape/", + "test_cumsum_1d/", + "test_cumsum_1d_exclusive/", + "test_cumsum_1d_reverse/", + "test_cumsum_1d_reverse_exclusive/", + "test_cumsum_2d_axis_0/", + "test_cumsum_2d_axis_1/", + "test_cumsum_2d_negative_axis/", + "test_dequantizelinear/", + "test_det_2d/", + "test_det_nd/", + "test_dynamicquantizelinear/", + "test_dynamicquantizelinear_expanded/", + "test_dynamicquantizelinear_max_adjusted/", + "test_dynamicquantizelinear_max_adjusted_expanded/", + "test_dynamicquantizelinear_min_adjusted/", + "test_dynamicquantizelinear_min_adjusted_expanded/", + "test_eyelike_populate_off_main_diagonal/", + "test_eyelike_with_dtype/", + "test_eyelike_without_dtype/", + "test_hardmax_axis_0/", + "test_hardmax_axis_1/", + "test_hardmax_axis_2/", + "test_hardmax_default_axis/", + "test_hardmax_example/", + "test_hardmax_negative_axis/", + "test_hardmax_one_hot/", + "test_isinf_negative/", + "test_isinf_positive/", + "test_lstm_defaults/", + "test_lstm_with_initial_bias/", + "test_lstm_with_peepholes/", + "test_matmulinteger/", + "test_maxpool_2d_dilations/", + "test_maxpool_2d_same_lower/", + "test_maxpool_2d_same_upper/", + "test_maxpool_with_argmax_2d_precomputed_pads/", + "test_maxpool_with_argmax_2d_precomputed_strides/", + "test_maxunpool_export_with_output_shape/", + "test_mvn/", + "test_nonmaxsuppression_center_point_box_format/", + "test_qlinearconv/", + "test_qlinearmatmul_2D/", + "test_qlinearmatmul_3D/", + "test_quantizelinear/", + "test_range_float_type_positive_delta_expanded/", + "test_range_int32_type_negative_delta_expanded/", + 
"test_resize_downsample_scales_cubic/", + "test_resize_downsample_scales_cubic_A_n0p5_exclude_outside/", + "test_resize_downsample_scales_cubic_align_corners/", + "test_resize_downsample_scales_linear/", + "test_resize_downsample_scales_nearest/", + "test_resize_downsample_sizes_cubic/", + "test_resize_downsample_sizes_linear_pytorch_half_pixel/", + "test_resize_downsample_sizes_nearest/", + "test_resize_downsample_sizes_nearest_tf_half_pixel_for_nn/", + "test_resize_tf_crop_and_resize/", + "test_resize_upsample_scales_cubic/", + "test_resize_upsample_scales_cubic_A_n0p5_exclude_outside/", + "test_resize_upsample_scales_cubic_align_corners/", + "test_resize_upsample_scales_cubic_asymmetric/", + "test_resize_upsample_scales_linear/", + "test_resize_upsample_sizes_cubic/", + "test_resize_upsample_sizes_nearest_ceil_half_pixel/", + "test_resize_upsample_sizes_nearest_floor_align_corners/", + "test_resize_upsample_sizes_nearest_round_prefer_ceil_asymmetric/", + "test_reversesequence_batch/", + "test_reversesequence_time/", + "test_rnn_seq_length/", + "test_roialign/", + "test_round/", + "test_scan9_sum/", + "test_scan_sum/", + "test_scatternd/", + "test_selu_default/", + "test_shrink_hard/", + "test_shrink_soft/", + "test_simple_rnn_defaults/", + "test_simple_rnn_with_initial_bias/", + "test_slice_neg_steps/", + "test_slice_start_out_of_bounds/", + "test_strnormalizer_export_monday_casesensintive_lower/", + "test_strnormalizer_export_monday_casesensintive_nochangecase/", + "test_strnormalizer_export_monday_casesensintive_upper/", + "test_strnormalizer_export_monday_empty_output/", + "test_strnormalizer_export_monday_insensintive_upper_twodim/", + "test_strnormalizer_nostopwords_nochangecase/", + "test_tfidfvectorizer_tf_batch_onlybigrams_skip0/", + "test_tfidfvectorizer_tf_batch_onlybigrams_skip5/", + "test_tfidfvectorizer_tf_batch_uniandbigrams_skip5/", + "test_tfidfvectorizer_tf_only_bigrams_skip0/", + "test_tfidfvectorizer_tf_onlybigrams_levelempty/", + "test_tfidfvectorizer_tf_onlybigrams_skip5/", + "test_tfidfvectorizer_tf_uniandbigrams_skip5/", + "test_top_k_smallest/", + "test_unique_not_sorted_without_axis/", + "test_unique_sorted_with_axis/", + "test_unique_sorted_with_axis_3d/", + "test_unique_sorted_with_negative_axis/", + "test_unique_sorted_without_axis/", + "test_unsqueeze_unsorted_axes/", + "test_upsample_nearest/", +] + + +@pytest.mark.parametrize("test", onnx_test_folders) +def test_onnx_nodes(test): + for failure in unsupported_onnx_tests: + if failure in test: + pytest.skip() + break + onnx_model = onnx.load(test + "/model.onnx") + inputs = [] + outputs = [] + for dataset in glob.glob(test + "/*/"): + tensors = sorted(glob.glob(dataset + "/*.pb")) + for tensor in tensors: + new_tensor = onnx.TensorProto() + with open(tensor, "rb") as f: + new_tensor.ParseFromString(f.read()) + if "input" in tensor.split("/")[-1]: + inputs.append(numpy_helper.to_array(new_tensor)) + elif "output" in tensor.split("/")[-1]: + outputs.append(numpy_helper.to_array(new_tensor)) + else: + raise ImportError(str(tensor) + " not labeled as an import or an output") + tvm_val = get_tvm_output_with_vm(onnx_model, inputs, "llvm", tvm.cpu(0)) + if len(outputs) == 1: + tvm.testing.assert_allclose(outputs[0], tvm_val, rtol=1e-5, atol=1e-5) + else: + for output, val in zip(outputs, tvm_val): + tvm.testing.assert_allclose(output, val, rtol=1e-5, atol=1e-5) + + def test_wrong_input(): node = helper.make_node( "Softplus", From 1fe0abc21120de81522fac2f8135cde1b99d4c05 Mon Sep 17 00:00:00 2001 From: CircleSpin 
<2keepconnected@gmail.com> Date: Wed, 24 Mar 2021 11:59:59 -0400 Subject: [PATCH 63/84] [TVMC] Python Scripting Init Files (#7698) * add to init files for clean tvmc python * black reformat init.py * adjust tests to new imports * black test files * tell lint ignore defined-builtin error for tvmc compile * add colon to match lint syntax * change import so must use tvm.driver.tvmc instead of tvm.tvmc Co-authored-by: Jocelyn --- python/tvm/driver/tvmc/__init__.py | 4 ++++ tests/python/driver/tvmc/test_compiler.py | 20 +++++++++----------- tests/python/driver/tvmc/test_frontends.py | 18 ++++++++---------- tests/python/driver/tvmc/test_runner.py | 2 +- 4 files changed, 22 insertions(+), 22 deletions(-) diff --git a/python/tvm/driver/tvmc/__init__.py b/python/tvm/driver/tvmc/__init__.py index d96a725877ebd..d9c15792349aa 100644 --- a/python/tvm/driver/tvmc/__init__.py +++ b/python/tvm/driver/tvmc/__init__.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +# pylint: disable=redefined-builtin """ TVMC - TVM driver command-line interface """ @@ -21,3 +22,6 @@ from . import autotuner from . import compiler from . import runner +from .frontends import load_model as load +from .compiler import compile_model as compile +from .runner import run_module as run diff --git a/tests/python/driver/tvmc/test_compiler.py b/tests/python/driver/tvmc/test_compiler.py index ae859298facde..0180c35d7a267 100644 --- a/tests/python/driver/tvmc/test_compiler.py +++ b/tests/python/driver/tvmc/test_compiler.py @@ -45,7 +45,7 @@ def test_save_dumps(tmpdir_factory): def verify_compile_tflite_module(model, shape_dict=None): pytest.importorskip("tflite") - graph, lib, params, dumps = tvmc.compiler.compile_model( + graph, lib, params, dumps = tvmc.compile( model, target="llvm", dump_code="ll", alter_layout="NCHW", shape_dict=shape_dict ) @@ -74,7 +74,7 @@ def test_compile_tflite_module(tflite_mobilenet_v1_1_quant): def test_cross_compile_aarch64_tflite_module(tflite_mobilenet_v1_1_quant): pytest.importorskip("tflite") - graph, lib, params, dumps = tvmc.compiler.compile_model( + graph, lib, params, dumps = tvmc.compile( tflite_mobilenet_v1_1_quant, target="llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr='+neon'", dump_code="asm", @@ -91,9 +91,7 @@ def test_compile_keras__save_module(keras_resnet50, tmpdir_factory): # some CI environments wont offer tensorflow/Keras, so skip in case it is not present pytest.importorskip("tensorflow") - graph, lib, params, dumps = tvmc.compiler.compile_model( - keras_resnet50, target="llvm", dump_code="ll" - ) + graph, lib, params, dumps = tvmc.compile(keras_resnet50, target="llvm", dump_code="ll") expected_temp_dir = tmpdir_factory.mktemp("saved_output") expected_file_name = "saved.tar" @@ -111,7 +109,7 @@ def test_cross_compile_aarch64_keras_module(keras_resnet50): # some CI environments wont offer tensorflow/Keras, so skip in case it is not present pytest.importorskip("tensorflow") - graph, lib, params, dumps = tvmc.compiler.compile_model( + graph, lib, params, dumps = tvmc.compile( keras_resnet50, target="llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr='+neon'", dump_code="asm", @@ -129,7 +127,7 @@ def verify_compile_onnx_module(model, shape_dict=None): # some CI environments wont offer onnx, so skip in case it is not present pytest.importorskip("onnx") - graph, lib, params, dumps = tvmc.compiler.compile_model( + graph, lib, params, dumps = tvmc.compile( model, 
target="llvm", dump_code="ll", shape_dict=shape_dict ) @@ -158,7 +156,7 @@ def test_cross_compile_aarch64_onnx_module(onnx_resnet50): # some CI environments wont offer onnx, so skip in case it is not present pytest.importorskip("onnx") - graph, lib, params, dumps = tvmc.compiler.compile_model( + graph, lib, params, dumps = tvmc.compile( onnx_resnet50, target="llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr=+neon", dump_code="asm", @@ -176,7 +174,7 @@ def test_cross_compile_aarch64_onnx_module(onnx_resnet50): def test_compile_opencl(tflite_mobilenet_v1_0_25_128): pytest.importorskip("tflite") - graph, lib, params, dumps = tvmc.compiler.compile_model( + graph, lib, params, dumps = tvmc.compile( tflite_mobilenet_v1_0_25_128, target="opencl", target_host="llvm", @@ -197,7 +195,7 @@ def test_compile_opencl(tflite_mobilenet_v1_0_25_128): def test_compile_tflite_module_with_external_codegen(tflite_mobilenet_v1_1_quant): pytest.importorskip("tflite") - graph, lib, params, dumps = tvmc.compiler.compile_model( + graph, lib, params, dumps = tvmc.compile( tflite_mobilenet_v1_1_quant, target="ethos-n77, llvm", dump_code="relay" ) @@ -221,7 +219,7 @@ def test_compile_check_configs_composite_target(mock_pc, mock_fe, mock_ct, mock_ mock_ct.return_value = mock_codegen mock_relay.return_value = mock.MagicMock() - graph, lib, params, dumps = tvmc.compiler.compile_model( + graph, lib, params, dumps = tvmc.compile( "no_file_needed", target="mockcodegen -testopt=value, llvm" ) diff --git a/tests/python/driver/tvmc/test_frontends.py b/tests/python/driver/tvmc/test_frontends.py index 5a63c5c479332..3da63d43ef29c 100644 --- a/tests/python/driver/tvmc/test_frontends.py +++ b/tests/python/driver/tvmc/test_frontends.py @@ -93,7 +93,7 @@ def test_load_model__invalid_path__no_language(): pytest.importorskip("tflite") with pytest.raises(FileNotFoundError): - tvmc.frontends.load_model("not/a/file.tflite") + tvmc.load("not/a/file.tflite") def test_load_model__invalid_path__with_language(): @@ -101,14 +101,14 @@ def test_load_model__invalid_path__with_language(): pytest.importorskip("onnx") with pytest.raises(FileNotFoundError): - tvmc.frontends.load_model("not/a/file.txt", model_format="onnx") + tvmc.load("not/a/file.txt", model_format="onnx") def test_load_model__tflite(tflite_mobilenet_v1_1_quant): # some CI environments wont offer TFLite, so skip in case it is not present pytest.importorskip("tflite") - mod, params = tvmc.frontends.load_model(tflite_mobilenet_v1_1_quant) + mod, params = tvmc.load(tflite_mobilenet_v1_1_quant) assert type(mod) is IRModule assert type(params) is dict # check whether one known value is part of the params dict @@ -149,7 +149,7 @@ def test_load_model__pb(pb_mobilenet_v1_1_quant): # some CI environments wont offer TensorFlow, so skip in case it is not present pytest.importorskip("tensorflow") - mod, params = tvmc.frontends.load_model(pb_mobilenet_v1_1_quant) + mod, params = tvmc.load(pb_mobilenet_v1_1_quant) assert type(mod) is IRModule assert type(params) is dict # check whether one known value is part of the params dict @@ -161,7 +161,7 @@ def test_load_model___wrong_language__to_keras(tflite_mobilenet_v1_1_quant): pytest.importorskip("tensorflow") with pytest.raises(OSError): - tvmc.frontends.load_model(tflite_mobilenet_v1_1_quant, model_format="keras") + tvmc.load(tflite_mobilenet_v1_1_quant, model_format="keras") def test_load_model___wrong_language__to_tflite(keras_resnet50): @@ -179,7 +179,7 @@ def test_load_model___wrong_language__to_onnx(tflite_mobilenet_v1_1_quant): from 
google.protobuf.message import DecodeError with pytest.raises(DecodeError): - tvmc.frontends.load_model(tflite_mobilenet_v1_1_quant, model_format="onnx") + tvmc.load(tflite_mobilenet_v1_1_quant, model_format="onnx") @pytest.mark.skip(reason="https://github.com/apache/tvm/issues/7455") @@ -188,9 +188,7 @@ def test_load_model__pth(pytorch_resnet18): pytest.importorskip("torch") pytest.importorskip("torchvision") - mod, params = tvmc.frontends.load_model( - pytorch_resnet18, shape_dict={"input": [1, 3, 224, 224]} - ) + mod, params = tvmc.load(pytorch_resnet18, shape_dict={"input": [1, 3, 224, 224]}) assert type(mod) is IRModule assert type(params) is dict # check whether one known value is part of the params dict @@ -202,7 +200,7 @@ def test_load_model___wrong_language__to_pytorch(tflite_mobilenet_v1_1_quant): pytest.importorskip("torch") with pytest.raises(RuntimeError) as e: - tvmc.frontends.load_model( + tvmc.load( tflite_mobilenet_v1_1_quant, model_format="pytorch", shape_dict={"input": [1, 3, 224, 224]}, diff --git a/tests/python/driver/tvmc/test_runner.py b/tests/python/driver/tvmc/test_runner.py index 544ed9f7e9df4..5fdf58fa8d642 100644 --- a/tests/python/driver/tvmc/test_runner.py +++ b/tests/python/driver/tvmc/test_runner.py @@ -73,7 +73,7 @@ def test_run_tflite_module__with_profile__valid_input( # some CI environments wont offer TFLite, so skip in case it is not present pytest.importorskip("tflite") - outputs, times = tvmc.runner.run_module( + outputs, times = tvmc.run( tflite_compiled_module_as_tarfile, inputs_file=imagenet_cat, hostname=None, From 63d8e97dfbe046e70c91c72cbbf7da8646824217 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Wed, 24 Mar 2021 10:08:27 -0700 Subject: [PATCH 64/84] =?UTF-8?q?[=C2=B5TVM]=20Rev=20ci-qemu=20to=200.02?= =?UTF-8?q?=20(Introduce=20onnx=20python=20dependency)=20(#7728)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix ci-qemu build, add ONNX * rev ci-qemu to staging --- Jenkinsfile | 2 +- docker/Dockerfile.ci_qemu | 4 ++ docker/install/ubuntu_install_zephyr.sh | 69 +------------------------ 3 files changed, 7 insertions(+), 68 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 506dcab4e3066..8f11bbaa568d2 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -49,7 +49,7 @@ ci_gpu = "tlcpack/ci-gpu:v0.72" ci_cpu = "tlcpack/ci-cpu:v0.72-t0" ci_wasm = "tlcpack/ci-wasm:v0.70" ci_i386 = "tlcpack/ci-i386:v0.72-t0" -ci_qemu = "tlcpack/ci-qemu:v0.01" +ci_qemu = "tlcpack/ci-qemu:v0.02" ci_arm = "tlcpack/ci-arm:v0.02" // <--- End of regex-scanned config. 
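A side note on the Jenkinsfile hunk above before the remaining files of this patch: the `// <--- End of regex-scanned config.` marker means the `ci_*` pins are kept in a fixed shape so that tooling can read them back out by pattern matching. A minimal sketch of such a scanner — the regex, the script, and the assumption of a `Jenkinsfile` in the working directory are illustrative only, not the project's actual tooling:

```python
import re

# Hypothetical scanner: extract the docker image pins from the block that the
# "regex-scanned config" comment delimits. Each pin is a `name = "image"` line.
PIN = re.compile(r'^(ci_\w+) = "([^"]+)"', re.MULTILINE)

with open("Jenkinsfile") as f:
    pins = dict(PIN.findall(f.read()))

print(pins.get("ci_qemu"))  # e.g. tlcpack/ci-qemu:v0.02 after this patch
```

Keeping the pins on single, uniformly formatted lines is what makes a regex this simple reliable, which is presumably why the block is fenced off with begin/end comments.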
diff --git a/docker/Dockerfile.ci_qemu b/docker/Dockerfile.ci_qemu index 9120f91cc69f6..04434b7186b07 100644 --- a/docker/Dockerfile.ci_qemu +++ b/docker/Dockerfile.ci_qemu @@ -64,3 +64,7 @@ RUN bash /install/ubuntu_install_qemu.sh COPY install/ubuntu_install_zephyr.sh /install/ubuntu_install_zephyr.sh RUN bash /install/ubuntu_install_zephyr.sh ENV ZEPHYR_BASE=/opt/zephyrproject/zephyr + +# Install ONNX +COPY install/ubuntu_install_onnx.sh /install/ubuntu_install_onnx.sh +RUN bash /install/ubuntu_install_onnx.sh diff --git a/docker/install/ubuntu_install_zephyr.sh b/docker/install/ubuntu_install_zephyr.sh index e10ff48212d15..1654ab9d1c095 100644 --- a/docker/install/ubuntu_install_zephyr.sh +++ b/docker/install/ubuntu_install_zephyr.sh @@ -33,73 +33,8 @@ sudo apt-get install -y --no-install-recommends \ python3-dev python3-pip python3-setuptools python3-tk python3-wheel python3-venv \ xz-utils file make gcc gcc-multilib g++-multilib apt-transport-https -cat </dev/null ------BEGIN PGP PUBLIC KEY BLOCK----- - -mQINBF0bjnMBEADVgQr04Lg258KpWi42rzGemFGkzHCx7SXDWVqHApx34HUxF63s -RnknCTt42Thqcv78CJ9WQYjjvT5+FZOlxA+0kwkeatFoKNeVvBkyYFgU6gxSuVQ+ -a1ZEw2IYdqRH+vUC1AKGY88KlrteTAqtqYsaGimiF5ry3y3bLBySyxLHfltCaENy -uKPJEHHvHxTZsZAD3iwVysNZkw2V/V4IS8wy8m9rq1U7OU40KMJ3EUan89DzD1qt -8sroEThsjE9IG6QMf1H9pvNIIz/QhwqSKQkGqt8obdf0W+EB4cef6ka98a+E6slc -Otw2AVB2B47ljnp5AyLwZPiYxeIXPZsO8cZbx1uBOkOZ1OkqHlk4tgJEqg+v6APO -cm625fk4iftsB+U/3MZvm4QH4Y5xfAFb3aDL2zkxN/EUCWW5tUn+Z+RaegGaojTE -N2laH91ncpeZh1M9GPvXGT/efDg3a/Nv9UNUtv9lhNn35VyVgBNaaYwNScq5+ApV -pG8b/j18x8mQR8kk7bXvOXjc/4NdCrY7QcIExA9DTWemLsDVeDM62lBvOKZGED4X -fgGehGGPtu862kf4vvCZKrrEeVkVBrTiOsxFMdHshnKqtQyyJQKXXVjl9//jhMGM -cZHJ5+D9O4JNE/aZC4h2F7hL0NpO0AVGJ0Ly5N7B07yMBZGGJaH4QXCoHwARAQAB -tEVLaXR3YXJlIEFwdCBBcmNoaXZlIEF1dG9tYXRpYyBTaWduaW5nIEtleSAoMjAy -MCkgPGRlYmlhbkBraXR3YXJlLmNvbT6JAlQEEwEKAD4WIQRtkDmVQkqDpI1C1T2o -5e86AmACaAUCXRuOcwIbAwUJBaOagAULCQgHAgYVCgkICwIEFgIDAQIeAQIXgAAK -CRCo5e86AmACaLJ9D/9ly840Ko3F0HgIAAxAeWE7BzQOD09BbnL/is6F0lquXd/W -fZXUbVhONv7Q3FK9IDwzKoYHmRrwo6IpDIsy7AqiHHkWWxCdpIzVWQfE7rFg4UWa -2bNXoFBGRImYmQHaG/02EJiNnTDnsYgN7y9zzAAvz63dnSsm6GOUp9pkIoxHnt9D -WxMlM05GgVRjSeNvi4OLuPE6jHhHvAGGrMS8g9oU6TtCj9WVNryFpROchdmTteS4 -P16FP4n5NczXjYXFch3S+cOfijHnsfuFzB4JanrZ+JlBd21BDfhO/VLFx8+Ljdj0 -axKpwa86oHc5ALnMHPnGM2EVN+NNS88PDBngvJEpRUkECpEy4cwZ3zjCJ0jMeiRv -cFf/FjZBFeqrAapwWNFjIH0El7dJq+XYDKuA0kakMDo6GZlfTNDRobGj9vR/HA0j -/a7VD9tfW2dLr61qsQwynn6S+9B4XY/fYwc4AyYCp+FNm4ONFFjQ6ytCgdLdBEWK -X+xCMifTqDx9nm/1u/95ZqwcayAqwhKDb01hQhSTlozybz8B4trfeHJdXYoH7/s4 -TLnt5R68bc2Fm0ikk4tndSTH4SUtnEeIv+nap5RkCmHI6URJ4P4kFT3C30Ooeafv -GOa18HYxhb/qnU2DvWXL1rnKoKB51p2nhrkjliDPSfMMIf6AgyZSZR4BpMoz47kC -DQRdG49RARAAyX+HK4Xh0RiiqPd0DDbgHV+8UvY1gihObyK/cqpRQzewSKEw/jwM -abwav3oqisI7IFp4FmupqhSi7uqB54eUF44LHeGZiUedZm5pAreX2ygQASr2It3g -kWr58J0ZOas6cRqUzga1mCL1eljfff9T9+1syIWiUWTjXDzEwsMgksHIn9ZGaxM0 -zvkbXfTCmlzmCbvBwokHRrw9cvmXZIKaGdvAEg/S9asmkRBeA/0GgX2Tlr1H6mv5 -0ZYF25t0n2IYiVuvXTOrz9OCuWxv0NQiweMFTi62sN6myjB4PC499ySTQkIhWVsf -2oa5+rvcCg6j3jpUFy4MoDA9cXl47/0ccpim+mwJo4uY4ysIsDq7mKqjN7honj45 -zosvs7yd2UXrKWKay+P1e1vdsSOVP8PSSJCJV8HvdKCRfcYYdlTq3PSeloVrWC/4 -PCKlnp16AzPzL+CBWtj5ruhAFTaoKveEjUnQD5IiKD4hvt9nnX6C9RT2yhKDHnoi -uup+nXOEn78UWxmoPJKu5wE1c5ZZhw81bYByEkLjHt5Bl+FS8CJN3G+56kVuBc9v -Kqa90EThcLr6bIEx3LU3mK3FBxBCh+7xEPYI4Cx/NGyrszirRkzIIM6wIxc983l5 -+BtdKn14b2yDhfw2wOBsxo4aLWhGzqJGQAxuUo4sdNbElT5mpwpDxJEAEQEAAYkE -cgQYAQoAJhYhBG2QOZVCSoOkjULVPajl7zoCYAJoBQJdG49RAhsCBQkFo5qAAkAJ -EKjl7zoCYAJowXQgBBkBCgAdFiEEWbJ5MHJjJForbwaGKR+f9v04V4MFAl0bj1EA 
-CgkQKR+f9v04V4Mz9Q/9E0KmNCJC95HfP46enwASVnBZ7ntlHvtqQgNVZ8r0W69v -qg+FdsKK2109lR3RvRe5TAwHi4ryFW6YazmvH4k7Bd1pGxAtc5VSuehgs8lPGObo -SKI8S9EH+v3G4IAm25vaRDtnVdDpcfn5A6RrSDyTTDjdhyTp9w/f62SfMryf/0fv -yg5HS5JQSHBJdxN6mnRDqM66Ey4plfFbt4yKJIPnj5xsa19wx72Zw8hED1O6FZAV -URQ8ffE521R9wzQAfX3746pdEQ+S21Ht0lEsNjmU/HDq0WeOBElIN6S09XQyL0zG -0HrOZkByI5683v+cp6clJKxnBX7hsR0+4AxQK0+eNJEtLwLPcwObBi2ACeUG49cA -ms+BaSAvjbyCW4M7ye42zdEFbWS9hfK6T5Ry85Pv6IxgpUHAX7kvtqHxguDobuUZ -4CmSdRyBCEAN7dgjWrqrtmq7cF3Kwz5kLwzB0AeQTArLoYlBSlcx/eT/jDLZdFjQ -Ol6uqVdv63BADNriYExz++g4A02LzAfk+C0J/7syKeEs5nonIFwTfrS7VJbcs7Cn -8HkuCPuH9u1nYSJV8U7xYNCbRK3JNBr20IlO+TXAuf7M3z5IuZjED7EtG0kMyl41 -vbBYCFbKMpEEjFAUUO5CsbyL4IoYJRptJij10RsDI9jRY+YfOQ+WxP4txPDv1Eei -eBAAs2PDWG7MvubB1wE3QcRUEQqvDbEIdvRfz9YIOXfGlaDfiuhBpcxsgsDG/IjQ -3c0PnJqpLpivfOMMyfynwPRW4ZiwIUSrOYJ6xhOt3zUzqf/GfIB39pCz3AI0EBxp -uicL4PJ4OeA0V3XT+IEcjbqBaVz5UCS/sVuYTykxwk8BPYaJOFlHtp4kEtn43kpL -kQHPMQCC1+skI85d0YG7Yn1w5qSqtwYJBPFU2OWpyLHtxL55S8dAWmvlkKmA1I6W -WyOPM/Y5WWdG8BUphXmv67wdeVdxp4s5V8oXKy3QQ0FA5Wt/z6l7Ei8tXcOIgDYw -nYgTgjOprZPXOY+L+6gED3YVWUvAJ6xhdYVsJazu3Ulwr4dwkHrBd1qXe7NGA3Ib -7VAkzkPzRtdPJ+OT/YX0vfh3a4VvYepoTAHIf0J6Uo2vcqBFA/Ztiby3bM4T4C30 -c5AqQkLDZ/2UbBW9Yu4f9oiw7/gDdNI7C8xHaQNLFzzRzhjnEpjwBhlpeballXoU -6ShFo6T0CzZ1N46iumJ5nTor40dY2EcX+dXxGCJ2ihifIeHrbx6fKFOB9VLV3VpW -SzLJTT9ARIgvqVg5lhTFiKRiZNp5MAu9NFw5wgyCJxUjASLOWshMwkhKHHe13AZD -2Hxmkp7Qwjg6kihr/j03NQIBhOK+M068Urew/dbndYwIzsI= -=0GnF ------END PGP PUBLIC KEY BLOCK----- - -EOF +wget --no-verbose https://apt.kitware.com/keys/kitware-archive-latest.asc +sudo apt-key add kitware-archive-latest.asc sudo apt-add-repository 'deb https://apt.kitware.com/ubuntu/ bionic main' sudo apt-get update From cfe2e288a331b10e72e10c7e465df375b44e6ae9 Mon Sep 17 00:00:00 2001 From: Rafael Stahl Date: Wed, 24 Mar 2021 19:27:16 +0100 Subject: [PATCH 65/84] [crt] fix heap corruption from bad allocation (#7735) The type of runtime->storage_pool was changed at some point from TVMNDArray to TVMGraphRuntimeStorageEntry. This change was not reflected in the call to the allocation for its buffer. If this unclaimed space is allocated to something else, data corruption will happen. --- src/runtime/crt/graph_runtime/graph_runtime.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/crt/graph_runtime/graph_runtime.c b/src/runtime/crt/graph_runtime/graph_runtime.c index 21b72f0e400c0..a79be56636598 100644 --- a/src/runtime/crt/graph_runtime/graph_runtime.c +++ b/src/runtime/crt/graph_runtime/graph_runtime.c @@ -967,7 +967,7 @@ int TVMGraphRuntime_SetupStorage(TVMGraphRuntime* runtime) { } // Allocate the space. 
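  /* A sketch of why the one-line change below matters: runtime->storage_pool
   * now holds TVMGraphRuntimeStorageEntry records, and that entry type embeds
   * a TVMNDArray among other fields, so it can only be larger:
   *
   *   pool_entry_count * sizeof(TVMNDArray)
   *       <= pool_entry_count * sizeof(TVMGraphRuntimeStorageEntry)
   *
   * Sizing the buffer with the old element type leaves the tail of the pool
   * unclaimed; once the allocator hands that space to something else, writes
   * to the later entries corrupt it, which is the heap corruption described
   * in the commit message. (The entry layout is recalled from the CRT
   * headers, which are not shown in this patch.) */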
-  err = TVMPlatformMemoryAllocate(sizeof(TVMNDArray) * pool_entry_count, alloc_ctx,
+  err = TVMPlatformMemoryAllocate(sizeof(TVMGraphRuntimeStorageEntry) * pool_entry_count, alloc_ctx,
                                   (void**)&runtime->storage_pool);
   if (err != kTvmErrorNoError) {
     fprintf(stderr, "memory allocate error: %08x", err);

From 3ba586803ac7956813177aebf8072e7d7c0ab9b2 Mon Sep 17 00:00:00 2001
From: Siyuan Feng
Date: Thu, 25 Mar 2021 04:44:01 +0800
Subject: [PATCH 66/84] [TensorIR] Fix parser autocompletion mode (#7737)

Co-authored-by: Ruihang Lai
---
 src/tir/ir/script/script_complete.cc          |  26 ++-
 .../unittest/test_tvmscript_complete.py       | 174 ++++++++++++++++++
 .../unittest/test_tvmscript_roundtrip.py      |  11 +-
 3 files changed, 201 insertions(+), 10 deletions(-)
 create mode 100644 tests/python/unittest/test_tvmscript_complete.py

diff --git a/src/tir/ir/script/script_complete.cc b/src/tir/ir/script/script_complete.cc
index 7c9fff724e333..a42b5ea5b3a07 100644
--- a/src/tir/ir/script/script_complete.cc
+++ b/src/tir/ir/script/script_complete.cc
@@ -36,12 +36,15 @@ namespace tir {
 /*! \brief Generate surrounding loops automatically */
 class ScriptCompleter : public StmtMutator {
  public:
-  explicit ScriptCompleter(Map<Var, Buffer>* buffer_var_map) : buffer_var_map_(buffer_var_map) {}
+  explicit ScriptCompleter(Map<Var, Buffer>* buffer_var_map, bool contain_root)
+      : buffer_var_map_(buffer_var_map), contain_root_(contain_root) {}
   /*! \brief Whether the stmt contains at least one block. */
   bool contains_block = false;

  private:
   Map<Var, Buffer>* buffer_var_map_;
+  bool contain_root_;
+  bool visited_root_ = false;
   Stmt VisitStmt_(const BlockRealizeNode* op) override {
     contains_block = true;
     Stmt body = StmtMutator::VisitStmt_(op);
@@ -62,6 +65,8 @@
   }

   Stmt VisitStmt_(const BlockNode* op) override {
+    bool is_root_block = contain_root_ && !visited_root_;
+    visited_root_ = true;
     // Buffers allocated in the block can be accessed by its body.
     for (const auto& alloc_buffer : op->alloc_buffers) {
       buffer_var_map_->Set(alloc_buffer->data, alloc_buffer);
@@ -71,7 +76,15 @@
     for (const auto& alloc_buffer : op->alloc_buffers) {
       buffer_var_map_->erase(alloc_buffer->data);
     }
+    // ignore the root block or blocks which already have read/write regions
     if (block->reads.empty() || block->writes.empty()) {
+      if (op->iter_vars.empty()) {
+        // non-root opaque block is not allowed
+        CHECK(is_root_block)
+            << "ValueError: Can not auto detect buffer access region for an opaque block. Please "
+               "annotate the access region manually.";
+        return std::move(block);
+      }
       auto access_region = GetBlockAccessRegion(block, *buffer_var_map_);
       const Array<BufferRegion>& reads = access_region[0];
       const Array<BufferRegion>& writes = access_region[1];
@@ -80,8 +93,8 @@
            << "ValueError: Can not auto detect buffer access region from tir.Load, tir.Store or "
               "direct access by buffer data. 
Please annotate the access region manually";
       auto n = CopyOnWrite(block.operator->());
-      if (!n->reads.defined()) n->reads = reads;
-      if (!n->writes.defined()) n->writes = writes;
+      if (n->reads.empty()) n->reads = reads;
+      if (n->writes.empty()) n->writes = writes;
       return Block(n);
     } else {
       return std::move(block);
@@ -98,12 +111,13 @@ PrimFunc ScriptComplete(PrimFunc func, const Array<Buffer>& root_allocates) {
   for (const auto& alloc : root_allocates) {
     buffer_var_map.Set(alloc->data, alloc);
   }
-  ScriptCompleter script_completer(&buffer_var_map);
+  bool contain_root = root_allocates.empty() && func->body->IsInstance<BlockRealizeNode>() &&
+                      Downcast<BlockRealize>(func->body)->block->iter_vars.empty();
+  ScriptCompleter script_completer(&buffer_var_map, contain_root);
   // generate surrounding loops automatically
   Stmt res = script_completer(func->body);
   // generate root block automatically
-  if (script_completer.contains_block &&
-      (!res->IsInstance<BlockRealizeNode>() || !root_allocates.empty())) {
+  if (script_completer.contains_block && !contain_root) {
     res = Block({}, {}, {}, "root", res, NullOpt, root_allocates);
     res = BlockRealize({}, Bool(true), Downcast<Block>(res));
   }
diff --git a/tests/python/unittest/test_tvmscript_complete.py b/tests/python/unittest/test_tvmscript_complete.py
new file mode 100644
index 0000000000000..012ccc4b86285
--- /dev/null
+++ b/tests/python/unittest/test_tvmscript_complete.py
@@ -0,0 +1,174 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License. 
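# The tests in this file exercise the completion rules changed in
# script_complete.cc above: access regions are now filled in when a block's
# reads/writes are empty rather than only when undefined, the root block is
# detected up front in ScriptComplete, and an opaque block (one with no
# iter_vars) is only accepted at the root; anywhere else the parser is
# expected to raise a diagnostic asking for a manual access-region annotation.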
+ +import tvm +from tvm import tir +from tvm.ir import Range +from tvm.script import ty, from_source +from tvm.ir.diagnostics import override_renderer + + +@tvm.script.tir +def matmul(a: ty.handle, b: ty.handle, c: ty.handle) -> None: + A = tir.match_buffer(a, [128, 128]) + B = tir.match_buffer(b, [128, 128]) + C = tir.match_buffer(c, [128, 128]) + + with tir.block([128, 128, tir.reduce_axis(0, 128)], "update") as [vi, vj, vk]: + with tir.init(): + C[vi, vj] = tir.float32(0) + C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk] + + +@tvm.script.tir +def matmul_original(a: ty.handle, b: ty.handle, c: ty.handle) -> None: + A = tir.match_buffer(a, [128, 128]) + B = tir.match_buffer(b, [128, 128]) + C = tir.match_buffer(c, [128, 128]) + + for i, j in tir.grid(32, 32): + with tir.block([32, 32], "init") as [vi, vj]: + for ii, jj in tir.grid(4, 4): + C[vi * 4 + ii, vj * 4 + jj] = tir.float32(0) + + for k in range(0, 32): + with tir.block([128, 128, tir.reduce_axis(0, 128)], "update") as [vi, vj, vk]: + for ii, jj, kk in tir.grid(4, 4, 4): + C[vi * 4 + ii, vj * 4 + jj] = ( + C[vi * 4 + ii, vj * 4 + jj] + + A[vi * 4 + ii, vk * 4 + kk] * B[vj * 4 + jj, vk * 4 + kk] + ) + + +@tvm.script.tir +def elementwise_with_root(a: ty.handle, b: ty.handle, c: ty.handle) -> None: + A = tir.match_buffer(a, [128, 128]) + B = tir.match_buffer(b, [128, 128]) + C = tir.match_buffer(c, [128, 128]) + + with tir.block([]) as []: + with tir.block([128, 128]) as [vi, vj]: + B[vi, vj] = A[vi, vj] + tir.float32(1) + + with tir.block([128, 128]) as [vi, vj]: + C[vi, vj] = B[vi, vj] + tir.float32(1) + + +def func_with_opaque_block(a: ty.handle, b: ty.handle, c: ty.handle) -> None: + A = tir.match_buffer(a, [128, 128]) + B = tir.match_buffer(b, [128, 128]) + C = tir.match_buffer(c, [128, 128]) + + with tir.block([]) as []: + with tir.block([]) as []: + B[0, 0] = A[0, 0] + tir.float32(1) + + with tir.block([128, 128]) as [vi, vj]: + C[vi, vj] = B[vi, vj] + tir.float32(1) + + +def test_complete_matmul(): + func = matmul + A, B, C = [func.buffer_map[x] for x in func.params] + + block = func.body.block.body.body.body.body.block + assert isinstance(block, tvm.tir.Block) + vi, vj, vk = [x.var for x in block.iter_vars] + access_A = tir.BufferRegion(A, [Range.from_min_extent(vi, 1), Range.from_min_extent(vk, 1)]) + access_B = tir.BufferRegion(B, [Range.from_min_extent(vj, 1), Range.from_min_extent(vk, 1)]) + access_C = tir.BufferRegion(C, [Range.from_min_extent(vi, 1), Range.from_min_extent(vj, 1)]) + tvm.ir.assert_structural_equal(block.reads, [access_C, access_A, access_B]) + tvm.ir.assert_structural_equal(block.writes, [access_C]) + + +def test_complete_matmul_original(): + func = matmul_original + A, B, C = [func.buffer_map[x] for x in func.params] + + block1 = func.body.block.body.body.body[0].block + assert isinstance(block1, tvm.tir.Block) + vi, vj = [x.var for x in block1.iter_vars] + access_C = tir.BufferRegion( + C, [Range.from_min_extent(vi * 4, 4), Range.from_min_extent(vj * 4, 4)] + ) + tvm.ir.assert_structural_equal(block1.reads, []) + tvm.ir.assert_structural_equal(block1.writes, [access_C]) + + block2 = func.body.block.body.body.body[1].body.block + assert isinstance(block2, tvm.tir.Block) + vi, vj, vk = [x.var for x in block2.iter_vars] + access_A = tir.BufferRegion( + A, [Range.from_min_extent(vi * 4, 4), Range.from_min_extent(vk * 4, 4)] + ) + access_B = tir.BufferRegion( + B, [Range.from_min_extent(vj * 4, 4), Range.from_min_extent(vk * 4, 4)] + ) + access_C = tir.BufferRegion( + C, [Range.from_min_extent(vi * 4, 4), 
Range.from_min_extent(vj * 4, 4)] + ) + tvm.ir.assert_structural_equal(block2.reads, [access_C, access_A, access_B]) + tvm.ir.assert_structural_equal(block2.writes, [access_C]) + + +def test_complete_with_root(): + func = elementwise_with_root + A, B, C = [func.buffer_map[x] for x in func.params] + + block1 = func.body.block.body[0].body.body.block + assert isinstance(block1, tvm.tir.Block) + vi, vj = [x.var for x in block1.iter_vars] + + tvm.ir.assert_structural_equal( + block1.reads, + [tir.BufferRegion(A, [Range.from_min_extent(vi, 1), Range.from_min_extent(vj, 1)])], + ) + tvm.ir.assert_structural_equal( + block1.writes, + [tir.BufferRegion(B, [Range.from_min_extent(vi, 1), Range.from_min_extent(vj, 1)])], + ) + + block2 = func.body.block.body[1].body.body.block + assert isinstance(block2, tvm.tir.Block) + vi, vj = [x.var for x in block2.iter_vars] + tvm.ir.assert_structural_equal( + block2.reads, + [tir.BufferRegion(B, [Range.from_min_extent(vi, 1), Range.from_min_extent(vj, 1)])], + ) + tvm.ir.assert_structural_equal( + block2.writes, + [tir.BufferRegion(C, [Range.from_min_extent(vi, 1), Range.from_min_extent(vj, 1)])], + ) + + +def test_complete_opaque_block_error(): + def render(e): + pass + + override_renderer(render) + + try: + from_source(func_with_opaque_block) + except tvm.error.DiagnosticError: + return + assert False + + +if __name__ == "__main__": + test_complete_matmul() + test_complete_matmul_original() + test_complete_with_root() + test_complete_opaque_block_error() diff --git a/tests/python/unittest/test_tvmscript_roundtrip.py b/tests/python/unittest/test_tvmscript_roundtrip.py index a295908afa6a8..bd36b79d7f4eb 100644 --- a/tests/python/unittest/test_tvmscript_roundtrip.py +++ b/tests/python/unittest/test_tvmscript_roundtrip.py @@ -2819,10 +2819,13 @@ def test_block_elements(): tvm.ir.assert_structural_equal(func, rt_func) assert isinstance(rt_func.body.block, tir.stmt.Block) - assert isinstance(rt_func.body.block.body, tir.stmt.BufferStore) - assert isinstance(rt_func.body.block.init, tir.stmt.BufferStore) - assert len(rt_func.body.block.annotations) == 1 - assert rt_func.body.block.annotations["attr_key"] == "attr_value" + assert isinstance(rt_func.body.block.body, tir.stmt.BlockRealize) + assert isinstance(rt_func.body.block.body.block, tir.stmt.Block) + block = rt_func.body.block.body.block + assert isinstance(block.body, tir.stmt.BufferStore) + assert isinstance(block.init, tir.stmt.BufferStore) + assert len(block.annotations) == 1 + assert block.annotations["attr_key"] == "attr_value" if __name__ == "__main__": From 7130e80204ff727c4947dbb928e0330b0f1d6117 Mon Sep 17 00:00:00 2001 From: Wheest Date: Thu, 25 Mar 2021 03:01:01 +0000 Subject: [PATCH 67/84] Better grouped convolution for CPU targets (#6137) * integrated with v0.8 * Rebase, and undoing accidental removal of auto scheduler NHWC support * Added ASF license header * Minor bug fixes * Added asymmetric padding support Fixed linting * Improve linting * Better linting, disable final linting checks * Fixed final linting errors (figured out how to run lint tests locally) * fixing linter formatting part 1 * fixing linter formatting part 2 * fixing linter formatting part 3 * Update conv2d.py Fixed merge issue * Rebase, and update responding to some comments * Fixed AutoScheduler bug for NHWC case * removed infer_pad from GSPC * Rebase, and undoing accidental removal of auto scheduler NHWC support * Added ASF license header * Minor bug fixes * Added asymmetric padding support Fixed linting * Improve linting * 
Better linting, disable final linting checks * Fixed final linting errors (figured out how to run lint tests locally) * Update conv2d.py Fixed merge issue * Rebase, and update responding to some comments * Fixed AutoScheduler bug for NHWC case * Minor fix * Fixed removal of infer_pad to no padding * Fixed unexpected linting error Co-authored-by: Perry Gibson --- python/tvm/relay/op/strategy/arm_cpu.py | 7 +- python/tvm/relay/op/strategy/x86.py | 8 +- python/tvm/topi/arm_cpu/__init__.py | 1 + python/tvm/topi/arm_cpu/group_conv2d.py | 370 +++++++++++++++++++++++ python/tvm/topi/x86/__init__.py | 1 + python/tvm/topi/x86/group_conv2d.py | 371 ++++++++++++++++++++++++ 6 files changed, 749 insertions(+), 9 deletions(-) create mode 100644 python/tvm/topi/arm_cpu/group_conv2d.py create mode 100644 python/tvm/topi/x86/group_conv2d.py diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index 985124e305ee4..005eae68b8b7a 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -207,11 +207,10 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): else: # group_conv2d if layout == "NCHW": assert kernel_layout == "OIHW" - logger.warning("group_conv2d with layout NCHW is not optimized for arm cpu.") strategy.add_implementation( - wrap_compute_conv2d(topi.nn.group_conv2d_nchw, has_groups=True), - wrap_topi_schedule(topi.generic.schedule_group_conv2d_nchw), - name="group_conv2d_nchw.generic", + wrap_compute_conv2d(topi.arm_cpu.group_conv2d_nchw, has_groups=True), + wrap_topi_schedule(topi.arm_cpu.schedule_group_conv2d_nchw), + name="group_conv2d_nchw.arm_cpu", ) elif layout == "NHWC": assert kernel_layout == "HWIO" diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index 1f37a4f8e98c0..60bd92ef63d1c 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -205,12 +205,10 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target): else: # group_conv2d if layout == "NCHW": assert kernel_layout == "OIHW" - if not is_auto_scheduler_enabled(): - logger.warning("group_conv2d is not optimized for x86 with autotvm.") strategy.add_implementation( - wrap_compute_conv2d(topi.nn.group_conv2d_nchw, has_groups=True), - wrap_topi_schedule(topi.generic.schedule_group_conv2d_nchw), - name="group_conv2d_nchw.generic", + wrap_compute_conv2d(topi.x86.group_conv2d_nchw, has_groups=True), + wrap_topi_schedule(topi.x86.schedule_group_conv2d_nchw), + name="group_conv2d_nchw.x86", ) elif layout == "NHWC": assert kernel_layout == "HWIO" diff --git a/python/tvm/topi/arm_cpu/__init__.py b/python/tvm/topi/arm_cpu/__init__.py index e121fbc7ec6d0..9e2057a7126fb 100644 --- a/python/tvm/topi/arm_cpu/__init__.py +++ b/python/tvm/topi/arm_cpu/__init__.py @@ -26,3 +26,4 @@ from .bitserial_dense import * from .injective import * from . import cortex_m7 +from .group_conv2d import * diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py new file mode 100644 index 0000000000000..d852b9acef660 --- /dev/null +++ b/python/tvm/topi/arm_cpu/group_conv2d.py @@ -0,0 +1,370 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name,unused-variable,unused-argument,no-member +# pylint: disable=no-value-for-parameter,import-outside-toplevel +"""Grouped Spatial Pack Convolution (Group Conv2D) schedule on ARM""" + +import tvm +from tvm import autotvm +from tvm import te +from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity + +from ..utils import get_const_tuple +from ..nn.pad import pad +from .. import tag + +from ..nn.conv2d import _get_workload as _get_conv2d_workload + + +def group_conv2d_nchw(data, kernel, strides, padding, dilation, groups, out_dtype): + """Compute group_conv2d with NCHW layout""" + return group_conv2d_nchw_spatial_pack( + data, kernel, strides, padding, dilation, groups, out_dtype + ) + + +def schedule_group_conv2d_nchw(outs): + """Compute group_conv2d with NCHW layout""" + return schedule_group_conv2d_nchwc(outs) + + +def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype, layout="NCHW"): + """ + Get default schedule config for the workload + """ + static_data_shape = [] + for dim in get_const_tuple(data.shape): + if isinstance(dim, tvm.tir.Var): + static_data_shape.append(1) + else: + static_data_shape.append(dim) + data = te.placeholder(static_data_shape, dtype=data.dtype) + + wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype, layout) + _fallback_schedule(cfg, wkl) + + +def _fallback_schedule(cfg, wkl): + simd_width = 4 # assume ARM SIMD Width is 4 + pad_left, pad_right = wkl.padl, wkl.padr + stride_w = wkl.stride_w + out_width = (wkl.width + pad_left + pad_right - wkl.kernel_w) // stride_w + 1 + groups = wkl.groups + kernels_per_group = wkl.out_filter // groups + kernel_depth = wkl.in_filter // groups + + oc_bn = 1 + + oc_bn = 1 + for bn in range(simd_width, 0, -1): + if kernels_per_group % bn == 0: + oc_bn = bn + break + if oc_bn > kernels_per_group: + oc_bn = kernels_per_group + + ic_bn = 1 + for bn in range(oc_bn, 0, -1): + if kernel_depth % bn == 0: + ic_bn = bn + break + if ic_bn > kernel_depth: + ic_bn = kernel_depth + + reg_n = 1 + for n in range(31, 0, -1): + if out_width % n == 0: + reg_n = n + break + + cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn]) + cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn]) + cfg["tile_ow"] = SplitEntity([out_width // reg_n, reg_n]) + cfg["unroll_kw"] = OtherOptionEntity(False) + + +@autotvm.register_topi_compute("group_conv2d_nchw.arm_cpu") +def group_conv2d_nchw_spatial_pack( + cfg, data, kernel, strides, padding, dilation, groups, out_dtype="float32" +): + """ + Compute group conv2d with NCHW layout, using GSPC algorithm. 
+ https://arxiv.org/abs/2006.09791 + """ + assert isinstance(dilation, int) or len(dilation) == 2 + if isinstance(dilation, int): + dilation_h, dilation_w = dilation, dilation + else: + dilation_h, dilation_w = dilation + + assert isinstance(padding, int) or len(padding) == 2 or len(padding) == 4 + if isinstance(padding, int): + pad_top, pad_left, pad_bottom, pad_right = padding, padding, padding, padding + elif len(padding) == 2: + hpad, wpad = padding + pad_top, pad_bottom = hpad, hpad + pad_left, pad_right = wpad, wpad + else: + pad_top, pad_left, pad_bottom, pad_right = padding + + hpad = pad_top + pad_bottom + wpad = pad_left + pad_right + + assert isinstance(strides, int) or len(strides) == 2 + if isinstance(strides, int): + stride_h, stride_w = strides, strides + else: + stride_h, stride_w = strides + + batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape) + out_channel, kernel_depth, k_height, k_width = get_const_tuple(kernel.shape) + + pad_height = in_height + pad_top + pad_bottom + pad_width = in_width + pad_left + pad_right + + dilated_kernel_h = (k_height - 1) * dilation_h + 1 + dilated_kernel_w = (k_width - 1) * dilation_w + 1 + out_height = (in_height + pad_top + pad_bottom - dilated_kernel_h) // stride_h + 1 + out_width = (in_width + pad_left + pad_right - dilated_kernel_w) // stride_w + 1 + + kernels_per_group = out_channel // groups + + cfg.define_split("tile_ic", in_channel, num_outputs=2) + cfg.define_split("tile_oc", out_channel, num_outputs=2) + cfg.define_split("tile_ow", out_width, num_outputs=2, filter=lambda y: y.size[-1] <= 64) + cfg.define_knob("unroll_kw", [True, False]) + + # If no config was set, we can fallback to default config. + if cfg.is_fallback: + _get_default_config( + cfg, + te.placeholder((batch_size, in_channel, in_height, in_width), dtype=data.dtype), + te.placeholder( + (out_channel, in_channel // groups, k_height, k_width), dtype=kernel.dtype + ), + strides, + padding, + groups, + out_dtype, + ) + + oc_bn = cfg["tile_oc"].size[-1] + ic_bn = cfg["tile_ic"].size[-1] + + # pack data + DOPAD = hpad != 0 or wpad != 0 + if DOPAD: + data_pad = pad( + data, (0, 0, pad_top, pad_left), (0, 0, pad_bottom, pad_right), name="data_pad" + ) + else: + data_pad = data + + shape = (groups, batch_size, kernel_depth // ic_bn, pad_height, ic_bn, pad_width) + + data_vec = te.compute( + shape, + lambda g, n, C, h, c, w: data_pad[n, C * ic_bn + c + kernel_depth * g, h, w], + name="data_vec", + ) + + # pack kernel + shape = ( + groups, + kernels_per_group // oc_bn, + kernel_depth // ic_bn, + k_height, + k_width, + ic_bn, + oc_bn, + ) + + kernel_vec = te.compute( + shape, + lambda g, out_channel, in_channel, h, w, ci, co: kernel[ + (out_channel * oc_bn + co + g * kernels_per_group), in_channel * ic_bn + ci, h, w + ], + name="kernel_vec", + ) + + # convolution + oshape = (groups, batch_size, kernels_per_group // oc_bn, out_height, out_width, oc_bn) + unpack_shape = (batch_size, out_channel, out_height, out_width) + + ic = te.reduce_axis((0, (kernel_depth)), name="ic") + kh = te.reduce_axis((0, k_height), name="kh") + kw = te.reduce_axis((0, k_width), name="kw") + + idxmod = tvm.tir.indexmod + idxdiv = tvm.tir.indexdiv + + conv = te.compute( + oshape, + lambda g, n, oc_chunk, oh, ow, oc_block: te.sum( + data_vec[ + g, + n, + idxdiv(ic, ic_bn), + oh * stride_h + kh * dilation_h, + idxmod(ic, ic_bn), + ow * stride_w + kw * dilation_w, + ].astype(out_dtype) + * kernel_vec[ + g, oc_chunk, idxdiv(ic, ic_bn), kh, kw, idxmod(ic, ic_bn), oc_block + 
].astype(out_dtype), + axis=[ic, kh, kw], + ), + name="conv", + ) + + unpack = te.compute( + unpack_shape, + lambda n, c, h, w: conv[ + idxdiv(c, kernels_per_group), + n, + idxmod(idxdiv(c, oc_bn), (kernels_per_group // oc_bn)), + h, + w, + idxmod(idxmod(c, oc_bn), kernels_per_group), + ].astype(out_dtype), + name="output_unpack", + tag="group_conv2d_nchw", + ) + + return unpack + + +@autotvm.register_topi_schedule("group_conv2d_nchw.arm_cpu") +def schedule_group_conv2d_nchwc(cfg, outs): + """Create schedule for tensors""" + s = te.create_schedule([x.op for x in outs]) + scheduled_ops = [] + + def traverse(op): + """Traverse operators from computation graph""" + # inline all one-to-one-mapping operators except the last stage (output) + if tag.is_broadcast(op.tag): + if op not in s.outputs: + s[op].compute_inline() + for tensor in op.input_tensors: + if isinstance(tensor.op, tvm.te.ComputeOp) and tensor.op not in scheduled_ops: + traverse(tensor.op) + + if "group_conv2d_nchw" in op.tag: + output = op.output(0) + + if "tile_ic" not in cfg: + return + conv_out = op.input_tensors[0] + kernel_vec = conv_out.op.input_tensors[1] + kernel = kernel_vec.op.input_tensors[0] + if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag: + s[kernel].compute_inline() + data_vec = conv_out.op.input_tensors[0] + data = data_vec.op.input_tensors[0] + data_pad = None + if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag: + data_pad = data + data = data_pad.op.input_tensors[0] + + args = [s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, outs[0]] + _schedule_gspc_nchw(*args) + + scheduled_ops.append(op) + + traverse(outs[0].op) + return s + + +def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, last): + """Schedule GSPC""" + ic_bn, oc_bn, reg_n, unroll_kw = ( + cfg["tile_ic"].size[-1], + cfg["tile_oc"].size[-1], + cfg["tile_ow"].size[-1], + cfg["unroll_kw"].val, + ) + + _, W = data, kernel_vec + A0, A1 = data_pad, data_vec + + # schedule data + if ( + data_pad is not None + and isinstance(data_pad.op, tvm.te.ComputeOp) + and "pad" in data_pad.op.tag + ): + s[A0].compute_inline() + + groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis + + parallel_axis = s[A1].fuse(batch, ic_chunk, ih) + s[A1].parallel(parallel_axis) + + # schedule kernel pack + groups, oc_chunk, ic_chunk, oh, ow, ic_block, oc_block = s[W].op.axis + s[W].reorder(oc_chunk, oh, ic_chunk, ow, ic_block, oc_block) + + if oc_bn > 1: + s[W].vectorize(oc_block) + + parallel_axis = s[W].fuse(groups, oc_chunk, oh) + s[W].parallel(parallel_axis) + + # schedule conv + C, O0, O = conv_out, output, last + CC = s.cache_write(C, "global") + + _, _, oc_chunk, oh, ow, oc_block = s[C].op.axis + + ow_chunk, ow_block = s[C].split(ow, factor=reg_n) + + s[C].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block) + s[C].fuse(oc_chunk, oh) + s[C].vectorize(oc_block) + + groups, batch, oc_chunk, oh, ow, oc_block = s[CC].op.axis + + ic, kh, kw = s[CC].op.reduce_axis + ow_chunk, ow_block = s[CC].split(ow, factor=reg_n) + ic_chunk, ic_block = s[CC].split(ic, factor=ic_bn) + + if unroll_kw: + s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, ic_block, kw, ow_block, oc_block) + s[CC].unroll(kw) + else: + s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, kw, ic_block, ow_block, oc_block) + + parallel_axis = s[CC].fuse(groups, batch, oc_chunk, oh) + s[CC].parallel(parallel_axis) + + s[CC].vectorize(oc_block) + + s[CC].unroll(ow_block) + + if O0 != O: + s[O0].compute_inline() + + batch, oc, oh, 
ow = s[O].op.axis + ow_chunk, ow_block = s[O].split(ow, factor=reg_n) + oc_chunk, oc_block = s[O].split(oc, factor=oc_bn) + + s[O].reorder(batch, oc_chunk, oh, ow_chunk, ow_block, oc_block) + parallel_axis = s[O].fuse(oc_chunk, oh) + s[O].vectorize(oc_block) + s[O].parallel(parallel_axis) + return s diff --git a/python/tvm/topi/x86/__init__.py b/python/tvm/topi/x86/__init__.py index bb6a7cdd4122e..d1bd58dd48317 100644 --- a/python/tvm/topi/x86/__init__.py +++ b/python/tvm/topi/x86/__init__.py @@ -41,3 +41,4 @@ from .conv2d_alter_op import * from .dense_alter_op import * from .scatter import * +from .group_conv2d import * diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py new file mode 100644 index 0000000000000..0501c5534cf2e --- /dev/null +++ b/python/tvm/topi/x86/group_conv2d.py @@ -0,0 +1,371 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name,unused-variable,unused-argument,no-member +# pylint: disable=no-value-for-parameter,import-outside-toplevel +"""Grouped Spatial Pack Convolution (Group Conv2D) schedule on x86""" + +import tvm +from tvm import autotvm +from tvm import te +from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity + +from .utils import get_fp32_len +from ..utils import get_const_tuple +from ..nn.pad import pad +from .. 
import tag + +from ..nn.conv2d import _get_workload as _get_conv2d_workload + + +def group_conv2d_nchw(data, kernel, strides, padding, dilation, groups, out_dtype): + """Compute group_conv2d with NCHW layout""" + return group_conv2d_nchw_spatial_pack( + data, kernel, strides, padding, dilation, groups, out_dtype + ) + + +def schedule_group_conv2d_nchw(outs): + """Compute group_conv2d with NCHW layout""" + return schedule_group_conv2d_nchwc(outs) + + +def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype, layout="NCHW"): + """ + Get default schedule config for the workload + """ + static_data_shape = [] + for dim in get_const_tuple(data.shape): + if isinstance(dim, tvm.tir.Var): + static_data_shape.append(1) + else: + static_data_shape.append(dim) + data = te.placeholder(static_data_shape, dtype=data.dtype) + + wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype, layout) + _fallback_schedule(cfg, wkl) + + +def _fallback_schedule(cfg, wkl): + simd_width = get_fp32_len() + pad_left, pad_right = wkl.padl, wkl.padr + stride_w = wkl.stride_w + out_width = (wkl.width + pad_left + pad_right - wkl.kernel_w) // stride_w + 1 + groups = wkl.groups + kernels_per_group = wkl.out_filter // groups + kernel_depth = wkl.in_filter // groups + + oc_bn = 1 + + oc_bn = 1 + for bn in range(simd_width, 0, -1): + if kernels_per_group % bn == 0: + oc_bn = bn + break + if oc_bn > kernels_per_group: + oc_bn = kernels_per_group + + ic_bn = 1 + for bn in range(oc_bn, 0, -1): + if kernel_depth % bn == 0: + ic_bn = bn + break + if ic_bn > kernel_depth: + ic_bn = kernel_depth + + reg_n = 1 + for n in range(31, 0, -1): + if out_width % n == 0: + reg_n = n + break + + cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn]) + cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn]) + cfg["tile_ow"] = SplitEntity([out_width // reg_n, reg_n]) + cfg["unroll_kw"] = OtherOptionEntity(False) + + +@autotvm.register_topi_compute("group_conv2d_nchw.x86") +def group_conv2d_nchw_spatial_pack( + cfg, data, kernel, strides, padding, dilation, groups, out_dtype="float32" +): + """ + Compute group conv2d with NCHW layout, using GSPC algorithm. 
+ https://arxiv.org/abs/2006.09791 + """ + assert isinstance(dilation, int) or len(dilation) == 2 + if isinstance(dilation, int): + dilation_h, dilation_w = dilation, dilation + else: + dilation_h, dilation_w = dilation + + assert isinstance(padding, int) or len(padding) == 2 or len(padding) == 4 + if isinstance(padding, int): + pad_top, pad_left, pad_bottom, pad_right = padding, padding, padding, padding + elif len(padding) == 2: + hpad, wpad = padding + pad_top, pad_bottom = hpad, hpad + pad_left, pad_right = wpad, wpad + else: + pad_top, pad_left, pad_bottom, pad_right = padding + + hpad = pad_top + pad_bottom + wpad = pad_left + pad_right + + assert isinstance(strides, int) or len(strides) == 2 + if isinstance(strides, int): + stride_h, stride_w = strides, strides + else: + stride_h, stride_w = strides + + batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape) + out_channel, kernel_depth, k_height, k_width = get_const_tuple(kernel.shape) + + pad_height = in_height + pad_top + pad_bottom + pad_width = in_width + pad_left + pad_right + + dilated_kernel_h = (k_height - 1) * dilation_h + 1 + dilated_kernel_w = (k_width - 1) * dilation_w + 1 + out_height = (in_height + pad_top + pad_bottom - dilated_kernel_h) // stride_h + 1 + out_width = (in_width + pad_left + pad_right - dilated_kernel_w) // stride_w + 1 + + kernels_per_group = out_channel // groups + + cfg.define_split("tile_ic", in_channel, num_outputs=2) + cfg.define_split("tile_oc", out_channel, num_outputs=2) + cfg.define_split("tile_ow", out_width, num_outputs=2, filter=lambda y: y.size[-1] <= 64) + cfg.define_knob("unroll_kw", [True, False]) + + # If no config was set, we can fallback to default config. + if cfg.is_fallback: + _get_default_config( + cfg, + te.placeholder((batch_size, in_channel, in_height, in_width), dtype=data.dtype), + te.placeholder( + (out_channel, in_channel // groups, k_height, k_width), dtype=kernel.dtype + ), + strides, + padding, + groups, + out_dtype, + ) + + oc_bn = cfg["tile_oc"].size[-1] + ic_bn = cfg["tile_ic"].size[-1] + + # pack data + DOPAD = hpad != 0 or wpad != 0 + if DOPAD: + data_pad = pad( + data, (0, 0, pad_top, pad_left), (0, 0, pad_bottom, pad_right), name="data_pad" + ) + else: + data_pad = data + + shape = (groups, batch_size, kernel_depth // ic_bn, pad_height, ic_bn, pad_width) + + data_vec = te.compute( + shape, + lambda g, n, C, h, c, w: data_pad[n, C * ic_bn + c + kernel_depth * g, h, w], + name="data_vec", + ) + + # pack kernel + shape = ( + groups, + kernels_per_group // oc_bn, + kernel_depth // ic_bn, + k_height, + k_width, + ic_bn, + oc_bn, + ) + + kernel_vec = te.compute( + shape, + lambda g, out_channel, in_channel, h, w, ci, co: kernel[ + (out_channel * oc_bn + co + g * kernels_per_group), in_channel * ic_bn + ci, h, w + ], + name="kernel_vec", + ) + + # convolution + oshape = (groups, batch_size, kernels_per_group // oc_bn, out_height, out_width, oc_bn) + unpack_shape = (batch_size, out_channel, out_height, out_width) + + ic = te.reduce_axis((0, (kernel_depth)), name="ic") + kh = te.reduce_axis((0, k_height), name="kh") + kw = te.reduce_axis((0, k_width), name="kw") + + idxmod = tvm.tir.indexmod + idxdiv = tvm.tir.indexdiv + conv = te.compute( + oshape, + lambda g, n, oc_chunk, oh, ow, oc_block: te.sum( + data_vec[ + g, + n, + idxdiv(ic, ic_bn), + oh * stride_h + kh * dilation_h, + idxmod(ic, ic_bn), + ow * stride_w + kw * dilation_w, + ].astype(out_dtype) + * kernel_vec[ + g, oc_chunk, idxdiv(ic, ic_bn), kh, kw, idxmod(ic, ic_bn), oc_block + 
].astype(out_dtype), + axis=[ic, kh, kw], + ), + name="conv", + ) + + unpack = te.compute( + unpack_shape, + lambda n, c, h, w: conv[ + idxdiv(c, kernels_per_group), + n, + idxmod(idxdiv(c, oc_bn), (kernels_per_group // oc_bn)), + h, + w, + idxmod(idxmod(c, oc_bn), kernels_per_group), + ].astype(out_dtype), + name="output_unpack", + tag="group_conv2d_nchw", + ) + + return unpack + + +@autotvm.register_topi_schedule("group_conv2d_nchw.x86") +def schedule_group_conv2d_nchwc(cfg, outs): + """Create schedule for tensors""" + s = te.create_schedule([x.op for x in outs]) + scheduled_ops = [] + + def traverse(op): + """Traverse operators from computation graph""" + # inline all one-to-one-mapping operators except the last stage (output) + if tag.is_broadcast(op.tag): + if op not in s.outputs: + s[op].compute_inline() + for tensor in op.input_tensors: + if isinstance(tensor.op, tvm.te.ComputeOp) and tensor.op not in scheduled_ops: + traverse(tensor.op) + + if "group_conv2d_nchw" in op.tag: + output = op.output(0) + + if "tile_ic" not in cfg: + return + conv_out = op.input_tensors[0] + kernel_vec = conv_out.op.input_tensors[1] + kernel = kernel_vec.op.input_tensors[0] + if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag: + s[kernel].compute_inline() + data_vec = conv_out.op.input_tensors[0] + data = data_vec.op.input_tensors[0] + data_pad = None + if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag: + data_pad = data + data = data_pad.op.input_tensors[0] + + args = [s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, outs[0]] + _schedule_gspc_nchw(*args) + + scheduled_ops.append(op) + + traverse(outs[0].op) + return s + + +def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, last): + """Schedule GSPC""" + ic_bn, oc_bn, reg_n, unroll_kw = ( + cfg["tile_ic"].size[-1], + cfg["tile_oc"].size[-1], + cfg["tile_ow"].size[-1], + cfg["unroll_kw"].val, + ) + + _, W = data, kernel_vec + A0, A1 = data_pad, data_vec + + # schedule data + if ( + data_pad is not None + and isinstance(data_pad.op, tvm.te.ComputeOp) + and "pad" in data_pad.op.tag + ): + s[A0].compute_inline() + + groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis + + parallel_axis = s[A1].fuse(batch, ic_chunk, ih) + s[A1].parallel(parallel_axis) + + # schedule kernel pack + groups, oc_chunk, ic_chunk, oh, ow, ic_block, oc_block = s[W].op.axis + s[W].reorder(oc_chunk, oh, ic_chunk, ow, ic_block, oc_block) + + if oc_bn > 1: + s[W].vectorize(oc_block) + + parallel_axis = s[W].fuse(groups, oc_chunk, oh) + s[W].parallel(parallel_axis) + + # schedule conv + C, O0, O = conv_out, output, last + CC = s.cache_write(C, "global") + + _, _, oc_chunk, oh, ow, oc_block = s[C].op.axis + + ow_chunk, ow_block = s[C].split(ow, factor=reg_n) + + s[C].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block) + s[C].fuse(oc_chunk, oh) + s[C].vectorize(oc_block) + + groups, batch, oc_chunk, oh, ow, oc_block = s[CC].op.axis + + ic, kh, kw = s[CC].op.reduce_axis + ow_chunk, ow_block = s[CC].split(ow, factor=reg_n) + ic_chunk, ic_block = s[CC].split(ic, factor=ic_bn) + + if unroll_kw: + s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, ic_block, kw, ow_block, oc_block) + s[CC].unroll(kw) + else: + s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, kw, ic_block, ow_block, oc_block) + + parallel_axis = s[CC].fuse(groups, batch, oc_chunk, oh) + + s[CC].parallel(parallel_axis) + + s[CC].vectorize(oc_block) + + s[CC].unroll(ow_block) + + if O0 != O: + s[O0].compute_inline() + + batch, oc, oh, 
ow = s[O].op.axis + ow_chunk, ow_block = s[O].split(ow, factor=reg_n) + oc_chunk, oc_block = s[O].split(oc, factor=oc_bn) + + s[O].reorder(batch, oc_chunk, oh, ow_chunk, ow_block, oc_block) + parallel_axis = s[O].fuse(oc_chunk, oh) + s[O].vectorize(oc_block) + s[O].parallel(parallel_axis) + return s From 8e23806d2d522b71979d0a2730b38cc5c3bf6185 Mon Sep 17 00:00:00 2001 From: AndrewZhaoLuo Date: Wed, 24 Mar 2021 21:25:18 -0700 Subject: [PATCH 68/84] [Topi, Relay] Add cumprod (#7722) * make cumbinop, refactor cumsum, add cumprod * cumsum exclusive test * Add cumprod + flesh out cumsum tests add cumprod and tests reinstate tests rethink * add rudimentary scan implementation * add attributes of cumprod node * add cumprod strategy * add cuda strategy * python relay node construction * change attrs to be reusuable * add cumprod nodes * complete tests * Fix some typos about sum --> prod typos fix sum -> prod more typos more typo fixes more typos add doc strings * Use Bool instead of int to represent exclusive make exclusive a bool up and down stack fix x fix bool err it is a bool now fix fix thing formatting to pass linter lint python cumprod pylint fix attribute fix ordering add exclusivity tests for end to end fix things cuda identity_value * Overall improve formatting, add doc message corrections simplify construction clang-format more tests undo simpler construction due to function passing stuff fix docs more exclusive doc changes more fixins" * merge cumsum and cumprod to scan, merge tests fix stuff * remove other mentions of cumbinop -> scanop * lint formatting Co-authored-by: Andrew Zhao Luo --- include/tvm/relay/attrs/transform.h | 14 +- python/tvm/relay/op/_transform.py | 19 +- python/tvm/relay/op/strategy/cuda.py | 19 +- python/tvm/relay/op/strategy/generic.py | 29 ++- python/tvm/relay/op/transform.py | 65 ++++- python/tvm/topi/__init__.py | 2 +- python/tvm/topi/cuda/scan.py | 196 +++++++++++++-- python/tvm/topi/cumsum.py | 121 ---------- python/tvm/topi/scan.py | 236 +++++++++++++++++++ python/tvm/topi/unique.py | 2 +- src/relay/op/tensor/transform.cc | 34 ++- tests/python/relay/test_op_level3.py | 77 ++++-- tests/python/topi/python/test_topi_cumsum.py | 79 ------- tests/python/topi/python/test_topi_scan.py | 144 +++++++++++ 14 files changed, 758 insertions(+), 279 deletions(-) delete mode 100644 python/tvm/topi/cumsum.py create mode 100644 python/tvm/topi/scan.py delete mode 100644 tests/python/topi/python/test_topi_cumsum.py create mode 100644 tests/python/topi/python/test_topi_scan.py diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h index ff344f5e1a85e..a5544c8a87998 100644 --- a/include/tvm/relay/attrs/transform.h +++ b/include/tvm/relay/attrs/transform.h @@ -438,17 +438,19 @@ struct MatrixSetDiagAttrs : public tvm::AttrsNode { } }; // struct MatrixSetDiagAttrs -/*! \brief Attributes used in cumsum operator */ -struct CumsumAttrs : public tvm::AttrsNode { +/*! 
\brief Attributes used in cumsum and cumprod operator */ +struct ScanopAttrs : public tvm::AttrsNode { Integer axis; DataType dtype; - Integer exclusive; - TVM_DECLARE_ATTRS(CumsumAttrs, "relay.attrs.CumsumAttrs") { - TVM_ATTR_FIELD(axis).describe("The axis to sum over").set_default(NullValue()); + Bool exclusive = Bool(false); + TVM_DECLARE_ATTRS(ScanopAttrs, "relay.attrs.ScanopAttrs") { + TVM_ATTR_FIELD(axis).describe("The axis to operate over").set_default(NullValue()); TVM_ATTR_FIELD(dtype).describe("Output data type").set_default(NullValue()); + + // Default is 0 which is "false" TVM_ATTR_FIELD(exclusive) .describe("The first element is not included") - .set_default(NullValue()); + .set_default(Bool(false)); } }; diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py index e90263d794bce..16262833d1bf9 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -19,16 +19,17 @@ # pylint: disable=too-many-local-variables, too-many-arguments, no-else-return from __future__ import absolute_import + import tvm -from tvm import te -from tvm.te.hybrid import script +from tvm import te, topi from tvm.runtime import convert -from tvm import topi +from tvm.te.hybrid import script from tvm.topi.utils import get_const_int, get_const_tuple + from . import op as _reg from . import strategy -from .op import OpPattern from ._tensor import elemwise_shape_func +from .op import OpPattern _reg.register_broadcast_schedule("broadcast_to") _reg.register_broadcast_schedule("broadcast_to_like") @@ -159,6 +160,16 @@ def compute_cumsum(attrs, inputs, output_type): _reg.register_strategy("cumsum", strategy.cumsum_strategy) _reg.register_shape_func("cumsum", False, elemwise_shape_func) +# cumprod +@_reg.register_compute("cumprod") +def compute_cumprod(attrs, inputs, output_type): + """Compute definition of cumprod""" + return [topi.cumprod(inputs[0], attrs.axis, attrs.dtype, attrs.exclusive)] + + +_reg.register_strategy("cumprod", strategy.cumprod_strategy) +_reg.register_shape_func("cumprod", False, elemwise_shape_func) + @_reg.register_compute("unique") def compute_unique(attrs, inputs, output_type): diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index e0d0f165219e7..1a67425266077 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -18,11 +18,12 @@ # pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import from tvm import topi from tvm.auto_scheduler import is_auto_scheduler_enabled -from tvm.te import SpecializedCondition from tvm.contrib import nvcc from tvm.contrib.thrust import can_use_thrust -from .generic import * +from tvm.te import SpecializedCondition + from .. 
import op as _op +from .generic import * @schedule_injective.register(["cuda", "gpu"]) @@ -1017,13 +1018,25 @@ def cumsum_strategy_cuda(attrs, inputs, out_type, target): """cumsum cuda strategy""" strategy = _op.OpStrategy() strategy.add_implementation( - wrap_compute_cumsum(topi.cuda.cumsum), + wrap_compute_scanop(topi.cuda.cumsum), wrap_topi_schedule(topi.cuda.schedule_scan), name="cumsum.cuda", ) return strategy +@cumprod_strategy.register(["cuda", "gpu"]) +def cumprod_strategy_cuda(attrs, inputs, out_type, target): + """cumprod cuda strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_scanop(topi.cuda.cumprod), + wrap_topi_schedule(topi.cuda.schedule_scan), + name="cumprod.cuda", + ) + return strategy + + @unique_strategy.register(["cuda", "gpu"]) def unique_strategy_cuda(attrs, inputs, out_type, target): """unique cuda strategy""" diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index 04f25640574a5..322a3607904f2 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -17,11 +17,12 @@ """Definition of generic operator strategy.""" # pylint: disable=invalid-name,unused-argument import logging - import re -from tvm import topi, _ffi, te, ir -from tvm.topi.utils import get_const_int, get_const_float, get_const_tuple, get_float_tuple + +from tvm import _ffi, ir, te, topi from tvm.target import generic_func, override_native_generic_func +from tvm.topi.utils import get_const_float, get_const_int, get_const_tuple, get_float_tuple + from .. import op as _op logger = logging.getLogger("strategy") @@ -1463,13 +1464,13 @@ def threefry_split_strategy(attrs, inputs, out_type, target): return strategy -def wrap_compute_cumsum(topi_compute): - """Wrap cumsum topi compute""" +def wrap_compute_scanop(topi_compute): + """Wrap scanop style topi compute""" - def _compute_cumsum(attrs, inputs, _): + def _compute_scanop(attrs, inputs, _): return [topi_compute(inputs[0], attrs.axis, attrs.dtype, attrs.exclusive)] - return _compute_cumsum + return _compute_scanop @override_native_generic_func("cumsum_strategy") @@ -1477,13 +1478,25 @@ def cumsum_strategy(attrs, inputs, out_type, target): """cumsum generic strategy""" strategy = _op.OpStrategy() strategy.add_implementation( - wrap_compute_cumsum(topi.cumsum), + wrap_compute_scanop(topi.cumsum), wrap_topi_schedule(topi.generic.schedule_extern), name="cumsum.generic", ) return strategy +@override_native_generic_func("cumprod_strategy") +def cumprod_strategy(attrs, inputs, out_type, target): + """cumprod generic strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_scanop(topi.cumprod), + wrap_topi_schedule(topi.generic.schedule_extern), + name="cumprod.generic", + ) + return strategy + + def wrap_compute_unique(topi_compute): """Wrap unique topi compute""" diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py index df0ae767460a9..f94a00db2fb12 100644 --- a/python/tvm/relay/op/transform.py +++ b/python/tvm/relay/op/transform.py @@ -18,11 +18,11 @@ # pylint: disable=import-outside-toplevel """Transform operators.""" +from ...tir import expr as _expr +from ..expr import Constant, Expr, Tuple, TupleWrapper, const from . 
import _make from .dyn import _make as _dyn_make from .tensor import shape_of -from ..expr import TupleWrapper, const, Constant, Expr, Tuple -from ...tir import expr as _expr def cast(data, dtype): @@ -1539,9 +1539,9 @@ def cumsum(data, axis=None, dtype=None, exclusive=None): Type of the returned array and of the accumulator in which the elements are summed. If dtype is not specified, it defaults to the dtype of data. - exclusive : int, optional - If set to 1 will return exclusive sum in which the first element is not - included. In other terms, if set to 1, the j-th output element would be + exclusive : bool, optional + If true will return exclusive sum in which the first element is not + included. In other terms, if true, the j-th output element would be the sum of the first (j-1) elements. Otherwise, it would be the sum of the first j elements. @@ -1577,6 +1577,61 @@ def cumsum(data, axis=None, dtype=None, exclusive=None): return _make.cumsum(data, axis, dtype, exclusive) +def cumprod(data, axis=None, dtype=None, exclusive=None): + """Numpy style cumprod op. Return the cumulative inclusive product of the elements along + a given axis. + + Parameters + ---------- + data : relay.Expr + The input data to the operator. + + axis : int, optional + Axis along which the cumulative product is computed. The default (None) is to compute + the cumprod over the flattened array. + + dtype : string, optional + Type of the returned array and of the accumulator in which the elements are multiplied. + If dtype is not specified, it defaults to the dtype of data. + + exclusive : bool, optional + If true will return exclusive product in which the first element is not + included. In other terms, if true, the j-th output element would be + the product of the first (j-1) elements. Otherwise, it would be the product of + the first j elements. The product of zero elements will be 1. + + Returns + ------- + result : relay.Expr + The result has the same size as data, and the same shape as data if axis is not None. + If axis is None, the result is a 1-d array. + + Examples + -------- + .. code-block:: python + a = [[1,2,3], [4,5,6]] + + cumprod(a) # if axis is not provided, cumprod is done over the flattened input. + -> [ 1, 2, 6, 24, 120, 720] + + cumprod(a, dtype="float32") + -> [ 1., 2., 6., 24., 120., 720.] + + cumprod(a, axis=0) # multiply over rows for each of the 3 columns + -> [[1, 2, 3], + [4, 10, 18]] + + cumprod(a, axis=1) + -> [[ 1, 2, 6], + [ 4, 20, 120]] + + a = [1, 1, 1, 0, 1, 1, 0] # a is a boolean array + cumprod(a, dtype=int32) # dtype should be provided to get the expected results + -> [1, 1, 1, 0, 0, 0, 0] + """ + return _make.cumprod(data, axis, dtype, exclusive) + + def unique(data, is_sorted=True, return_counts=False): """ Find the unique elements of a 1-D tensor. Please note `output` and `counts` are all padded to diff --git a/python/tvm/topi/__init__.py b/python/tvm/topi/__init__.py index c196b33cf880c..ef2c5c1ea4b52 100644 --- a/python/tvm/topi/__init__.py +++ b/python/tvm/topi/__init__.py @@ -42,7 +42,7 @@ from .sparse_reshape import * from .scatter_add import * from .argwhere import * -from .cumsum import * +from .scan import * from .einsum import * from .unique import * from . import generic diff --git a/python/tvm/topi/cuda/scan.py b/python/tvm/topi/cuda/scan.py index 84ab5dcf97567..3240ebcd515c8 100644 --- a/python/tvm/topi/cuda/scan.py +++ b/python/tvm/topi/cuda/scan.py @@ -16,13 +16,16 @@ # under the License. 
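# The changes to this file generalize the CUDA scan kernels from a hard-coded
# sum to any associative binop with an explicit identity_value, so a single
# code path backs both cumsum (tvm.tir.generic.add, identity 0) and cumprod
# (multiply, identity 1). For one row [a, b, c] under an operator "op", the
# exclusive scan written back is [identity, a, a op b], with the row total
# a op b op c optionally returned as a separate reduction, while the
# inclusive scan is [a, a op b, a op b op c].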
# pylint: disable=invalid-name, too-many-locals, too-many-statements "Scan related operators" +from typing import Callable, Optional, Union + import tvm from tvm import te -from tvm.contrib.thrust import can_use_thrust, can_use_rocthrust -from ..transform import expand_dims, squeeze, transpose, reshape -from ..utils import ceil_div, swap, prod, get_const_int -from ..math import cast +from tvm.contrib.thrust import can_use_rocthrust, can_use_thrust + from .. import tag +from ..math import cast +from ..transform import expand_dims, reshape, squeeze, transpose +from ..utils import ceil_div, get_const_int, prod, swap from .injective import schedule_injective_from_existing @@ -32,7 +35,7 @@ def _get_thrust_func_name(tvmop): return tvmop_to_thrust_func_name[tvmop] -def exclusive_scan_ir(data, output, reduction=None, binop=tvm.tir.generic.add): +def exclusive_scan_ir(data, output, reduction=None, binop=tvm.tir.generic.add, identity_value=0): """Low level IR to do exclusive sum scan along rows of 2D input. Parameters @@ -50,6 +53,11 @@ def exclusive_scan_ir(data, output, reduction=None, binop=tvm.tir.generic.add): A binary associative op to use for scan. The function takes two TIR expressions and produce a new TIR expression. By default it uses tvm.tir.generic.add to compute prefix sum. + + identity_value: int or float + A value for the binary operation which provides the identity property. E.g. if * is + your operator and i is the identity_value then a * i = a for all a in the domain of + your operation. """ batch_size = prod(data.shape[:-1]) @@ -134,7 +142,7 @@ def exclusive_scan_ir(data, output, reduction=None, binop=tvm.tir.generic.add): with ib.if_scope(bx < batch_size): if reduction is not None: reduction[bx] = output[(bx + 1) * scan_axis_size - 1] - output[(bx + 1) * scan_axis_size - 1] = cast(0, out_dtype) + output[(bx + 1) * scan_axis_size - 1] = cast(identity_value, out_dtype) with ib.for_range(0, lim, dtype="int64") as l2_width: width = 2 << (lim - l2_width - 1) @@ -309,7 +317,12 @@ def scan_thrust( def exclusive_scan( - data, axis=-1, return_reduction=False, output_dtype=None, binop=tvm.tir.generic.add + data, + axis=-1, + return_reduction=False, + output_dtype=None, + binop=tvm.tir.generic.add, + identity_value=0, ): """Do exclusive scan on 1D or multidimensional input. @@ -335,6 +348,11 @@ def exclusive_scan( and produce a new TIR expression. By default it uses tvm.tir.generic.add to compute prefix sum. + identity_value: int or float + A value for the binary operation which provides the identity property. E.g. if * is + your operator and i is the identity_value then a * i = a for all a in the domain of + your operation. 
+ Returns ------- output : tvm.te.Tensor @@ -347,9 +365,15 @@ def exclusive_scan( def do_scan(data, output_dtype): target = tvm.target.Target.current() - if target and ( - can_use_thrust(target, "tvm.contrib.thrust.sum_scan") - or can_use_rocthrust(target, "tvm.contrib.thrust.sum_scan") + + # TODO: add support for a prod_scan + if ( + target + and binop == tvm.tir.generic.add + and ( + can_use_thrust(target, "tvm.contrib.thrust.sum_scan") + or can_use_rocthrust(target, "tvm.contrib.thrust.sum_scan") + ) ): return scan_thrust( data, output_dtype, exclusive=True, return_reduction=return_reduction, binop=binop @@ -366,7 +390,9 @@ def do_scan(data, output_dtype): output, reduction = te.extern( [data.shape, data.shape[:-1]], [data], - lambda ins, outs: exclusive_scan_ir(ins[0], outs[0], outs[1], binop=binop), + lambda ins, outs: exclusive_scan_ir( + ins[0], outs[0], outs[1], binop=binop, identity_value=identity_value + ), dtype=[data.dtype, output_dtype], in_buffers=[data_buf], name="exclusive_scan", @@ -376,7 +402,9 @@ def do_scan(data, output_dtype): output = te.extern( [data.shape], [data], - lambda ins, outs: exclusive_scan_ir(ins[0], outs[0], binop=binop), + lambda ins, outs: exclusive_scan_ir( + ins[0], outs[0], binop=binop, identity_value=identity_value + ), dtype=[output_dtype], in_buffers=[data_buf], out_buffers=[output_buf], @@ -423,7 +451,7 @@ def do_scan(data, output_dtype): return output -def inclusive_scan(data, axis=-1, output_dtype=None, binop=tvm.tir.generic.add): +def inclusive_scan(data, axis=-1, output_dtype=None, binop=tvm.tir.generic.add, identity_value=0): """Do inclusive scan on 1D or multidimensional input. Parameters @@ -442,12 +470,19 @@ def inclusive_scan(data, axis=-1, output_dtype=None, binop=tvm.tir.generic.add): and produce a new TIR expression. By default it uses tvm.tir.generic.add to compute prefix sum. + identity_value: int or float + A value for the binary operation which provides the identity property. E.g. if * is + your operator and i is the identity_value then a * i = a for all a in the domain of + your operation. + Returns ------- output : tvm.te.Tensor A N-D tensor of the same rank N as the input data. """ - ex_scan = exclusive_scan(data, axis, output_dtype=output_dtype, binop=binop) + ex_scan = exclusive_scan( + data, axis, output_dtype=output_dtype, binop=binop, identity_value=identity_value + ) if output_dtype is not None and data.dtype != output_dtype and output_dtype != "": data = cast(data, output_dtype) @@ -486,7 +521,74 @@ def traverse(op): return s -def cumsum(data, axis=None, dtype=None, exclusive=None): +def scanop( + data: tvm.te.Tensor, + binop: Callable[["tvm.Expr", "tvm.Expr"], "tvm.Expr"], + identity_value: Union[float, int], + axis: Optional[int] = None, + dtype: Optional[str] = None, + exclusive: Optional[bool] = None, +) -> tvm.te.Tensor: + """Cumulative binary operator (scan) with similar axis behavior as np.cumsum and np.cumprod. + + See cumprod and cumsum for an example of use. + + E.g. if * is your binary operator and the input tensor is [1, 2, 3, 4] the output may be + [1, 1 * 2, 1 * 2 * 3, 1 * 2 * 3 * 4] + + Parameters + ---------- + data : tvm.te.Tensor + The input data to the operator. + + binop: Callable (tvm.Expr, tvm.Expr) -> tvm.Expr + A binary operator which should be associative and commutative. E.g. if * is your + operator then a * (b * c) = (a * b) * c and a * b = b * a + + identity_value: int or float + A value for the binary operation which provides the identity property. E.g. 
if * is + your operator and i is the identity_value then a * i = a for all a in the domain of + your operation. + + axis : int, optional + Axis along which the operation is computed. The default (None) is to compute + the cumulative operation over the flattened array. + + dtype : string, optional + Type of the returned array and of the accumulator in which the elements are computed. + If dtype is not specified, it defaults to the dtype of data. + + exclusive : bool, optional + If true will return exclusive cumulative operation in which the first element is not + included. In other terms, if true, the j-th output element would be + the cumulative operation of the first (j-1) elements. Otherwise, it would be the + cumulative operation of the first j elements. + + Returns + ------- + result : tvm.te.Tensor + The result has the same size as data, and the same shape as data if axis is not None. + If axis is None, the result is a 1-d array. + """ + if axis is None: + axis = 0 + data = reshape(data, (prod(data.shape),)) + axis = get_const_int(axis) + if exclusive is not None and exclusive: + return exclusive_scan( + data, axis, output_dtype=dtype, binop=binop, identity_value=identity_value + ) + return inclusive_scan( + data, axis, output_dtype=dtype, binop=binop, identity_value=identity_value + ) + + +def cumsum( + data: tvm.te.Tensor, + axis: Optional[int] = None, + dtype: Optional[int] = None, + exclusive: Optional[bool] = None, +) -> tvm.te.Tensor: """Numpy style cumsum op. Return the cumulative sum of the elements along a given axis. Parameters @@ -502,9 +604,9 @@ def cumsum(data, axis=None, dtype=None, exclusive=None): Type of the returned array and of the accumulator in which the elements are summed. If dtype is not specified, it defaults to the dtype of data. - exclusive : int, optional - If set to 1 will return exclusive sum in which the first element is not - included. In other terms, if set to 1, the j-th output element would be + exclusive : bool, optional + If true will return exclusive sum in which the first element is not + included. In other terms, if true, the j-th output element would be the sum of the first (j-1) elements. Otherwise, it would be the sum of the first j elements. @@ -514,10 +616,54 @@ def cumsum(data, axis=None, dtype=None, exclusive=None): The result has the same size as data, and the same shape as data if axis is not None. If axis is None, the result is a 1-d array. """ - if axis is None: - axis = 0 - data = reshape(data, (prod(data.shape),)) - axis = get_const_int(axis) - if exclusive is not None and exclusive != 0: - return exclusive_scan(data, axis, output_dtype=dtype, binop=tvm.tir.generic.add) - return inclusive_scan(data, axis, output_dtype=dtype, binop=tvm.tir.generic.add) + return scanop( + data=data, + binop=tvm.tir.generic.add, + identity_value=0, + axis=axis, + dtype=dtype, + exclusive=exclusive, + ) + + +def cumprod( + data: tvm.te.Tensor, + axis: Optional[int] = None, + dtype: Optional[int] = None, + exclusive: Optional[bool] = None, +): + """Numpy style cumprod op. Return the cumulative product of the elements along a given axis. + + Parameters + ---------- + data : tvm.te.Tensor + The input data to the operator. + + axis : int, optional + Axis along which the cumulative product is computed. The default (None) is to compute + the cumproduct over the flattened array. + + dtype : string, optional + Type of the returned array and of the accumulator in which the elements are multiplied. + If dtype is not specified, it defaults to the dtype of data. 
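[Editor's note, not part of the patch: both wrappers reduce to a single scanop call, differing only in the (binop, identity) pair. A hedged NumPy equivalence check for the inclusive case, using ufunc.accumulate as the reference semantics.]

.. code-block:: python

    import numpy as np

    def scan_ref(a, ufunc, axis=None):
        if axis is None:
            a = a.reshape(-1)  # scanop flattens when axis is None
            axis = 0
        return ufunc.accumulate(a, axis=axis)

    x = np.arange(1, 7).reshape(2, 3)
    assert (scan_ref(x, np.add, axis=1) == np.cumsum(x, axis=1)).all()
    assert (scan_ref(x, np.multiply, axis=1) == np.cumprod(x, axis=1)).all()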
+ + exclusive : bool, optional + If True, will return exclusive product in which the first element is not + included. In other terms, if True, the j-th output element would be + the product of the first (j-1) elements. Otherwise, it would be the product of + the first j elements. + + Returns + ------- + result : tvm.te.Tensor + The result has the same size as data, and the same shape as data if axis is not None. + If axis is None, the result is a 1-d array. + """ + return scanop( + data=data, + binop=tvm.tir.generic.multiply, + identity_value=1, + axis=axis, + dtype=dtype, + exclusive=exclusive, + ) diff --git a/python/tvm/topi/cumsum.py b/python/tvm/topi/cumsum.py deleted file mode 100644 index 2013a352874d7..0000000000000 --- a/python/tvm/topi/cumsum.py +++ /dev/null @@ -1,121 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=invalid-name -"""Cumsum operator""" -from ..tir import decl_buffer, ir_builder -from ..te import extern -from .utils import prod, get_const_int -from .math import cast - - -def cumsum(data, axis=None, dtype=None, exclusive=None): - """Numpy style cumsum op. Return the cumulative sum of the elements along a given axis. - - Parameters - ---------- - data : tvm.te.Tensor - The input data to the operator. - - axis : int, optional - Axis along which the cumulative sum is computed. The default (None) is to compute - the cumsum over the flattened array. - - dtype : string, optional - Type of the returned array and of the accumulator in which the elements are summed. - If dtype is not specified, it defaults to the dtype of data. - - exclusive : int, optional - If set to 1 will return exclusive sum in which the first element is not - included. In other terms, if set to 1, the j-th output element would be - the sum of the first (j-1) elements. Otherwise, it would be the sum of - the first j elements. - - Returns - ------- - result : tvm.te.Tensor - The result has the same size as data, and the same shape as data if axis is not None. - If axis is None, the result is a 1-d array. 
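[Editor's note, not part of the patch: the file deleted here is superseded by python/tvm/topi/scan.py below. The row-wise recurrence its IR builder encoded, which scanop generalizes from '+' to any associative binop, reads in pure-Python sketch form:]

.. code-block:: python

    # Inclusive scan along the last axis, one independent row at a time.
    def scan_rows(data, binop):
        out = [list(row) for row in data]
        for row in out:
            for k in range(1, len(row)):
                row[k] = binop(row[k - 1], row[k])
        return out

    print(scan_rows([[1, 2, 3], [4, 5, 6]], lambda a, b: a + b))
    # [[1, 3, 6], [4, 9, 15]]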
- """ - if dtype is None or dtype == "": - dtype = data.dtype - - def maybe_cast(x): - if dtype != data.dtype: - return cast(x, dtype) - return x - - axis_mul_before = 1 - axis_mul_after = 1 - - if axis is None: - axis = 0 - cumsum_axis_len = prod(data.shape) - shape = (cumsum_axis_len,) - else: - if not isinstance(axis, int): - axis = get_const_int(axis) - - shape = data.shape - cumsum_axis_len = shape[axis] - - if axis < 0: - axis = len(shape) + axis - - for i, value in enumerate(shape, 0): - if i < axis: - axis_mul_before *= value - elif i > axis: - axis_mul_after *= value - - if exclusive is None: - exclusive = 0 - - def gen_ir(data_buf, out_buf): - ib = ir_builder.create() - data_buf = ib.buffer_ptr(data_buf) - out_buf = ib.buffer_ptr(out_buf) - - with ib.for_range(0, axis_mul_before * axis_mul_after, "fused", kind="parallel") as fused: - i = fused // axis_mul_after - j = fused % axis_mul_after - base_idx = i * cumsum_axis_len * axis_mul_after + j - if exclusive == 0: - out_buf[base_idx] = maybe_cast(data_buf[base_idx]) - else: - out_buf[base_idx] = cast(0, dtype) - with ib.for_range(0, cumsum_axis_len - 1, "_k") as _k: - k = _k + 1 - cur_idx = base_idx + k * axis_mul_after - prev_idx = base_idx + (k - 1) * axis_mul_after - if exclusive == 0: - out_buf[cur_idx] = out_buf[prev_idx] + maybe_cast(data_buf[cur_idx]) - else: - out_buf[cur_idx] = out_buf[prev_idx] + maybe_cast(data_buf[prev_idx]) - - return ib.get() - - out_buf = decl_buffer(shape, dtype, "out_buf") - - return extern( - [shape], - [data], - lambda ins, outs: gen_ir(ins[0], outs[0]), - dtype=dtype, - out_buffers=[out_buf], - name="cumsum_generic", - tag="cumsum_generic", - ) diff --git a/python/tvm/topi/scan.py b/python/tvm/topi/scan.py new file mode 100644 index 0000000000000..f5796730f762e --- /dev/null +++ b/python/tvm/topi/scan.py @@ -0,0 +1,236 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name +"""Scan (cumulative binary) operators""" +from typing import Callable, Optional + +import tvm + +from ..te import extern +from ..tir import decl_buffer, generic, ir_builder +from .math import cast +from .utils import get_const_int, prod + + +def scanop( + data: tvm.te.Tensor, + binop: Callable[["tvm.Expr", "tvm.Expr"], "tvm.Expr"], + identity_value: "tvm.Expr", + op_name: str, + axis: Optional[int] = None, + dtype: Optional[str] = None, + exclusive: Optional[bool] = None, +) -> tvm.te.Tensor: + """Cumulative binary operator (scan) with similar axis behavior as np.cumsum and np.cumprod. + + See cumprod and cumsum for an example of use. + + E.g. if * is your binary operator and the input tensor is [1, 2, 3, 4] the output may be + [1, 1 * 2, 1 * 2 * 3, 1 * 2 * 3 * 4] + + Parameters + ---------- + data : tvm.te.Tensor + The input data to the operator. 
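[Editor's note, not part of the patch: the gen_ir deleted above and its replacement later in this file implement the same pair of recurrences. A hedged pure-Python rendering for a single scan row:]

.. code-block:: python

    # exclusive=True seeds with the identity and lags the input by one
    # element; exclusive=False is the familiar inclusive scan.
    def scan_row(row, binop, identity, exclusive):
        out = [identity if exclusive else row[0]]
        for k in range(1, len(row)):
            src = row[k - 1] if exclusive else row[k]
            out.append(binop(out[k - 1], src))
        return out

    assert scan_row([2, 3, 0], lambda a, b: a + b, 0, False) == [2, 5, 5]
    assert scan_row([2, 3, 0], lambda a, b: a + b, 0, True) == [0, 2, 5]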
+ + binop: Callable (tvm.Expr, tvm.Expr) -> tvm.Expr + A binary operator which should be associative and commutative. E.g. if * is your + operator then a * (b * c) = (a * b) * c and a * b = b * a + + identity_value: tvm.Expr + A value for the binary operation which provides the identity property. E.g. if * is + your operator and i is the identity_value then a * i = a for all a in the domain of + your operation. + + axis : int, optional + Axis along which the operation is computed. The default (None) is to compute + the cumulative operation over the flattened array. + + dtype : string, optional + Type of the returned array and of the accumulator in which the elements are computed. + If dtype is not specified, it defaults to the dtype of data. + + exclusive : bool, optional + If True will return exclusive cumulative operation in which the first element is not + included. In other terms, if True, the j-th output element would be + the cumulative operation of the first (j-1) elements. Otherwise, it would be the + cumulative operation of the first j elements. The cumulative operation of zero elements + is assumed to be the identity_value. + + Returns + ------- + result : tvm.te.Tensor + The result has the same size as data, and the same shape as data if axis is not None. + If axis is None, the result is a 1-d array. + """ + if dtype is None or dtype == "": + dtype = data.dtype + + if exclusive is None: + exclusive = False + + def maybe_cast(x): + if dtype != data.dtype: + return cast(x, dtype) + return x + + axis_mul_before = 1 + axis_mul_after = 1 + + if axis is None: + axis = 0 + cumsum_axis_len = prod(data.shape) + shape = (cumsum_axis_len,) + else: + if not isinstance(axis, int): + axis = get_const_int(axis) + + shape = data.shape + cumsum_axis_len = shape[axis] + + if axis < 0: + axis = len(shape) + axis + + for i, value in enumerate(shape, 0): + if i < axis: + axis_mul_before *= value + elif i > axis: + axis_mul_after *= value + + def gen_ir(data_buf, out_buf): + ib = ir_builder.create() + data_buf = ib.buffer_ptr(data_buf) + out_buf = ib.buffer_ptr(out_buf) + + with ib.for_range(0, axis_mul_before * axis_mul_after, "fused", kind="parallel") as fused: + i = fused // axis_mul_after + j = fused % axis_mul_after + base_idx = i * cumsum_axis_len * axis_mul_after + j + if exclusive: + out_buf[base_idx] = cast(identity_value, dtype) + else: + out_buf[base_idx] = maybe_cast(data_buf[base_idx]) + with ib.for_range(0, cumsum_axis_len - 1, "_k") as _k: + k = _k + 1 + cur_idx = base_idx + k * axis_mul_after + prev_idx = base_idx + (k - 1) * axis_mul_after + if exclusive: + out_buf[cur_idx] = binop(out_buf[prev_idx], maybe_cast(data_buf[prev_idx])) + else: + out_buf[cur_idx] = binop(out_buf[prev_idx], maybe_cast(data_buf[cur_idx])) + + return ib.get() + + out_buf = decl_buffer(shape, dtype, "out_buf") + + return extern( + [shape], + [data], + lambda ins, outs: gen_ir(ins[0], outs[0]), + dtype=dtype, + out_buffers=[out_buf], + name=op_name, + tag=op_name, + ) + + +def cumsum( + data: tvm.te.Tensor, + axis: Optional[int] = None, + dtype: Optional[int] = None, + exclusive: Optional[bool] = None, +) -> tvm.te.Tensor: + """Numpy style cumsum op. Return the cumulative sum of the elements along a given axis. + + Parameters + ---------- + data : tvm.te.Tensor + The input data to the operator. + + axis : int, optional + Axis along which the cumulative sum is computed. The default (None) is to compute + the cumsum over the flattened array. 
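[Editor's note, not part of the patch: the axis_mul_before/axis_mul_after bookkeeping in gen_ir above linearizes an N-D scan into independent rows over a conceptual (before, scan_len, after) reshape. A hedged index sketch of that mapping:]

.. code-block:: python

    import numpy as np

    # Address element (i, k, j) of a tensor viewed as
    # (axis_mul_before, scan_len, axis_mul_after); k walks the scan axis.
    def flat_index(i, k, j, scan_len, axis_mul_after):
        base_idx = i * scan_len * axis_mul_after + j
        return base_idx + k * axis_mul_after

    # Shape (2, 3, 4), scanning axis=1: axis_mul_before=2, axis_mul_after=4.
    x = np.arange(24).reshape(2, 3, 4)
    assert x.flat[flat_index(1, 2, 3, 3, 4)] == x[1, 2, 3]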
+ + dtype : string, optional + Type of the returned array and of the accumulator in which the elements are summed. + If dtype is not specified, it defaults to the dtype of data. + + exclusive : bool, optional + If True, will return exclusive sum in which the first element is not + included. In other terms, if True, the j-th output element would be + the sum of the first (j-1) elements. Otherwise, it would be the sum of + the first j elements. + + Returns + ------- + result : tvm.te.Tensor + The result has the same size as data, and the same shape as data if axis is not None. + If axis is None, the result is a 1-d array. + """ + return scanop( + data=data, + binop=generic.add, + identity_value=0, + op_name="cumsum_generic", + axis=axis, + dtype=dtype, + exclusive=exclusive, + ) + + +def cumprod( + data: tvm.te.Tensor, + axis: Optional[int] = None, + dtype: Optional[int] = None, + exclusive: Optional[bool] = None, +) -> tvm.te.Tensor: + """Numpy style cumprod op. Return the cumulative product of the elements along a given axis. + + Parameters + ---------- + data : tvm.te.Tensor + The input data to the operator. + + axis : int, optional + Axis along which the cumulative product is computed. The default (None) is to compute + the cumproduct over the flattened array. + + dtype : string, optional + Type of the returned array and of the accumulator in which the elements are multiplied. + If dtype is not specified, it defaults to the dtype of data. + + exclusive : bool, optional + If True, will return exclusive product in which the first element is not + included. In other terms, if True, the j-th output element would be + the product of the first (j-1) elements. Otherwise, it would be the product of + the first j elements. + + Returns + ------- + result : tvm.te.Tensor + The result has the same size as data, and the same shape as data if axis is not None. + If axis is None, the result is a 1-d array. 
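[Editor's note, not part of the patch: each wrapper pairs its binop with the algebraic identity that scanop's exclusive path seeds with. A one-line sanity check of those pairings:]

.. code-block:: python

    # add pairs with 0, multiply with 1: binop(a, identity) == a must hold.
    for binop, identity in [(lambda a, b: a + b, 0), (lambda a, b: a * b, 1)]:
        assert binop(42, identity) == 42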
+ """ + return scanop( + data=data, + binop=generic.multiply, + identity_value=1, + op_name="cumprod_generic", + axis=axis, + dtype=dtype, + exclusive=exclusive, + ) diff --git a/python/tvm/topi/unique.py b/python/tvm/topi/unique.py index b4f27b38f65f7..e7256551d7b67 100644 --- a/python/tvm/topi/unique.py +++ b/python/tvm/topi/unique.py @@ -18,7 +18,7 @@ """Unique operator""" from tvm import te, tir from ..te import hybrid -from .cumsum import cumsum +from .scan import cumsum from .sort import sort, argsort diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index b65068bd05066..6fb9f77f99eaf 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -3772,20 +3772,20 @@ RELAY_REGISTER_OP("adv_index") .set_attr("TOpPattern", kInjective) .set_attr("FTVMCompute", AdvIndexCompute); -TVM_REGISTER_NODE_TYPE(CumsumAttrs); +TVM_REGISTER_NODE_TYPE(ScanopAttrs); -bool CumsumRel(const Array& types, int num_inputs, const Attrs& attrs, +bool ScanopRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { // types: [data, output] ICHECK_EQ(types.size(), 2) << "Expects two types, one for the input and another for the output"; const auto* data = types[0].as(); if (data == nullptr) { ICHECK(types[0].as()) - << "cumsum: expect input type to be TensorType but get " << types[0]; + << "Scanop: expect input type to be TensorType but get " << types[0]; return false; } - const auto* param = attrs.as(); + const auto* param = attrs.as(); auto dtype = param->dtype; if (dtype.is_void()) { @@ -3805,8 +3805,8 @@ bool CumsumRel(const Array& types, int num_inputs, const Attrs& attrs, return true; } -Expr MakeCumsum(Expr data, Integer axis, DataType dtype, Integer exclusive) { - auto attrs = make_object(); +Expr MakeCumsum(Expr data, Integer axis, DataType dtype, Bool exclusive) { + auto attrs = make_object(); attrs->dtype = dtype; attrs->axis = axis; attrs->exclusive = exclusive; @@ -3822,7 +3822,27 @@ RELAY_REGISTER_OP("cumsum") .set_num_inputs(1) .add_argument("data", "Tensor", "The input tensor.") .set_support_level(3) - .add_type_rel("Cumsum", CumsumRel) + .add_type_rel("Cumsum", ScanopRel) + .set_attr("TOpPattern", kOpaque); + +Expr MakeCumprod(Expr data, Integer axis, DataType dtype, Bool exclusive) { + auto attrs = make_object(); + attrs->dtype = dtype; + attrs->axis = axis; + attrs->exclusive = exclusive; + static const Op& op = Op::Get("cumprod"); + return Call(op, {data}, Attrs(attrs), {}); +} + +TVM_REGISTER_GLOBAL("relay.op._make.cumprod").set_body_typed(MakeCumprod); + +RELAY_REGISTER_OP("cumprod") + .describe( + R"doc(Return the cumulative product of the elements along a given axis.)doc" TVM_ADD_FILELINE) + .set_num_inputs(1) + .add_argument("data", "Tensor", "The input tensor.") + .set_support_level(3) + .add_type_rel("Cumprod", ScanopRel) .set_attr("TOpPattern", kOpaque); TVM_REGISTER_NODE_TYPE(UniqueAttrs); diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py index d2a5090943c35..7e443aa62ee89 100644 --- a/tests/python/relay/test_op_level3.py +++ b/tests/python/relay/test_op_level3.py @@ -16,16 +16,16 @@ # under the License. """ Support level3 operator test cases. 
""" +from typing import Callable, Optional + import numpy as np import pytest import tvm -from tvm import te -from tvm import relay +import tvm.testing +from tvm import relay, te from tvm.error import TVMError from tvm.relay import create_executor, transform from tvm.relay.testing import check_grad, run_infer_type -from typing import Optional -import tvm.testing def test_zeros_ones(): @@ -1758,12 +1758,27 @@ def verify_adv_index(data_shape, index_shapes): verify_adv_index((10, 5, 15), [(1, 2, 1), (1, 2, 7)]) -@tvm.testing.parametrize_targets -def test_cumsum(target, ctx): - def verify_cumsum(data_np, np_out, axis=None, out_dtype=None, rtol=1e-5, atol=1e-5): +# Helper for testing binop functions +scanops_supported = {"cumsum": relay.op.cumsum, "cumprod": relay.op.cumprod} + + +def run_binop_tests( + target, ctx, binop_type: str, gt_func: Callable[..., np.array], identity_value: int +): + def assert_relay_scanop( + data_np: np.array, + np_out: np.array, + axis: int = None, + out_dtype: str = None, + rtol: float = 1e-5, + atol: float = 1e-5, + exclusive: bool = False, + ): inp = relay.var("data", relay.TensorType(data_np.shape, str(data_np.dtype))) - out = relay.op.cumsum(inp, axis, out_dtype) + if binop_type not in scanops_supported.keys(): + raise ValueError(f"Unknown function {binop_type}. Options: {scanops_supported.keys()}") + out = scanops_supported[binop_type](inp, axis, out_dtype, exclusive=exclusive) func = relay.Function([inp], out) for kind in ["graph", "debug"]: @@ -1772,24 +1787,48 @@ def verify_cumsum(data_np, np_out, axis=None, out_dtype=None, rtol=1e-5, atol=1e tvm.testing.assert_allclose(op_res.asnumpy(), np_out, rtol=rtol, atol=atol) data = np.array([2, 3, 0]) - verify_cumsum(data, np.cumsum(data)) - verify_cumsum(data, np.cumsum(data), out_dtype="int64") + assert_relay_scanop(data, gt_func(data)) + assert_relay_scanop(data, gt_func(data), out_dtype="int64") data = np.random.randn(10, 10) - verify_cumsum(data, np.cumsum(data)) - verify_cumsum(data, np.cumsum(data, axis=0), axis=0) - verify_cumsum(data, np.cumsum(data, axis=1), axis=1) + assert_relay_scanop(data, gt_func(data)) + assert_relay_scanop(data, gt_func(data, axis=0), axis=0) + assert_relay_scanop(data, gt_func(data, axis=1), axis=1) data = np.random.randn(10, 5, 10).astype("float32") - verify_cumsum(data, np.cumsum(data), rtol=1e-4, atol=1e-4) - verify_cumsum(data, np.cumsum(data, axis=0), axis=0, rtol=1e-4, atol=1e-4) - verify_cumsum(data, np.cumsum(data, axis=1), axis=1, rtol=1e-4, atol=1e-4) - verify_cumsum(data, np.cumsum(data, axis=-1), axis=-1, rtol=1e-4, atol=1e-4) + assert_relay_scanop(data, gt_func(data), rtol=1e-4, atol=1e-4) + assert_relay_scanop(data, gt_func(data, axis=0), axis=0, rtol=1e-4, atol=1e-4) + assert_relay_scanop(data, gt_func(data, axis=1), axis=1, rtol=1e-4, atol=1e-4) + assert_relay_scanop(data, gt_func(data, axis=-1), axis=-1, rtol=1e-4, atol=1e-4) data = np.random.rand(10) > 0.5 data = data.astype(np.int32) - verify_cumsum(data, np.cumsum(data, dtype=np.int32)) - verify_cumsum(data, np.cumsum(data, dtype="int64"), out_dtype="int64") + assert_relay_scanop(data, gt_func(data, dtype=np.int32)) + assert_relay_scanop(data, gt_func(data, dtype="int64"), out_dtype="int64") + + # Test exclusivity operations + data = np.random.randint(-100, 100, size=(10, 10)).astype("int64") + expected_result = np.roll(gt_func(data), 1) + expected_result[0] = identity_value + assert_relay_scanop(data, expected_result, exclusive=True) + + expected_result = np.roll(gt_func(data, axis=0), 1, axis=0) + 
expected_result[0, :] = identity_value + assert_relay_scanop(data, expected_result, exclusive=True, axis=0) + + expected_result = np.roll(gt_func(data, axis=1), 1, axis=1) + expected_result[:, 0] = identity_value + assert_relay_scanop(data, expected_result, exclusive=True, axis=1) + + +@tvm.testing.parametrize_targets +def test_cumsum(target, ctx): + run_binop_tests(target, ctx, binop_type="cumsum", gt_func=np.cumsum, identity_value=0) + + +@tvm.testing.parametrize_targets +def test_cumprod(target, ctx): + run_binop_tests(target, ctx, binop_type="cumprod", gt_func=np.cumprod, identity_value=1) @tvm.testing.parametrize_targets diff --git a/tests/python/topi/python/test_topi_cumsum.py b/tests/python/topi/python/test_topi_cumsum.py deleted file mode 100644 index cfe5130643c52..0000000000000 --- a/tests/python/topi/python/test_topi_cumsum.py +++ /dev/null @@ -1,79 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -import numpy as np -import tvm -import tvm.testing -from tvm import topi -import tvm.topi.testing - - -@tvm.testing.parametrize_targets -def test_cumsum(ctx, target): - def check_cumsum(np_ref, data, axis=None, dtype=None): - implementations = { - "generic": (lambda x: topi.cumsum(x, axis, dtype), topi.generic.schedule_extern), - "cuda": (lambda x: topi.cuda.cumsum(x, axis, dtype), topi.cuda.schedule_scan), - "nvptx": (lambda x: topi.cuda.cumsum(x, axis, dtype), topi.cuda.schedule_scan), - "vulkan": (lambda x: topi.cuda.cumsum(x, axis, dtype), topi.cuda.schedule_scan), - "metal": (lambda x: topi.cuda.cumsum(x, axis, dtype), topi.cuda.schedule_scan), - } - fcompute, fschedule = tvm.topi.testing.dispatch(target, implementations) - tvm.topi.testing.compare_numpy_tvm([data], np_ref, target, ctx, fcompute, fschedule) - - data = np.array([2, 3, 0]) - check_cumsum(np.cumsum(data), data) - - data = np.random.rand(10) > 0.5 - data = data.astype(np.int32) - check_cumsum(np.cumsum(data, dtype=np.int32), data) - check_cumsum(np.cumsum(data), data, dtype="int64") - - data = np.random.rand(10) > 0.5 - check_cumsum(np.cumsum(data, dtype=np.int32), data, dtype="int32") - - for in_dtype in ["float32", "float64"]: - if target == "metal" and in_dtype == "float64": - # float64 is not supported in metal - continue - data = np.random.randn(10, 10).astype(in_dtype) - check_cumsum(np.cumsum(data), data) - check_cumsum(np.cumsum(data, axis=0), data, axis=0) - check_cumsum(np.cumsum(data, axis=1), data, axis=1) - - data = np.random.randn(10, 5, 10).astype(in_dtype) - check_cumsum(np.cumsum(data), data) - check_cumsum(np.cumsum(data, axis=0), data, axis=0) - check_cumsum(np.cumsum(data, axis=1), data, axis=1) - check_cumsum(np.cumsum(data, axis=-1), data, axis=-1) - - for in_dtype in ["int32", "int64"]: - data = np.random.randint(-100, 100, size=(100, 
100)).astype(in_dtype) - check_cumsum(np.cumsum(data, dtype=in_dtype), data) - check_cumsum(np.cumsum(data), data, dtype="int64") - check_cumsum(np.cumsum(data, axis=0, dtype=in_dtype), data, axis=0) - check_cumsum(np.cumsum(data, axis=1, dtype=in_dtype), data, axis=1) - - data = np.random.randint(1 << 30, (1 << 31) - 1, size=(100)).astype(in_dtype) - check_cumsum(np.cumsum(data), data, dtype="int64") - - -if __name__ == "__main__": - test_cumsum(tvm.context("cpu"), tvm.target.Target("llvm")) - test_cumsum(tvm.context("cuda"), tvm.target.Target("cuda")) - test_cumsum(tvm.context("nvptx"), tvm.target.Target("nvptx")) - test_cumsum(tvm.context("vulkan"), tvm.target.Target("vulkan")) - test_cumsum(tvm.context("metal"), tvm.target.Target("metal")) diff --git a/tests/python/topi/python/test_topi_scan.py b/tests/python/topi/python/test_topi_scan.py new file mode 100644 index 0000000000000..020fde51659f1 --- /dev/null +++ b/tests/python/topi/python/test_topi_scan.py @@ -0,0 +1,144 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
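[Editor's note, not part of the patch: the test file that follows validates both ops against NumPy, deriving the exclusive references from the inclusive scan by the roll-and-seed construction sketched here for the 2-D, axis=1 case.]

.. code-block:: python

    import numpy as np

    data = np.array([[1, 2], [3, 4]])
    expected = np.roll(np.cumprod(data, axis=1), 1, axis=1)
    expected[:, 0] = 1  # identity of multiplication
    print(expected)  # [[1 1]
                     #  [1 3]]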
+from typing import Callable
+
+import numpy as np
+import tvm
+import tvm.testing
+import tvm.topi.testing
+from tvm import topi
+
+topi_funcs = {
+    "cumsum": {"generic": topi.cumsum, "cuda": topi.cuda.cumsum},
+    "cumprod": {"generic": topi.cumprod, "cuda": topi.cuda.cumprod},
+}
+
+identity_value = {"cumsum": 0, "cumprod": 1}
+
+
+def get_implementations(name, axis, dtype, exclusive):
+    topi_func_generic = topi_funcs[name]["generic"]
+    topi_func_cuda = topi_funcs[name]["cuda"]
+
+    return {
+        "generic": (
+            lambda x: topi_func_generic(x, axis, dtype, exclusive=exclusive),
+            topi.generic.schedule_extern,
+        ),
+        "cuda": (
+            lambda x: topi_func_cuda(x, axis, dtype, exclusive=exclusive),
+            topi.cuda.schedule_scan,
+        ),
+        "nvptx": (
+            lambda x: topi_func_cuda(x, axis, dtype, exclusive=exclusive),
+            topi.cuda.schedule_scan,
+        ),
+        "vulkan": (
+            lambda x: topi_func_cuda(x, axis, dtype, exclusive=exclusive),
+            topi.cuda.schedule_scan,
+        ),
+        "metal": (
+            lambda x: topi_func_cuda(x, axis, dtype, exclusive=exclusive),
+            topi.cuda.schedule_scan,
+        ),
+    }
+
+
+def _run_tests(
+    ctx,
+    target,
+    op_name: str = "cumsum",
+    gt_func: Callable[..., np.ndarray] = np.cumsum,
+):
+    def check_scan(np_ref, data, axis=None, dtype=None, exclusive=False):
+        implementations = get_implementations(op_name, axis, dtype, exclusive)
+        fcompute, fschedule = tvm.topi.testing.dispatch(target, implementations)
+        tvm.topi.testing.compare_numpy_tvm([data], np_ref, target, ctx, fcompute, fschedule)
+
+    data = np.array([2, 3, 0])
+    check_scan(gt_func(data), data)
+
+    data = np.random.rand(10) > 0.5
+    data = data.astype(np.int32)
+    check_scan(gt_func(data, dtype=np.int32), data)
+    check_scan(gt_func(data), data, dtype="int64")
+
+    data = np.random.rand(10) > 0.5
+    check_scan(gt_func(data, dtype=np.int32), data, dtype="int32")
+
+    for in_dtype in ["float32", "float64"]:
+        if target == "metal" and in_dtype == "float64":
+            # float64 is not supported in metal
+            continue
+        data = np.random.randn(10, 10).astype(in_dtype)
+        check_scan(gt_func(data), data)
+        check_scan(gt_func(data, axis=0), data, axis=0)
+        check_scan(gt_func(data, axis=1), data, axis=1)
+
+        data = np.random.randn(10, 5, 10).astype(in_dtype)
+        check_scan(gt_func(data), data)
+        check_scan(gt_func(data, axis=0), data, axis=0)
+        check_scan(gt_func(data, axis=1), data, axis=1)
+        check_scan(gt_func(data, axis=-1), data, axis=-1)
+
+    for in_dtype in ["int32", "int64"]:
+        data = np.random.randint(-100, 100, size=(100, 100)).astype(in_dtype)
+        check_scan(gt_func(data, dtype=in_dtype), data)
+        check_scan(gt_func(data), data, dtype="int64")
+        check_scan(gt_func(data, axis=0, dtype=in_dtype), data, axis=0)
+        check_scan(gt_func(data, axis=1, dtype=in_dtype), data, axis=1)
+
+        data = np.random.randint(1 << 30, (1 << 31) - 1, size=(100)).astype(in_dtype)
+        check_scan(gt_func(data), data, dtype="int64")
+
+        data = np.random.randint(-100, 100, size=(100, 100)).astype("int64")
+
+        # Exclusive: roll the inclusive reference and seed with the identity.
+        expected_result = np.roll(gt_func(data), 1)
+        expected_result[0] = identity_value[op_name]
+        check_scan(expected_result, data, dtype="int64", exclusive=True)
+
+        expected_result = np.roll(gt_func(data, axis=0, dtype=in_dtype), 1, axis=0)
+        expected_result[0, :] = identity_value[op_name]
+        check_scan(expected_result, data, axis=0, exclusive=True)
+
+        expected_result = np.roll(gt_func(data, axis=1, dtype=in_dtype), 1, axis=1)
+        expected_result[:, 0] = identity_value[op_name]
+        check_scan(expected_result, data, axis=1, exclusive=True)
+
+
+@tvm.testing.parametrize_targets
+def test_cumsum(ctx, target):
+    
_run_tests(ctx, target, op_name="cumsum", gt_func=np.cumsum) + + +@tvm.testing.parametrize_targets +def test_cumprod(ctx, target): + _run_tests(ctx, target, op_name="cumprod", gt_func=np.cumprod) + + +if __name__ == "__main__": + test_cumsum(tvm.context("cpu"), tvm.target.Target("llvm")) + test_cumsum(tvm.context("cuda"), tvm.target.Target("cuda")) + test_cumsum(tvm.context("nvptx"), tvm.target.Target("nvptx")) + test_cumsum(tvm.context("vulkan"), tvm.target.Target("vulkan")) + test_cumsum(tvm.context("metal"), tvm.target.Target("metal")) + + test_cumprod(tvm.context("cpu"), tvm.target.Target("llvm")) + test_cumprod(tvm.context("cuda"), tvm.target.Target("cuda")) + test_cumprod(tvm.context("nvptx"), tvm.target.Target("nvptx")) + test_cumprod(tvm.context("vulkan"), tvm.target.Target("vulkan")) + test_cumprod(tvm.context("metal"), tvm.target.Target("metal")) From 67a4a8bbdf6576e7a4e8df977b68eeccbb9d5e9b Mon Sep 17 00:00:00 2001 From: mvermeulen <5479696+mvermeulen@users.noreply.github.com> Date: Thu, 25 Mar 2021 03:21:50 -0500 Subject: [PATCH 69/84] Fix missing header, caused compilation failure. (#7740) --- src/runtime/contrib/miopen/conv_forward.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/runtime/contrib/miopen/conv_forward.cc b/src/runtime/contrib/miopen/conv_forward.cc index 1353e2f996bb4..e44a0650024bd 100644 --- a/src/runtime/contrib/miopen/conv_forward.cc +++ b/src/runtime/contrib/miopen/conv_forward.cc @@ -24,6 +24,8 @@ #include #include +#include + #include "miopen_utils.h" namespace tvm { From 8c8971a7e0d27ec9e7ba577d0cc3fb4f2323a46d Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Fri, 26 Mar 2021 01:25:45 -0400 Subject: [PATCH 70/84] [CI] Temp disable rust docs build (#7743) --- tests/scripts/task_python_docs.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/scripts/task_python_docs.sh b/tests/scripts/task_python_docs.sh index 459b680daeb1b..1eb75be830c3b 100755 --- a/tests/scripts/task_python_docs.sh +++ b/tests/scripts/task_python_docs.sh @@ -74,7 +74,8 @@ cd .. # Rust doc cd rust -cargo doc --workspace --no-deps +# Temp disable rust doc build +# cargo doc --workspace --no-deps cd .. # Prepare the doc dir @@ -84,7 +85,7 @@ rm -f _docs/.buildinfo mkdir -p _docs/api mv docs/doxygen/html _docs/api/doxygen mv jvm/core/target/site/apidocs _docs/api/javadoc -mv rust/target/doc _docs/api/rust +# mv rust/target/doc _docs/api/rust mv web/dist/docs _docs/api/typedoc echo "Start creating the docs tarball.." 
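[Editor's note, not part of the patch series: the next patch renames TVMContext to Device across the C++, Python, Java, Rust, Go, and Web bindings. As orientation, a hedged Python sketch of the user-facing effect; the deprecation story for the old names is not shown in this series, so treat the comments as assumptions.]

.. code-block:: python

    import tvm

    # Before the rename (sketch): constructors returned a TVMContext.
    #   ctx = tvm.cpu(0)   # tvm.runtime.TVMContext
    # After: the same constructors return a Device; call sites mostly just
    # rename variables (ctx -> dev), as the Java hunks further down show.
    dev = tvm.cpu(0)  # tvm.runtime.Device
    print(dev.device_type, dev.device_id)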
From fbfeee4ccb8dcab9b8d18bdef7c8dab89074777a Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Fri, 26 Mar 2021 04:56:57 -0700 Subject: [PATCH 71/84] [Refactor] Rename TVMContext to Device (#7721) --- 3rdparty/dlpack | 2 +- .../Camera2BasicFragment.java | 16 +- .../apache/tvm/android/demo/MainActivity.java | 10 +- apps/android_rpc/tests/android_rpc_test.py | 24 +- apps/benchmark/arm_cpu_imagenet_bench.py | 6 +- apps/benchmark/gpu_imagenet_bench.py | 6 +- apps/benchmark/mobile_gpu_imagenet_bench.py | 6 +- apps/bundle_deploy/bundle.c | 16 +- apps/bundle_deploy/bundle_static.c | 16 +- apps/bundle_deploy/demo.cc | 4 +- apps/bundle_deploy/demo_static.c | 8 +- apps/bundle_deploy/test.cc | 4 +- apps/bundle_deploy/test_static.c | 8 +- apps/extension/tests/test_ext.py | 12 +- apps/howto_deploy/cpp_deploy.cc | 8 +- apps/ios_rpc/tests/ios_rpc_mobilenet.py | 10 +- apps/ios_rpc/tests/ios_rpc_test.py | 16 +- .../topi_recipe/conv/depthwise_conv2d_test.py | 44 +- apps/topi_recipe/conv/test_conv2d_hwcn_map.py | 10 +- apps/topi_recipe/conv/test_conv_int8_arm.py | 12 +- apps/topi_recipe/conv/test_conv_int8_intel.py | 12 +- apps/topi_recipe/gemm/android_gemm_square.py | 14 +- apps/topi_recipe/gemm/cuda_gemm_square.py | 12 +- apps/topi_recipe/gemm/gemm_int8.py | 10 +- apps/topi_recipe/reduce/test_reduce_map.py | 4 +- apps/topi_recipe/rnn/lstm.py | 14 +- apps/topi_recipe/rnn/matexp.py | 10 +- apps/wasm-standalone/wasm-graph/src/types.rs | 4 +- docs/contribute/code_guide.rst | 2 +- docs/deploy/arm_compute_lib.rst | 4 +- docs/deploy/bnns.rst | 4 +- docs/deploy/hls.rst | 8 +- docs/deploy/tensorrt.rst | 4 +- docs/deploy/vitis_ai.rst | 4 +- docs/dev/codebase_walkthrough.rst | 8 +- docs/dev/debugger.rst | 2 +- docs/dev/index.rst | 2 +- docs/dev/relay_bring_your_own_codegen.rst | 8 +- golang/sample/simple.go | 2 +- golang/src/array_test.go | 24 +- golang/src/{context.go => device.go} | 68 +- golang/src/ndarray.go | 18 +- golang/src/value.go | 4 +- include/tvm/relay/analysis.h | 8 +- include/tvm/relay/interpreter.h | 4 +- include/tvm/runtime/c_runtime_api.h | 49 +- include/tvm/runtime/crt/graph_runtime.h | 4 +- include/tvm/runtime/crt/memory.h | 8 +- include/tvm/runtime/crt/platform.h | 8 +- include/tvm/runtime/device_api.h | 131 ++-- include/tvm/runtime/ndarray.h | 66 +- include/tvm/runtime/packed_func.h | 28 +- include/tvm/runtime/profiling.h | 16 +- include/tvm/runtime/serializer.h | 16 +- include/tvm/runtime/vm/memory_manager.h | 35 +- include/tvm/runtime/vm/vm.h | 16 +- include/tvm/tir/stmt.h | 6 +- jvm/README.md | 8 +- .../main/java/org/apache/tvm/ArgTypeCode.java | 2 +- .../tvm/{TVMContext.java => Device.java} | 68 +- .../src/main/java/org/apache/tvm/LibInfo.java | 2 +- .../src/main/java/org/apache/tvm/NDArray.java | 32 +- .../org/apache/tvm/contrib/GraphModule.java | 16 +- .../org/apache/tvm/contrib/GraphRuntime.java | 12 +- .../java/org/apache/tvm/rpc/RPCSession.java | 86 +-- ...emoteContext.java => TVMRemoteDevice.java} | 6 +- .../test/java/org/apache/tvm/ModuleTest.java | 14 +- .../apache/tvm/contrib/GraphRuntimeTest.java | 18 +- jvm/native/src/main/native/jni_helper_func.h | 14 +- .../native/org_apache_tvm_native_c_api.cc | 2 +- python/tvm/__init__.py | 2 +- python/tvm/_ffi/_ctypes/packed_func.py | 10 +- python/tvm/_ffi/_ctypes/types.py | 16 +- python/tvm/_ffi/_cython/base.pxi | 10 +- python/tvm/_ffi/_cython/packed_func.pxi | 12 +- python/tvm/_ffi/runtime_ctypes.py | 16 +- python/tvm/auto_scheduler/measure.py | 26 +- python/tvm/autotvm/measure/measure_methods.py | 22 +- python/tvm/contrib/coreml_runtime.py 
| 12 +- .../contrib/cuda_graph/cuda_graph_runtime.py | 12 +- python/tvm/contrib/debugger/debug_runtime.py | 48 +- python/tvm/contrib/graph_runtime.py | 66 +- python/tvm/contrib/peak.py | 48 +- python/tvm/contrib/sparse.py | 16 +- python/tvm/contrib/target/coreml.py | 4 +- python/tvm/contrib/tflite_runtime.py | 16 +- python/tvm/driver/build_module.py | 4 +- python/tvm/driver/tvmc/runner.py | 12 +- python/tvm/micro/session.py | 32 +- python/tvm/relay/analysis/analysis.py | 8 +- .../relay/backend/graph_runtime_codegen.py | 2 +- python/tvm/relay/backend/interpreter.py | 10 +- python/tvm/relay/backend/vm.py | 14 +- python/tvm/relay/build_module.py | 32 +- python/tvm/relay/frontend/common.py | 6 +- python/tvm/relay/op/annotation/annotation.py | 10 +- python/tvm/relay/op/memory/memory.py | 10 +- python/tvm/relay/op/tensor.py | 20 +- python/tvm/relay/quantize/_calibrate.py | 6 +- python/tvm/relay/testing/__init__.py | 4 +- python/tvm/relay/testing/init.py | 2 +- python/tvm/relay/transform/memory_plan.py | 31 +- python/tvm/rpc/client.py | 32 +- python/tvm/runtime/__init__.py | 4 +- python/tvm/runtime/module.py | 10 +- python/tvm/runtime/ndarray.py | 127 ++-- python/tvm/runtime/profiler_vm.py | 6 +- python/tvm/runtime/vm.py | 52 +- python/tvm/testing.py | 14 +- python/tvm/tir/buffer.py | 8 +- python/tvm/topi/cuda/nms.py | 8 +- python/tvm/topi/random/kernel.py | 6 +- python/tvm/topi/sort.py | 6 +- python/tvm/topi/testing/common.py | 8 +- python/tvm/topi/vision/nms.py | 8 +- rust/tvm-graph-rt/src/array.rs | 12 +- rust/tvm-graph-rt/src/graph.rs | 14 +- rust/tvm-rt/src/{context.rs => device.rs} | 14 +- rust/tvm-rt/src/lib.rs | 12 +- rust/tvm-rt/src/ndarray.rs | 68 +- rust/tvm-sys/src/array.rs | 4 +- rust/tvm-sys/src/{context.rs => device.rs} | 112 +-- rust/tvm-sys/src/lib.rs | 4 +- rust/tvm-sys/src/packed_func.rs | 8 +- rust/tvm-sys/src/value.rs | 12 +- rust/tvm/examples/resnet/src/build_resnet.py | 6 +- rust/tvm/examples/resnet/src/main.rs | 8 +- rust/tvm/src/ir/diagnostics/mod.rs | 2 +- rust/tvm/src/lib.rs | 4 +- rust/tvm/src/runtime/graph_rt.rs | 16 +- rust/tvm/tests/basics/src/main.rs | 12 +- rust/tvm/tests/callback/src/bin/array.rs | 2 +- src/auto_scheduler/search_task.cc | 16 +- src/contrib/tf_op/tvm_dso_op_kernels.cc | 10 +- src/node/structural_hash.cc | 6 +- src/parser/parser.cc | 12 +- src/printer/relay_text_printer.cc | 2 +- src/relay/analysis/context_analysis.cc | 86 +-- src/relay/analysis/util.cc | 2 +- src/relay/backend/graph_plan_memory.cc | 2 +- src/relay/backend/interpreter.cc | 42 +- src/relay/backend/vm/compiler.cc | 2 +- src/relay/backend/vm/compiler.h | 2 +- src/relay/op/memory/memory.cc | 6 +- src/relay/op/memory/memory.h | 2 +- src/relay/transforms/fold_constant.cc | 28 +- src/relay/transforms/memory_alloc.cc | 53 +- src/relay/transforms/partial_eval.cc | 20 +- src/runtime/c_runtime_api.cc | 129 ++-- .../contrib/arm_compute_lib/acl_allocator.cc | 8 +- .../contrib/arm_compute_lib/acl_allocator.h | 8 +- src/runtime/contrib/coreml/coreml_runtime.mm | 4 +- src/runtime/contrib/cudnn/conv_forward.cc | 4 +- src/runtime/contrib/cudnn/cudnn_utils.cc | 4 +- src/runtime/contrib/cudnn/cudnn_utils.h | 2 +- .../contrib/edgetpu/edgetpu_runtime.cc | 8 +- src/runtime/contrib/edgetpu/edgetpu_runtime.h | 8 +- src/runtime/contrib/ethosn/ethosn_device.cc | 3 +- src/runtime/contrib/miopen/conv_forward.cc | 20 +- src/runtime/contrib/miopen/miopen_utils.cc | 4 +- src/runtime/contrib/miopen/miopen_utils.h | 2 +- src/runtime/contrib/mps/conv.mm | 20 +- src/runtime/contrib/mps/gemm.mm | 8 +- 
src/runtime/contrib/nnpack/convolution.cc | 16 +- .../contrib/random/mt_random_engine.cc | 6 +- src/runtime/contrib/random/random.cc | 2 +- .../contrib/tensorrt/tensorrt_builder.cc | 4 +- .../contrib/tensorrt/tensorrt_runtime.cc | 6 +- src/runtime/contrib/tflite/tflite_runtime.cc | 10 +- src/runtime/contrib/tflite/tflite_runtime.h | 10 +- src/runtime/cpu_device_api.cc | 25 +- src/runtime/crt/common/crt_backend_api.c | 8 +- src/runtime/crt/common/crt_runtime_api.c | 54 +- src/runtime/crt/common/ndarray.c | 32 +- src/runtime/crt/graph_runtime/graph_runtime.c | 136 ++-- src/runtime/crt/graph_runtime/load_json.c | 20 +- .../graph_runtime_module.c | 4 +- src/runtime/crt/host/main.cc | 8 +- .../tvm/runtime/crt/internal/common/ndarray.h | 4 +- .../internal/graph_runtime/graph_runtime.h | 4 +- src/runtime/crt/memory/memory.c | 6 +- src/runtime/crt/utvm_rpc_server/rpc_server.cc | 6 +- src/runtime/cuda/cuda_device_api.cc | 99 ++- .../cuda_graph/graph_runtime_cuda_graph.cc | 16 +- .../graph/debug/graph_runtime_debug.cc | 22 +- src/runtime/graph/graph_runtime.cc | 62 +- src/runtime/graph/graph_runtime.h | 11 +- src/runtime/graph/graph_runtime_factory.cc | 56 +- src/runtime/graph/graph_runtime_factory.h | 12 +- src/runtime/hexagon/hexagon_device_api.cc | 72 +- src/runtime/hexagon/hexagon_module.cc | 34 +- src/runtime/hexagon/hexagon_module.h | 2 +- src/runtime/metal/metal_common.h | 52 +- src/runtime/metal/metal_device_api.mm | 69 +- src/runtime/metal/metal_module.mm | 6 +- .../micro/standalone/utvm_graph_runtime.cc | 16 +- .../micro/standalone/utvm_graph_runtime.h | 12 +- src/runtime/minrpc/minrpc_server.h | 42 +- src/runtime/minrpc/rpc_reference.h | 16 +- src/runtime/ndarray.cc | 59 +- src/runtime/opencl/aocl/aocl_common.h | 2 +- src/runtime/opencl/aocl/aocl_device_api.cc | 4 +- src/runtime/opencl/opencl_common.h | 42 +- src/runtime/opencl/opencl_device_api.cc | 47 +- src/runtime/opencl/opencl_module.cc | 4 +- src/runtime/opencl/sdaccel/sdaccel_common.h | 2 +- .../opencl/sdaccel/sdaccel_device_api.cc | 4 +- src/runtime/profiling.cc | 20 +- src/runtime/rocm/rocm_device_api.cc | 84 +-- src/runtime/rpc/rpc_device_api.cc | 79 ++- src/runtime/rpc/rpc_endpoint.cc | 82 ++- src/runtime/rpc/rpc_endpoint.h | 4 +- src/runtime/rpc/rpc_local_session.cc | 20 +- src/runtime/rpc/rpc_local_session.h | 2 +- src/runtime/rpc/rpc_module.cc | 72 +- src/runtime/rpc/rpc_session.cc | 4 +- src/runtime/rpc/rpc_session.h | 18 +- src/runtime/stackvm/stackvm.cc | 8 +- src/runtime/vm/memory_manager.cc | 39 +- src/runtime/vm/naive_allocator.h | 10 +- src/runtime/vm/pooled_allocator.h | 12 +- src/runtime/vm/profiler/vm.cc | 9 +- src/runtime/vm/vm.cc | 80 +-- src/runtime/vulkan/vulkan.cc | 79 ++- src/runtime/workspace_pool.cc | 42 +- src/runtime/workspace_pool.h | 8 +- src/support/ffi_testing.cc | 10 +- src/target/llvm/codegen_amdgpu.cc | 12 +- src/target/llvm/codegen_hexagon.cc | 6 +- src/target/llvm/codegen_params.cc | 2 +- src/target/source/codegen_c.cc | 4 +- src/target/target_kind.cc | 2 +- ...hedule_postproc_rewrite_for_tensor_core.cc | 4 +- src/tir/transforms/lower_tvm_builtin.cc | 4 +- src/tir/transforms/make_packed_api.cc | 6 +- tests/cpp/container_test.cc | 2 +- tests/cpp/contrib/bnns.cc | 4 +- tests/cpp/packed_func_test.cc | 4 +- tests/cpp/profiling.cc | 8 +- tests/cpp/relay_build_module_test.cc | 4 +- tests/crt/memory_test.cc | 8 +- tests/micro/qemu/test_zephyr.py | 20 +- tests/micro/qemu/zephyr-runtime/src/main.c | 4 +- .../test_minimal_target_codegen_llvm.py | 14 +- .../test_runtime_ndarray.py | 8 +- 
.../test_runtime_packed_func.py | 12 +- .../contrib/test_bnns/test_onnx_topologies.py | 4 +- tests/python/contrib/test_cblas.py | 24 +- tests/python/contrib/test_coreml_codegen.py | 6 +- tests/python/contrib/test_coreml_runtime.py | 14 +- tests/python/contrib/test_cublas.py | 24 +- tests/python/contrib/test_cudnn.py | 28 +- tests/python/contrib/test_edgetpu_runtime.py | 6 +- .../contrib/test_ethosn/infrastructure.py | 2 +- tests/python/contrib/test_gemm_acc16.py | 10 +- tests/python/contrib/test_gemm_acc32_vnni.py | 10 +- tests/python/contrib/test_miopen.py | 10 +- tests/python/contrib/test_mps.py | 16 +- tests/python/contrib/test_mxnet_bridge.py | 8 +- tests/python/contrib/test_nnpack.py | 28 +- tests/python/contrib/test_onnx.py | 4 +- tests/python/contrib/test_onnx_model.py | 4 +- tests/python/contrib/test_random.py | 20 +- tests/python/contrib/test_rocblas.py | 16 +- tests/python/contrib/test_sort.py | 26 +- tests/python/contrib/test_sparse.py | 24 +- tests/python/contrib/test_tensorrt.py | 36 +- tests/python/contrib/test_tflite_runtime.py | 1 - tests/python/contrib/test_thrust.py | 24 +- .../contrib/test_verilator/infrastructure.py | 4 +- .../contrib/test_vitis_ai/infrastructure.py | 5 +- .../test_vitis_ai_runtime_cpu_part.py | 2 +- tests/python/frontend/caffe/test_forward.py | 4 +- tests/python/frontend/caffe2/test_forward.py | 8 +- tests/python/frontend/coreml/test_forward.py | 102 +-- tests/python/frontend/darknet/test_forward.py | 4 +- tests/python/frontend/keras/test_forward.py | 8 +- tests/python/frontend/mxnet/test_forward.py | 258 +++---- .../frontend/mxnet/test_qnn_ops_utils.py | 4 +- tests/python/frontend/onnx/test_forward.py | 77 +-- tests/python/frontend/pytorch/test_forward.py | 12 +- tests/python/frontend/pytorch/test_lstm.py | 10 +- .../frontend/pytorch/test_object_detection.py | 4 +- .../frontend/tensorflow/test_bn_dynamic.py | 4 +- .../frontend/tensorflow/test_forward.py | 20 +- tests/python/frontend/tflite/test_forward.py | 8 +- tests/python/integration/test_dot.py | 8 +- tests/python/integration/test_ewise.py | 58 +- tests/python/integration/test_ewise_fpga.py | 20 +- tests/python/integration/test_gemm.py | 12 +- tests/python/integration/test_reduce.py | 94 +-- tests/python/integration/test_scan.py | 6 +- tests/python/integration/test_tuning.py | 2 +- .../integration/test_winograd_nnpack.py | 10 +- .../test_quantization_accuracy.py | 6 +- .../python/relay/benchmarking/benchmark_vm.py | 22 +- .../relay/dyn/test_dynamic_op_level10.py | 18 +- .../relay/dyn/test_dynamic_op_level2.py | 8 +- .../relay/dyn/test_dynamic_op_level3.py | 6 +- .../relay/dyn/test_dynamic_op_level4.py | 4 +- .../relay/dyn/test_dynamic_op_level5.py | 4 +- .../relay/dyn/test_dynamic_op_level6.py | 4 +- tests/python/relay/test_adt.py | 4 +- tests/python/relay/test_any.py | 12 +- ..._auto_scheduler_layout_rewrite_networks.py | 4 +- .../relay/test_auto_scheduler_tuning.py | 4 +- .../relay/test_backend_compile_engine.py | 6 +- .../relay/test_backend_graph_runtime.py | 10 +- .../python/relay/test_backend_interpreter.py | 8 +- tests/python/relay/test_cpp_build_module.py | 28 +- tests/python/relay/test_external_codegen.py | 12 +- tests/python/relay/test_json_runtime.py | 16 +- tests/python/relay/test_op_fast_math.py | 6 +- tests/python/relay/test_op_grad_level1.py | 8 +- tests/python/relay/test_op_grad_level2.py | 12 +- tests/python/relay/test_op_grad_level3.py | 8 +- tests/python/relay/test_op_level1.py | 40 +- tests/python/relay/test_op_level10.py | 60 +- tests/python/relay/test_op_level2.py | 114 ++-- 
tests/python/relay/test_op_level3.py | 120 ++-- tests/python/relay/test_op_level4.py | 44 +- tests/python/relay/test_op_level5.py | 88 +-- tests/python/relay/test_op_level6.py | 12 +- tests/python/relay/test_op_qnn_add.py | 12 +- tests/python/relay/test_op_qnn_concatenate.py | 10 +- .../relay/test_op_qnn_conv2_transpose.py | 2 +- tests/python/relay/test_op_qnn_conv2d.py | 8 +- tests/python/relay/test_op_qnn_dense.py | 2 +- tests/python/relay/test_op_qnn_dequantize.py | 6 +- tests/python/relay/test_op_qnn_mul.py | 10 +- tests/python/relay/test_op_qnn_quantize.py | 6 +- tests/python/relay/test_op_qnn_requantize.py | 2 +- .../relay/test_op_qnn_simulated_dequantize.py | 2 +- .../relay/test_op_qnn_simulated_quantize.py | 2 +- tests/python/relay/test_op_qnn_subtract.py | 2 +- tests/python/relay/test_param_dict.py | 6 +- .../python/relay/test_pass_alter_op_layout.py | 6 +- .../python/relay/test_pass_annotate_target.py | 12 +- tests/python/relay/test_pass_annotation.py | 158 ++--- tests/python/relay/test_pass_auto_quantize.py | 4 +- .../relay/test_pass_dynamic_to_static.py | 8 +- .../relay/test_pass_fold_explicit_padding.py | 4 +- tests/python/relay/test_pass_fuse_ops.py | 4 +- tests/python/relay/test_pass_manager.py | 24 +- tests/python/relay/test_pass_partial_eval.py | 4 +- .../python/relay/test_pass_partition_graph.py | 18 +- .../relay/test_pass_to_a_normal_form.py | 8 +- .../test_pass_to_basic_block_normal_form.py | 8 +- .../relay/test_pass_to_graph_normal_form.py | 4 +- tests/python/relay/test_prng.py | 23 +- .../relay/test_simplify_fc_transpose.py | 4 +- .../python/relay/test_sparse_dense_convert.py | 4 +- tests/python/relay/test_tensor_array.py | 7 +- tests/python/relay/test_vm.py | 84 +-- tests/python/relay/test_vm_serialization.py | 18 +- tests/python/topi/python/test_fifo_buffer.py | 70 +- .../python/topi/python/test_topi_argwhere.py | 24 +- .../topi/python/test_topi_batch_matmul.py | 24 +- .../test_topi_batch_matmul_tensorcore.py | 8 +- .../python/test_topi_batch_to_space_nd.py | 18 +- .../topi/python/test_topi_bitserial_conv2d.py | 16 +- .../python/test_topi_bitserial_conv2d_rasp.py | 8 +- .../topi/python/test_topi_bitserial_dense.py | 8 +- tests/python/topi/python/test_topi_bnn.py | 12 +- .../python/topi/python/test_topi_broadcast.py | 114 ++-- tests/python/topi/python/test_topi_clip.py | 18 +- tests/python/topi/python/test_topi_conv1d.py | 20 +- .../python/test_topi_conv1d_transpose_ncw.py | 24 +- .../topi/python/test_topi_conv2d_NCHWc.py | 10 +- .../topi/python/test_topi_conv2d_hwcn.py | 36 +- .../test_topi_conv2d_hwnc_tensorcore.py | 26 +- .../topi/python/test_topi_conv2d_int8.py | 110 +-- .../topi/python/test_topi_conv2d_nchw.py | 40 +- .../topi/python/test_topi_conv2d_nhwc.py | 8 +- .../python/test_topi_conv2d_nhwc_pack_int8.py | 8 +- .../test_topi_conv2d_nhwc_tensorcore.py | 12 +- .../python/test_topi_conv2d_nhwc_winograd.py | 10 +- .../python/test_topi_conv2d_transpose_nchw.py | 36 +- .../topi/python/test_topi_conv2d_winograd.py | 20 +- .../topi/python/test_topi_conv3d_ncdhw.py | 26 +- .../topi/python/test_topi_conv3d_ndhwc.py | 22 +- .../test_topi_conv3d_ndhwc_tensorcore.py | 10 +- .../test_topi_conv3d_transpose_ncdhw.py | 24 +- .../topi/python/test_topi_conv3d_winograd.py | 10 +- .../topi/python/test_topi_correlation.py | 20 +- .../python/test_topi_deformable_conv2d.py | 20 +- tests/python/topi/python/test_topi_dense.py | 26 +- .../topi/python/test_topi_dense_tensorcore.py | 10 +- .../topi/python/test_topi_depth_to_space.py | 10 +- 
.../topi/python/test_topi_depthwise_conv2d.py | 130 ++-- .../test_topi_depthwise_conv2d_back_input.py | 10 +- .../test_topi_depthwise_conv2d_back_weight.py | 10 +- tests/python/topi/python/test_topi_dilate.py | 6 +- tests/python/topi/python/test_topi_einsum.py | 6 +- .../topi/python/test_topi_group_conv2d.py | 90 +-- .../test_topi_group_conv2d_NCHWc_int8.py | 10 +- tests/python/topi/python/test_topi_image.py | 96 +-- tests/python/topi/python/test_topi_lrn.py | 6 +- tests/python/topi/python/test_topi_math.py | 86 +-- tests/python/topi/python/test_topi_matmul.py | 6 +- tests/python/topi/python/test_topi_pooling.py | 114 ++-- tests/python/topi/python/test_topi_prng.py | 40 +- tests/python/topi/python/test_topi_qnn.py | 52 +- tests/python/topi/python/test_topi_reduce.py | 16 +- tests/python/topi/python/test_topi_relu.py | 36 +- tests/python/topi/python/test_topi_reorg.py | 6 +- tests/python/topi/python/test_topi_scan.py | 34 +- tests/python/topi/python/test_topi_scatter.py | 6 +- tests/python/topi/python/test_topi_softmax.py | 26 +- tests/python/topi/python/test_topi_sort.py | 72 +- .../python/test_topi_space_to_batch_nd.py | 18 +- .../topi/python/test_topi_space_to_depth.py | 10 +- tests/python/topi/python/test_topi_sparse.py | 82 +-- tests/python/topi/python/test_topi_tensor.py | 62 +- .../python/topi/python/test_topi_transform.py | 646 +++++++++--------- tests/python/topi/python/test_topi_unique.py | 18 +- .../topi/python/test_topi_upsampling.py | 36 +- tests/python/topi/python/test_topi_vision.py | 190 +++--- .../test_auto_scheduler_layout_rewrite.py | 24 +- .../test_auto_scheduler_search_policy.py | 2 +- tests/python/unittest/test_crt.py | 28 +- tests/python/unittest/test_link_params.py | 2 +- .../python/unittest/test_runtime_container.py | 2 +- tests/python/unittest/test_runtime_graph.py | 6 +- .../unittest/test_runtime_graph_cuda_graph.py | 4 +- .../unittest/test_runtime_graph_debug.py | 6 +- .../unittest/test_runtime_heterogeneous.py | 22 +- .../test_runtime_module_based_interface.py | 88 +-- .../unittest/test_runtime_module_load.py | 24 +- tests/python/unittest/test_runtime_rpc.py | 38 +- .../unittest/test_runtime_vm_profiler.py | 4 +- .../unittest/test_target_codegen_blob.py | 14 +- .../unittest/test_target_codegen_bool.py | 16 +- .../unittest/test_target_codegen_c_host.py | 40 +- .../test_target_codegen_cross_llvm.py | 8 +- .../unittest/test_target_codegen_cuda.py | 150 ++-- .../unittest/test_target_codegen_device.py | 12 +- .../unittest/test_target_codegen_extern.py | 18 +- .../unittest/test_target_codegen_llvm.py | 72 +- .../unittest/test_target_codegen_opencl.py | 70 +- .../unittest/test_target_codegen_rocm.py | 36 +- .../unittest/test_target_codegen_spirv.py | 10 +- .../unittest/test_target_codegen_vulkan.py | 18 +- tests/python/unittest/test_te_autodiff.py | 2 +- .../python/unittest/test_te_hybrid_script.py | 6 +- ...hedule_postproc_rewrite_for_tensor_core.py | 20 +- .../unittest/test_te_schedule_tensor_core.py | 20 +- .../unittest/test_te_tensor_overload.py | 34 +- tests/python/unittest/test_tir_buffer.py | 32 +- tests/python/unittest/test_tir_intrin.py | 28 +- tests/python/unittest/test_tir_ir_builder.py | 40 +- .../unittest/test_tir_transform_hoist_if.py | 10 +- ...tir_transform_instrument_bound_checkers.py | 132 ++-- .../test_tir_transform_loop_partition.py | 8 +- .../test_tir_transform_lower_warp_memory.py | 26 +- .../auto_scheduler/tune_conv2d_layer_cuda.py | 12 +- tutorials/auto_scheduler/tune_network_arm.py | 6 +- tutorials/auto_scheduler/tune_network_cuda.py | 6 +- 
tutorials/auto_scheduler/tune_network_mali.py | 16 +- tutorials/auto_scheduler/tune_network_x86.py | 6 +- tutorials/auto_scheduler/tune_sparse_x86.py | 16 +- tutorials/autotvm/tune_conv2d_cuda.py | 10 +- tutorials/autotvm/tune_relay_arm.py | 6 +- tutorials/autotvm/tune_relay_cuda.py | 6 +- tutorials/autotvm/tune_relay_mobile_gpu.py | 6 +- tutorials/autotvm/tune_relay_x86.py | 6 +- tutorials/frontend/build_gcn.py | 4 +- tutorials/frontend/deploy_model_on_android.py | 12 +- tutorials/frontend/deploy_model_on_rasp.py | 4 +- .../deploy_object_detection_pytorch.py | 4 +- tutorials/frontend/deploy_prequantized.py | 6 +- .../frontend/deploy_prequantized_tflite.py | 4 +- tutorials/frontend/deploy_quantized.py | 4 +- tutorials/frontend/deploy_sparse.py | 20 +- tutorials/frontend/deploy_ssd_gluoncv.py | 12 +- tutorials/frontend/from_caffe2.py | 4 +- tutorials/frontend/from_coreml.py | 4 +- tutorials/frontend/from_darknet.py | 4 +- tutorials/frontend/from_keras.py | 4 +- tutorials/frontend/from_mxnet.py | 4 +- tutorials/frontend/from_pytorch.py | 4 +- tutorials/frontend/from_tensorflow.py | 6 +- tutorials/frontend/using_external_lib.py | 8 +- .../get_started/cross_compilation_and_rpc.py | 14 +- tutorials/get_started/relay_quick_start.py | 6 +- .../get_started/tensor_expr_get_started.py | 18 +- tutorials/get_started/tune_matmul_x86.py | 12 +- tutorials/language/extern_op.py | 12 +- tutorials/language/reduction.py | 6 +- tutorials/language/scan.py | 6 +- tutorials/language/tensorize.py | 10 +- tutorials/micro/micro_tflite.py | 2 +- tutorials/optimize/opt_conv_cuda.py | 10 +- tutorials/optimize/opt_conv_tensorcore.py | 12 +- tutorials/optimize/opt_gemm.py | 34 +- .../optimize/opt_matmul_auto_tensorcore.py | 12 +- tutorials/topi/intro_topi.py | 8 +- vta/python/vta/top/graphpack.py | 10 +- vta/runtime/device_api.cc | 28 +- .../python/integration/test_benchmark_gemm.py | 10 +- .../integration/test_benchmark_topi_conv2d.py | 12 +- .../test_benchmark_topi_conv2d_transpose.py | 10 +- .../integration/test_benchmark_topi_dense.py | 10 +- .../test_benchmark_topi_group_conv2d.py | 12 +- vta/tests/python/unittest/test_vta_insn.py | 44 +- web/emcc/tvmjs_support.cc | 34 +- web/emcc/webgpu_runtime.cc | 43 +- web/src/ctypes.ts | 4 +- web/src/index.ts | 2 +- web/src/runtime.ts | 76 +-- web/tests/node/test_ndarray.js | 2 +- web/tests/python/webgpu_rpc_test.py | 6 +- web/tests/python/websock_rpc_test.py | 8 +- 513 files changed, 5761 insertions(+), 5765 deletions(-) rename golang/src/{context.go => device.go} (58%) rename jvm/core/src/main/java/org/apache/tvm/{TVMContext.java => Device.java} (75%) rename jvm/core/src/main/java/org/apache/tvm/rpc/{TVMRemoteContext.java => TVMRemoteDevice.java} (86%) rename rust/tvm-rt/src/{context.rs => device.rs} (91%) rename rust/tvm-sys/src/{context.rs => device.rs} (71%) diff --git a/3rdparty/dlpack b/3rdparty/dlpack index 3ec04430e89a6..a07f962d446b5 160000 --- a/3rdparty/dlpack +++ b/3rdparty/dlpack @@ -1 +1 @@ -Subproject commit 3ec04430e89a6834e5a1b99471f415fa939bf642 +Subproject commit a07f962d446b577adf4baef2b347a0f3a2a20617 diff --git a/apps/android_camera/app/src/main/java/org/apache/tvm/android/androidcamerademo/Camera2BasicFragment.java b/apps/android_camera/app/src/main/java/org/apache/tvm/android/androidcamerademo/Camera2BasicFragment.java index f598f8e698f8e..53913ef306dc0 100644 --- a/apps/android_camera/app/src/main/java/org/apache/tvm/android/androidcamerademo/Camera2BasicFragment.java +++ 
b/apps/android_camera/app/src/main/java/org/apache/tvm/android/androidcamerademo/Camera2BasicFragment.java @@ -57,7 +57,7 @@ import org.apache.tvm.Function; import org.apache.tvm.Module; import org.apache.tvm.NDArray; -import org.apache.tvm.TVMContext; +import org.apache.tvm.Device; import org.apache.tvm.TVMType; import org.apache.tvm.TVMValue; import org.json.JSONException; @@ -571,9 +571,9 @@ protected Integer doInBackground(Void... args) { return -1;//failure } - Log.i(TAG, "creating java tvm context..."); - // create java tvm context - TVMContext tvmCtx = EXE_GPU ? TVMContext.opencl() : TVMContext.cpu(); + Log.i(TAG, "creating java tvm device..."); + // create java tvm device + Device tvmDev = EXE_GPU ? Device.opencl() : Device.cpu(); Log.i(TAG, "loading compiled functions..."); Log.i(TAG, libCacheFilePath); @@ -587,13 +587,13 @@ protected Integer doInBackground(Void... args) { Function runtimeCreFun = Function.getFunction("tvm.graph_runtime.create"); Log.i(TAG, "creating graph runtime..."); - Log.i(TAG, "ctx type: " + tvmCtx.deviceType); - Log.i(TAG, "ctx id: " + tvmCtx.deviceId); + Log.i(TAG, "device type: " + tvmDev.deviceType); + Log.i(TAG, "device id: " + tvmDev.deviceId); TVMValue runtimeCreFunRes = runtimeCreFun.pushArg(modelGraph) .pushArg(modelLib) - .pushArg(tvmCtx.deviceType) - .pushArg(tvmCtx.deviceId) + .pushArg(tvmDev.deviceType) + .pushArg(tvmDev.deviceId) .invoke(); Log.i(TAG, "as module..."); diff --git a/apps/android_deploy/app/src/main/java/org/apache/tvm/android/demo/MainActivity.java b/apps/android_deploy/app/src/main/java/org/apache/tvm/android/demo/MainActivity.java index 43a279b820d78..38c135a1edc49 100644 --- a/apps/android_deploy/app/src/main/java/org/apache/tvm/android/demo/MainActivity.java +++ b/apps/android_deploy/app/src/main/java/org/apache/tvm/android/demo/MainActivity.java @@ -56,7 +56,7 @@ import org.apache.tvm.Function; import org.apache.tvm.Module; import org.apache.tvm.NDArray; -import org.apache.tvm.TVMContext; +import org.apache.tvm.Device; import org.apache.tvm.TVMValue; import org.apache.tvm.TVMType; @@ -177,8 +177,8 @@ protected Integer doInBackground(Void... args) { return -1;//failure } - // create java tvm context - TVMContext tvmCtx = EXE_GPU ? TVMContext.opencl() : TVMContext.cpu(); + // create java tvm device + Device tvmDev = EXE_GPU ? Device.opencl() : Device.cpu(); // tvm module for compiled functions Module modelLib = Module.load(libCacheFilePath); @@ -187,8 +187,8 @@ protected Integer doInBackground(Void... 
args) { Function runtimeCreFun = Function.getFunction("tvm.graph_runtime.create"); TVMValue runtimeCreFunRes = runtimeCreFun.pushArg(modelGraph) .pushArg(modelLib) - .pushArg(tvmCtx.deviceType) - .pushArg(tvmCtx.deviceId) + .pushArg(tvmDev.deviceType) + .pushArg(tvmDev.deviceId) .invoke(); graphRuntimeModule = runtimeCreFunRes.asModule(); diff --git a/apps/android_rpc/tests/android_rpc_test.py b/apps/android_rpc/tests/android_rpc_test.py index 9586bffeca0b0..0c0f429b516fd 100644 --- a/apps/android_rpc/tests/android_rpc_test.py +++ b/apps/android_rpc/tests/android_rpc_test.py @@ -68,12 +68,12 @@ def test_rpc_module(): # Execute the portable graph on cpu target print("Run CPU test ...") - ctx = remote.cpu(0) + dev = remote.cpu(0) remote.upload(path_dso_cpu) f2 = remote.load_module("cpu_lib.so") - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) - time_f = f2.time_evaluator(f2.entry_name, ctx, number=10) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev) + time_f = f2.time_evaluator(f2.entry_name, dev, number=10) cost = time_f(a, b).mean print("%g secs/op\n" % cost) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) @@ -91,12 +91,12 @@ def test_rpc_module(): f.export_library(path_dso_cl, ndk.create_shared) print("Run GPU(OpenCL Flavor) test ...") - ctx = remote.cl(0) + dev = remote.cl(0) remote.upload(path_dso_cl) f1 = remote.load_module("dev_lib_cl.so") - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) - time_f = f1.time_evaluator(f1.entry_name, ctx, number=10) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev) + time_f = f1.time_evaluator(f1.entry_name, dev, number=10) cost = time_f(a, b).mean print("%g secs/op\n" % cost) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) @@ -114,12 +114,12 @@ def test_rpc_module(): f.export_library(path_dso_vulkan, ndk.create_shared) print("Run GPU(Vulkan Flavor) test ...") - ctx = remote.vulkan(0) + dev = remote.vulkan(0) remote.upload(path_dso_vulkan) f1 = remote.load_module("dev_lib_vulkan.so") - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) - time_f = f1.time_evaluator(f1.entry_name, ctx, number=10) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev) + time_f = f1.time_evaluator(f1.entry_name, dev, number=10) cost = time_f(a, b).mean print("%g secs/op\n" % cost) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) diff --git a/apps/benchmark/arm_cpu_imagenet_bench.py b/apps/benchmark/arm_cpu_imagenet_bench.py index e7233370e6d68..915f2303b9ed2 100644 --- a/apps/benchmark/arm_cpu_imagenet_bench.py +++ b/apps/benchmark/arm_cpu_imagenet_bench.py @@ -54,17 +54,17 @@ def evaluate_network(network, target, target_host, repeat): # upload library and params print_progress("%-20s uploading..." % network) - ctx = remote.context(str(target), 0) + dev = remote.device(str(target), 0) remote.upload(tmp.relpath(filename)) rlib = remote.load_module(filename) - module = runtime.GraphModule(rlib["default"](ctx)) + module = runtime.GraphModule(rlib["default"](dev)) data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) module.set_input("data", data_tvm) # evaluate print_progress("%-20s evaluating..." 
% network) - ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=repeat) + ftimer = module.module.time_evaluator("run", dev, number=1, repeat=repeat) prof_res = np.array(ftimer().results) * 1000 # multiply 1000 for converting to millisecond print( "%-20s %-19s (%s)" % (network, "%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res)) diff --git a/apps/benchmark/gpu_imagenet_bench.py b/apps/benchmark/gpu_imagenet_bench.py index b78476f98dc2c..6d91aff74fde3 100644 --- a/apps/benchmark/gpu_imagenet_bench.py +++ b/apps/benchmark/gpu_imagenet_bench.py @@ -37,13 +37,13 @@ def benchmark(network, target): lib = relay.build(net, target=target, params=params) # create runtime - ctx = tvm.context(str(target), 0) - module = runtime.GraphModule(lib["default"](ctx)) + dev = tvm.device(str(target), 0) + module = runtime.GraphModule(lib["default"](dev)) data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) module.set_input("data", data_tvm) # evaluate - ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=args.repeat) + ftimer = module.module.time_evaluator("run", dev, number=1, repeat=args.repeat) prof_res = np.array(ftimer().results) * 1000 # multiply 1000 for converting to millisecond print( "%-20s %-19s (%s)" % (network, "%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res)) diff --git a/apps/benchmark/mobile_gpu_imagenet_bench.py b/apps/benchmark/mobile_gpu_imagenet_bench.py index cf78c66141d06..3144aee080dc9 100644 --- a/apps/benchmark/mobile_gpu_imagenet_bench.py +++ b/apps/benchmark/mobile_gpu_imagenet_bench.py @@ -54,17 +54,17 @@ def evaluate_network(network, target, target_host, dtype, repeat): # upload library and params print_progress("%-20s uploading..." % network) - ctx = remote.context(str(target), 0) + dev = remote.device(str(target), 0) remote.upload(tmp.relpath(filename)) rlib = remote.load_module(filename) - module = runtime.GraphModule(rlib["default"](ctx)) + module = runtime.GraphModule(rlib["default"](dev)) data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) module.set_input("data", data_tvm) # evaluate print_progress("%-20s evaluating..." 
% network) - ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=repeat) + ftimer = module.module.time_evaluator("run", dev, number=1, repeat=repeat) prof_res = np.array(ftimer().results) * 1000 # multiply 1000 for converting to millisecond print( "%-20s %-19s (%s)" % (network, "%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res)) diff --git a/apps/bundle_deploy/bundle.c b/apps/bundle_deploy/bundle.c index 098ac994223e3..84740aa251306 100644 --- a/apps/bundle_deploy/bundle.c +++ b/apps/bundle_deploy/bundle.c @@ -59,9 +59,9 @@ TVM_DLL void* tvm_runtime_create(const char* json_data, const char* params_data, params.data = params_data; params.size = params_size; - TVMContext ctx; - ctx.device_type = (DLDeviceType)device_type; - ctx.device_id = device_id; + DLDevice dev; + dev.device_type = (DLDeviceType)device_type; + dev.device_id = device_id; // declare pointers TVM_CCALL(MemoryManagerCreate(&g_memory_manager, g_crt_memory, sizeof(g_crt_memory), @@ -76,7 +76,7 @@ TVM_DLL void* tvm_runtime_create(const char* json_data, const char* params_data, // run modules TVMGraphRuntime* graph_runtime = NULL; - TVM_CCALL(TVMGraphRuntime_Create(json_data, mod_syslib, &ctx, &graph_runtime)); + TVM_CCALL(TVMGraphRuntime_Create(json_data, mod_syslib, &dev, &graph_runtime)); TVM_CCALL(TVMGraphRuntime_LoadParams(graph_runtime, params.data, params.size)); return graph_runtime; @@ -116,12 +116,12 @@ void __attribute__((noreturn)) TVMPlatformAbort(tvm_crt_error_t error_code) { exit(-1); } -tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLContext ctx, void** out_ptr) { - return g_memory_manager->Allocate(g_memory_manager, num_bytes, ctx, out_ptr); +tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void** out_ptr) { + return g_memory_manager->Allocate(g_memory_manager, num_bytes, dev, out_ptr); } -tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLContext ctx) { - return g_memory_manager->Free(g_memory_manager, ptr, ctx); +tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLDevice dev) { + return g_memory_manager->Free(g_memory_manager, ptr, dev); } tvm_crt_error_t TVMPlatformTimerStart() { return kTvmErrorFunctionCallNotImplemented; } diff --git a/apps/bundle_deploy/bundle_static.c b/apps/bundle_deploy/bundle_static.c index c4b637c3fc7b6..ca75b9e0b2e3e 100644 --- a/apps/bundle_deploy/bundle_static.c +++ b/apps/bundle_deploy/bundle_static.c @@ -59,9 +59,9 @@ TVM_DLL void* tvm_runtime_create(const char* json_data, const char* params_data, params.data = params_data; params.size = params_size; - TVMContext ctx; - ctx.device_type = (DLDeviceType)device_type; - ctx.device_id = device_id; + DLDevice dev; + dev.device_type = (DLDeviceType)device_type; + dev.device_id = device_id; // get pointers TVM_CCALL(MemoryManagerCreate(&g_memory_manager, g_crt_memory, sizeof(g_crt_memory), @@ -76,7 +76,7 @@ TVM_DLL void* tvm_runtime_create(const char* json_data, const char* params_data, // run modules TVMGraphRuntime* graph_runtime = NULL; - TVM_CCALL(TVMGraphRuntime_Create(json_data, mod_syslib, &ctx, &graph_runtime)); + TVM_CCALL(TVMGraphRuntime_Create(json_data, mod_syslib, &dev, &graph_runtime)); TVM_CCALL(TVMGraphRuntime_LoadParams(graph_runtime, params.data, params.size)); return graph_runtime; @@ -117,12 +117,12 @@ void __attribute__((noreturn)) TVMPlatformAbort(tvm_crt_error_t error_code) { exit(-1); } -tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLContext ctx, void** out_ptr) { - return g_memory_manager->Allocate(g_memory_manager, num_bytes, ctx, 
out_ptr); +tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void** out_ptr) { + return g_memory_manager->Allocate(g_memory_manager, num_bytes, dev, out_ptr); } -tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLContext ctx) { - return g_memory_manager->Free(g_memory_manager, ptr, ctx); +tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLDevice dev) { + return g_memory_manager->Free(g_memory_manager, ptr, dev); } tvm_crt_error_t TVMPlatformTimerStart() { return kTvmErrorFunctionCallNotImplemented; } diff --git a/apps/bundle_deploy/demo.cc b/apps/bundle_deploy/demo.cc index 98f29d2833531..01107c70c3531 100644 --- a/apps/bundle_deploy/demo.cc +++ b/apps/bundle_deploy/demo.cc @@ -119,7 +119,7 @@ int main(int argc, char** argv) { std::vector input_shape = {1, 3, 224, 224}; DLTensor input; input.data = input_storage; - input.ctx = DLContext{kDLCPU, 0}; + input.device = DLDevice{kDLCPU, 0}; input.ndim = 4; input.dtype = DLDataType{kDLFloat, 32, 1}; input.shape = input_shape.data(); @@ -138,7 +138,7 @@ int main(int argc, char** argv) { std::vector output_shape = {1, 1000}; DLTensor output; output.data = output_storage; - output.ctx = DLContext{kDLCPU, 0}; + output.device = DLDevice{kDLCPU, 0}; output.ndim = 2; output.dtype = DLDataType{kDLFloat, 32, 1}; output.shape = output_shape.data(); diff --git a/apps/bundle_deploy/demo_static.c b/apps/bundle_deploy/demo_static.c index b25ad90a23887..a2fec6937311e 100644 --- a/apps/bundle_deploy/demo_static.c +++ b/apps/bundle_deploy/demo_static.c @@ -54,8 +54,8 @@ int main(int argc, char** argv) { DLTensor input; input.data = input_storage; - DLContext ctx = {kDLCPU, 0}; - input.ctx = ctx; + DLDevice dev = {kDLCPU, 0}; + input.device = dev; input.ndim = 4; DLDataType dtype = {kDLFloat, 32, 1}; input.dtype = dtype; @@ -73,8 +73,8 @@ int main(int argc, char** argv) { float output_storage[OUTPUT_LEN]; DLTensor output; output.data = output_storage; - DLContext out_ctx = {kDLCPU, 0}; - output.ctx = out_ctx; + DLDevice out_dev = {kDLCPU, 0}; + output.device = out_dev; output.ndim = 2; DLDataType out_dtype = {kDLFloat, 32, 1}; output.dtype = out_dtype; diff --git a/apps/bundle_deploy/test.cc b/apps/bundle_deploy/test.cc index c1a7f5d45377e..270047611f573 100644 --- a/apps/bundle_deploy/test.cc +++ b/apps/bundle_deploy/test.cc @@ -114,7 +114,7 @@ int main(int argc, char** argv) { std::vector input_shape = {10, 5}; DLTensor input; input.data = input_storage; - input.ctx = DLContext{kDLCPU, 0}; + input.device = DLDevice{kDLCPU, 0}; input.ndim = 2; input.dtype = DLDataType{kDLFloat, 32, 1}; input.shape = input_shape.data(); @@ -133,7 +133,7 @@ int main(int argc, char** argv) { std::vector output_shape = {10, 5}; DLTensor output; output.data = output_storage; - output.ctx = DLContext{kDLCPU, 0}; + output.device = DLDevice{kDLCPU, 0}; output.ndim = 2; output.dtype = DLDataType{kDLFloat, 32, 1}; output.shape = output_shape.data(); diff --git a/apps/bundle_deploy/test_static.c b/apps/bundle_deploy/test_static.c index 11ca2c44952ef..b9c980843ea13 100644 --- a/apps/bundle_deploy/test_static.c +++ b/apps/bundle_deploy/test_static.c @@ -66,8 +66,8 @@ int main(int argc, char** argv) { DLTensor input; input.data = input_storage; - DLContext ctx = {kDLCPU, 0}; - input.ctx = ctx; + DLDevice dev = {kDLCPU, 0}; + input.device = dev; input.ndim = 2; DLDataType dtype = {kDLFloat, 32, 1}; input.dtype = dtype; @@ -85,8 +85,8 @@ int main(int argc, char** argv) { float output_storage[10 * 5]; DLTensor output; output.data = output_storage; - DLContext out_ctx = 
{kDLCPU, 0}; - output.ctx = out_ctx; + DLDevice out_dev = {kDLCPU, 0}; + output.device = out_dev; output.ndim = 2; DLDataType out_dtype = {kDLFloat, 32, 1}; output.dtype = out_dtype; diff --git a/apps/extension/tests/test_ext.py b/apps/extension/tests/test_ext.py index c73e820c19ad6..a01f97c349ca1 100644 --- a/apps/extension/tests/test_ext.py +++ b/apps/extension/tests/test_ext.py @@ -40,10 +40,10 @@ def check_llvm(): if not tvm.testing.device_enabled("llvm"): return f = tvm.build(s, [A, B], "ext_dev", "llvm") - ctx = tvm.ext_dev(0) + dev = tvm.ext_dev(0) # launch the kernel. - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1) @@ -87,10 +87,10 @@ def check_llvm(): if not tvm.testing.device_enabled("llvm"): return f = tvm.build(s, [A, B], "llvm") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # launch the kernel. - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1) diff --git a/apps/howto_deploy/cpp_deploy.cc b/apps/howto_deploy/cpp_deploy.cc index 829241d31a6d3..f58648c2fb7a1 100644 --- a/apps/howto_deploy/cpp_deploy.cc +++ b/apps/howto_deploy/cpp_deploy.cc @@ -86,17 +86,17 @@ void DeploySingleOp() { void DeployGraphRuntime() { LOG(INFO) << "Running graph runtime..."; // load in the library - DLContext ctx{kDLCPU, 0}; + DLDevice dev{kDLCPU, 0}; tvm::runtime::Module mod_factory = tvm::runtime::Module::LoadFromFile("lib/test_relay_add.so"); // create the graph runtime module - tvm::runtime::Module gmod = mod_factory.GetFunction("default")(ctx); + tvm::runtime::Module gmod = mod_factory.GetFunction("default")(dev); tvm::runtime::PackedFunc set_input = gmod.GetFunction("set_input"); tvm::runtime::PackedFunc get_output = gmod.GetFunction("get_output"); tvm::runtime::PackedFunc run = gmod.GetFunction("run"); // Use the C++ API - tvm::runtime::NDArray x = tvm::runtime::NDArray::Empty({2, 2}, DLDataType{kDLFloat, 32, 1}, ctx); - tvm::runtime::NDArray y = tvm::runtime::NDArray::Empty({2, 2}, DLDataType{kDLFloat, 32, 1}, ctx); + tvm::runtime::NDArray x = tvm::runtime::NDArray::Empty({2, 2}, DLDataType{kDLFloat, 32, 1}, dev); + tvm::runtime::NDArray y = tvm::runtime::NDArray::Empty({2, 2}, DLDataType{kDLFloat, 32, 1}, dev); for (int i = 0; i < 2; ++i) { for (int j = 0; j < 2; ++j) { diff --git a/apps/ios_rpc/tests/ios_rpc_mobilenet.py b/apps/ios_rpc/tests/ios_rpc_mobilenet.py index 90ac6bfb92189..50468d3011343 100644 --- a/apps/ios_rpc/tests/ios_rpc_mobilenet.py +++ b/apps/ios_rpc/tests/ios_rpc_mobilenet.py @@ -116,20 +116,20 @@ def run(mod, target): remote = rpc.connect(proxy_host, proxy_port, key=key) if target == "metal": - ctx = remote.metal(0) + dev = remote.metal(0) else: - ctx = remote.cpu(0) + dev = remote.cpu(0) lib = remote.load_module("deploy.dylib") - m = graph_runtime.GraphModule(lib["default"](ctx)) + m = graph_runtime.GraphModule(lib["default"](dev)) - m.set_input("data", tvm.nd.array(image, ctx)) + m.set_input("data", tvm.nd.array(image, dev)) m.run() tvm_output = m.get_output(0) top1 = np.argmax(tvm_output.asnumpy()[0]) print("TVM prediction top-1:", top1, synset[top1]) # evaluate - ftimer = 
m.module.time_evaluator("run", ctx, number=3, repeat=10) + ftimer = m.module.time_evaluator("run", dev, number=3, repeat=10) prof_res = np.array(ftimer().results) * 1000 print("%-19s (%s)" % ("%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res))) diff --git a/apps/ios_rpc/tests/ios_rpc_test.py b/apps/ios_rpc/tests/ios_rpc_test.py index a967c2f75e61c..865cdb0b8e432 100644 --- a/apps/ios_rpc/tests/ios_rpc_test.py +++ b/apps/ios_rpc/tests/ios_rpc_test.py @@ -88,22 +88,22 @@ def test_rpc_module(): # connect to the proxy remote = rpc.connect(proxy_host, proxy_port, key=key) - ctx = remote.metal(0) + dev = remote.metal(0) f1 = remote.load_module("dev_lib.dylib") a_np = np.random.uniform(size=1024).astype(A.dtype) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) - time_f = f1.time_evaluator(f1.entry_name, ctx, number=10) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev) + time_f = f1.time_evaluator(f1.entry_name, dev, number=10) cost = time_f(a, b).mean print("%g secs/op" % cost) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) # CPU - ctx = remote.cpu(0) + dev = remote.cpu(0) f2 = remote.load_module("cpu_lib.dylib") a_np = np.random.uniform(size=1024).astype(A.dtype) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) - time_f = f2.time_evaluator(f1.entry_name, ctx, number=10) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev) + time_f = f2.time_evaluator(f1.entry_name, dev, number=10) cost = time_f(a, b).mean print("%g secs/op" % cost) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) diff --git a/apps/topi_recipe/conv/depthwise_conv2d_test.py b/apps/topi_recipe/conv/depthwise_conv2d_test.py index 94687edde5f90..e282e67af7172 100644 --- a/apps/topi_recipe/conv/depthwise_conv2d_test.py +++ b/apps/topi_recipe/conv/depthwise_conv2d_test.py @@ -95,32 +95,32 @@ def check_device(device): if not tvm.runtime.enabled(device): print("Skip because %s is not enabled" % device) return - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) # Build the kernel f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device) f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device) f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device) # Prepare data - input_tvm = tvm.nd.array(input_np, ctx) - filter_tvm = tvm.nd.array(filter_np, ctx) - scale_tvm = tvm.nd.array(scale_np, ctx) - shift_tvm = tvm.nd.array(shift_np, ctx) + input_tvm = tvm.nd.array(input_np, dev) + filter_tvm = tvm.nd.array(filter_np, dev) + scale_tvm = tvm.nd.array(scale_np, dev) + shift_tvm = tvm.nd.array(shift_np, dev) depthwise_conv2d_tvm = tvm.nd.array( - np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape), dtype=DepthwiseConv2d.dtype), ctx + np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape), dtype=DepthwiseConv2d.dtype), dev ) scale_shift_tvm = tvm.nd.array( - np.zeros(shape=get_const_tuple(ScaleShift.shape), dtype=ScaleShift.dtype), ctx + np.zeros(shape=get_const_tuple(ScaleShift.shape), dtype=ScaleShift.dtype), dev ) - relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx) + relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), dev) # Measure time cost of kernel 1 (depthwise_conv2d) - timer_1 = f1.time_evaluator(f1.entry_name, ctx, number=1000) + timer_1 = f1.time_evaluator(f1.entry_name, dev, number=1000) tcost_1 = timer_1(input_tvm, filter_tvm, depthwise_conv2d_tvm).mean # Measure time cost 
of kernel 2 (depthwise_conv2d + scale_shift) - timer_2 = f2.time_evaluator(f2.entry_name, ctx, number=1000) + timer_2 = f2.time_evaluator(f2.entry_name, dev, number=1000) tcost_2 = timer_2(input_tvm, filter_tvm, scale_tvm, shift_tvm, scale_shift_tvm).mean # Measure time cost of kernel 3 (depthwise_conv2d + scale_shift + relu) - timer_3 = f3.time_evaluator(f3.entry_name, ctx, number=1000) + timer_3 = f3.time_evaluator(f3.entry_name, dev, number=1000) tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm, relu_tvm).mean print("Input shape = " + str(get_const_tuple(Input.shape))) print("Filter shape = " + str(get_const_tuple(Filter.shape))) @@ -203,31 +203,31 @@ def check_device(device): if not tvm.runtime.enabled(device): print("Skip because %s is not enabled" % device) return - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) # Build the kernel f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device) f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device) f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device) # Prepare data - input_tvm = tvm.nd.array(input_np, ctx) - filter_tvm = tvm.nd.array(filter_np, ctx) - scale_tvm = tvm.nd.array(scale_np, ctx) - shift_tvm = tvm.nd.array(shift_np, ctx) + input_tvm = tvm.nd.array(input_np, dev) + filter_tvm = tvm.nd.array(filter_np, dev) + scale_tvm = tvm.nd.array(scale_np, dev) + shift_tvm = tvm.nd.array(shift_np, dev) depthwise_conv2d_tvm = tvm.nd.array( - np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape), dtype=DepthwiseConv2d.dtype), ctx + np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape), dtype=DepthwiseConv2d.dtype), dev ) scale_shift_tvm = tvm.nd.array( - np.zeros(shape=get_const_tuple(ScaleShift.shape), dtype=ScaleShift.dtype), ctx + np.zeros(shape=get_const_tuple(ScaleShift.shape), dtype=ScaleShift.dtype), dev ) - relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx) + relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), dev) # Measure time cost of kernel 1 (depthwise_conv2d) - timer_1 = f1.time_evaluator(f1.entry_name, ctx, number=1000) + timer_1 = f1.time_evaluator(f1.entry_name, dev, number=1000) tcost_1 = timer_1(input_tvm, filter_tvm, depthwise_conv2d_tvm).mean # Measure time cost of kernel 2 (depthwise_conv2d + scale_shift) - timer_2 = f2.time_evaluator(f2.entry_name, ctx, number=1000) + timer_2 = f2.time_evaluator(f2.entry_name, dev, number=1000) tcost_2 = timer_2(input_tvm, filter_tvm, scale_tvm, shift_tvm, scale_shift_tvm).mean # Measure time cost of kernel 3 (depthwise_conv2d + scale_shift + relu) - timer_3 = f3.time_evaluator(f3.entry_name, ctx, number=1000) + timer_3 = f3.time_evaluator(f3.entry_name, dev, number=1000) tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm, relu_tvm).mean print("Input shape = " + str(get_const_tuple(Input.shape))) print("Filter shape = " + str(get_const_tuple(Filter.shape))) diff --git a/apps/topi_recipe/conv/test_conv2d_hwcn_map.py b/apps/topi_recipe/conv/test_conv2d_hwcn_map.py index d67bfdc8952e8..a2394a7279c00 100644 --- a/apps/topi_recipe/conv/test_conv2d_hwcn_map.py +++ b/apps/topi_recipe/conv/test_conv2d_hwcn_map.py @@ -75,11 +75,11 @@ def check_device(device): if not tvm.runtime.enabled(device): print("Skip because %s is not enabled" % device) return - ctx = tvm.context(device, 0) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) - c = 
tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + dev = tvm.device(device, 0) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) with tvm.transform.PassContext( config={ diff --git a/apps/topi_recipe/conv/test_conv_int8_arm.py b/apps/topi_recipe/conv/test_conv_int8_arm.py index 289e69ab9b5bf..4e82629285680 100644 --- a/apps/topi_recipe/conv/test_conv_int8_arm.py +++ b/apps/topi_recipe/conv/test_conv_int8_arm.py @@ -61,7 +61,7 @@ TARGET_NAME = "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr=+v8.2a,+dotprod" NUM_VEC_LANES = 16 -CTX = tvm.context(TARGET_NAME, 0) +DEV = tvm.device(TARGET_NAME, 0) def get_shape( @@ -136,16 +136,16 @@ def run_inference( # Create the numpy arrays to be used for executing conv models if data_dtype == "float32": - data_array = tvm.nd.array(np.random.rand(*data_shape).astype(dtype=data_dtype), CTX) - kernel_array = tvm.nd.array(np.random.rand(*kernel_shape).astype(dtype=kernel_dtype), CTX) + data_array = tvm.nd.array(np.random.rand(*data_shape).astype(dtype=data_dtype), DEV) + kernel_array = tvm.nd.array(np.random.rand(*kernel_shape).astype(dtype=kernel_dtype), DEV) else: data_array = tvm.nd.array(np.random.randint(100, size=data_shape).astype(data_dtype)) kernel_array = tvm.nd.array(np.random.randint(100, size=kernel_shape).astype(kernel_dtype)) # c_orig will be used for declaration ouptut # c_sch will be used for scheduled computation output - c_orig = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), CTX) - c_sch = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), CTX) + c_orig = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), DEV) + c_sch = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), DEV) with tvm.target.Target(TARGET_NAME): if out_dtype == "float32": @@ -190,7 +190,7 @@ def run_inference( else: assert np.allclose(c_orig.asnumpy(), c_sch.asnumpy()) - evaluator = func.time_evaluator(func.entry_name, CTX, number=1000) + evaluator = func.time_evaluator(func.entry_name, DEV, number=1000) LOGGER.debug(tvm.lower(sconv, [data, kernel], simple_mode=True)) return evaluator(data_array, kernel_array, c_sch).mean diff --git a/apps/topi_recipe/conv/test_conv_int8_intel.py b/apps/topi_recipe/conv/test_conv_int8_intel.py index 562812ab8d822..b46d80fff821e 100644 --- a/apps/topi_recipe/conv/test_conv_int8_intel.py +++ b/apps/topi_recipe/conv/test_conv_int8_intel.py @@ -61,7 +61,7 @@ TARGET_NAME = "llvm -mcpu=skylake-avx512" NUM_VEC_LANES = 16 -CTX = tvm.context(TARGET_NAME, 0) +DEV = tvm.device(TARGET_NAME, 0) def get_shape( @@ -137,16 +137,16 @@ def run_inference( # Create the numpy arrays to be used for executing conv models if data_dtype == "float32": - data_array = tvm.nd.array(np.random.rand(*data_shape).astype(dtype=data_dtype), CTX) - kernel_array = tvm.nd.array(np.random.rand(*kernel_shape).astype(dtype=kernel_dtype), CTX) + data_array = tvm.nd.array(np.random.rand(*data_shape).astype(dtype=data_dtype), DEV) + kernel_array = tvm.nd.array(np.random.rand(*kernel_shape).astype(dtype=kernel_dtype), DEV) else: data_array = tvm.nd.array(np.random.randint(100, size=data_shape).astype(data_dtype)) kernel_array = tvm.nd.array(np.random.randint(100, size=kernel_shape).astype(kernel_dtype)) # c_orig will be used for declaration ouptut # c_sch will be used for scheduled computation output - c_orig = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), CTX) - c_sch = tvm.nd.array(np.zeros(o_shape, 
dtype=out_dtype), CTX) + c_orig = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), DEV) + c_sch = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), DEV) with tvm.target.Target(TARGET_NAME): conv = topi.nn.conv2d_NCHWc( @@ -176,7 +176,7 @@ def run_inference( else: assert np.allclose(c_orig.asnumpy(), c_sch.asnumpy()) - evaluator = func.time_evaluator(func.entry_name, CTX, number=1000) + evaluator = func.time_evaluator(func.entry_name, DEV, number=1000) LOGGER.debug(tvm.lower(sconv, [data, kernel], simple_mode=True)) return evaluator(data_array, kernel_array, c_sch).mean diff --git a/apps/topi_recipe/gemm/android_gemm_square.py b/apps/topi_recipe/gemm/android_gemm_square.py index 0e64dcd3844d4..41370c677b38f 100644 --- a/apps/topi_recipe/gemm/android_gemm_square.py +++ b/apps/topi_recipe/gemm/android_gemm_square.py @@ -40,14 +40,14 @@ def ngflops(N): dtype = "float32" -def evaluate(func, ctx, N, times): +def evaluate(func, dev, N, times): a_np = np.random.uniform(size=(N, N)).astype(dtype) b_np = np.random.uniform(size=(N, N)).astype(dtype) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros((N, N), dtype=dtype), ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros((N, N), dtype=dtype), dev) - time_f = func.time_evaluator(func.entry_name, ctx, number=times) + time_f = func.time_evaluator(func.entry_name, dev, number=times) cost = time_f(a, b, c).mean gf = ngflops(N) / cost print("%g secs/op, %g GFLOPS" % (cost, gf)) @@ -127,11 +127,11 @@ def test_gemm_gpu(N, times, bn, num_block, num_thread): # connect to the proxy remote = rpc.connect(proxy_host, proxy_port, key=key) - ctx = remote.cl(0) + dev = remote.cl(0) remote.upload(path_dso) f = remote.load_module("gemm_gpu.so") - evaluate(f, ctx, N, times) + evaluate(f, dev, N, times) if __name__ == "__main__": diff --git a/apps/topi_recipe/gemm/cuda_gemm_square.py b/apps/topi_recipe/gemm/cuda_gemm_square.py index 0d548dc0b5549..d84deea86e826 100644 --- a/apps/topi_recipe/gemm/cuda_gemm_square.py +++ b/apps/topi_recipe/gemm/cuda_gemm_square.py @@ -121,8 +121,8 @@ def test_gemm(): s[BB].double_buffer() # correctness def check_device(device): - ctx = tvm.context(device, 0) - if not ctx.exist: + dev = tvm.device(device, 0) + if not dev.exist: print("Skip because %s is not enabled" % device) return print("Device %s" % device) @@ -131,16 +131,16 @@ def check_device(device): n, m, l = nn, nn, nn a_np = np.random.uniform(size=(n, l)).astype(A.dtype) b_np = np.random.uniform(size=(m, l)).astype(B.dtype) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), dev) for i in range(2): f(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), np.dot(b_np.T, a_np), rtol=1e-5) num_flops = 2 * nn * nn * nn num_runs = 10 - timer_f = f.time_evaluator(f.entry_name, ctx, number=num_runs) + timer_f = f.time_evaluator(f.entry_name, dev, number=num_runs) t = timer_f(a, b, c).mean GFLOPS = num_flops / (t * 1e3) / 1e6 print("average time cost of %d runs = %g ms, %g GFLOPS." 
% (num_runs, t * 1e3, GFLOPS)) diff --git a/apps/topi_recipe/gemm/gemm_int8.py b/apps/topi_recipe/gemm/gemm_int8.py index 0d0941d4217d6..ff3fa81f20d3f 100644 --- a/apps/topi_recipe/gemm/gemm_int8.py +++ b/apps/topi_recipe/gemm/gemm_int8.py @@ -160,14 +160,14 @@ def block_size_filter(entity): s, arg_bufs = gemm_int8(n, m, l) f = tvm.build(s, arg_bufs, "cuda", name="gemm_int8") - ctx = tvm.context("cuda", 0) + dev = tvm.device("cuda", 0) a_np = np.random.randint(size=(n, l), low=-128, high=127, dtype="int8") b_np = np.random.randint(size=(m, l), low=-128, high=127, dtype="int8") - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros((n, m), dtype="int32"), ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros((n, m), dtype="int32"), dev) f(a, b, c) tvm.testing.assert_allclose( @@ -176,7 +176,7 @@ def block_size_filter(entity): num_ops = 2 * l * m * n num_runs = 1000 - timer_f = f.time_evaluator(f.entry_name, ctx, number=num_runs) + timer_f = f.time_evaluator(f.entry_name, dev, number=num_runs) t = timer_f(a, b, c).mean GOPS = num_ops / (t * 1e3) / 1e6 print("average time cost of %d runs = %g ms, %g GOPS." % (num_runs, t * 1e3, GOPS)) diff --git a/apps/topi_recipe/reduce/test_reduce_map.py b/apps/topi_recipe/reduce/test_reduce_map.py index 00b1ca2676df9..0a78e5bedb58e 100644 --- a/apps/topi_recipe/reduce/test_reduce_map.py +++ b/apps/topi_recipe/reduce/test_reduce_map.py @@ -78,8 +78,8 @@ def test_reduce_map(in_shape, axis, keepdims, type="sum", test_id=0): else: raise NotImplementedError - data_tvm = tvm.nd.array(in_npy, ctx=tvm.gpu()) - out_tvm = tvm.nd.empty(shape=out_npy.shape, ctx=tvm.gpu()) + data_tvm = tvm.nd.array(in_npy, device=tvm.gpu()) + out_tvm = tvm.nd.empty(shape=out_npy.shape, device=tvm.gpu()) for _ in range(2): fcuda(data_tvm, out_tvm) diff --git a/apps/topi_recipe/rnn/lstm.py b/apps/topi_recipe/rnn/lstm.py index 701797e18dbf6..e4b7fbade3873 100644 --- a/apps/topi_recipe/rnn/lstm.py +++ b/apps/topi_recipe/rnn/lstm.py @@ -171,20 +171,20 @@ def lstm(): def check_device(target): num_step = n_num_step flstm = tvm.build(s, [Xi2h, Wh2h, scan_h, scan_c], target) - ctx = tvm.gpu(0) if target == "cuda" else tvm.cl(0) + dev = tvm.gpu(0) if target == "cuda" else tvm.cl(0) # launch the kernel. scan_h_np = np.zeros((num_step, batch_size, num_hidden)).astype("float32") scan_c_np = np.zeros((num_step, batch_size, num_hidden)).astype("float32") Xi2h_np = np.random.normal(size=(num_step, batch_size, 4, num_hidden)).astype("float32") Wh2h_np = np.random.normal(size=(4, num_hidden, num_hidden)).astype("float32") - scan_h_a = tvm.nd.array(scan_h_np, ctx) - scan_c_a = tvm.nd.array(scan_c_np, ctx) - Xi2h_a = tvm.nd.array(Xi2h_np, ctx) - Wh2h_a = tvm.nd.array(Wh2h_np, ctx) + scan_h_a = tvm.nd.array(scan_h_np, dev) + scan_c_a = tvm.nd.array(scan_c_np, dev) + Xi2h_a = tvm.nd.array(Xi2h_np, dev) + Wh2h_a = tvm.nd.array(Wh2h_np, dev) flstm(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a) - ctx.sync() + dev.sync() # measure time cost of second step. 
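The recipe hunks above (the depthwise conv2d tests, the GEMM scripts, the reduction test, and this LSTM kernel) converge on one idiom after the rename: obtain a Device with tvm.device() instead of tvm.context(), place the NDArrays on it, and hand the same handle to time_evaluator. A minimal sketch of that idiom, using a stand-in elementwise kernel rather than any of the real recipe workloads:

    import numpy as np
    import tvm
    from tvm import te

    # Stand-in kernel; the recipes above build real conv/GEMM schedules.
    n = 1024
    A = te.placeholder((n,), name="A")
    B = te.compute((n,), lambda i: A[i] + 1.0, name="B")
    f = tvm.build(te.create_schedule(B.op), [A, B], "llvm")

    dev = tvm.device("llvm", 0)   # was: ctx = tvm.context("llvm", 0)
    a = tvm.nd.array(np.random.uniform(size=n).astype("float32"), dev)
    b = tvm.nd.array(np.zeros(n, dtype="float32"), dev)
    f(a, b)                       # warm-up run, as the LSTM recipe does
    dev.sync()                    # was: ctx.sync()
    timer = f.time_evaluator(f.entry_name, dev, number=10)
    print("%g secs/op" % timer(a, b).mean)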
- evaluator = flstm.time_evaluator(flstm.entry_name, ctx, 1, repeat=1000) + evaluator = flstm.time_evaluator(flstm.entry_name, dev, 1, repeat=1000) eval_result = evaluator(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a) print("Time cost=%g" % eval_result.mean) diff --git a/apps/topi_recipe/rnn/matexp.py b/apps/topi_recipe/rnn/matexp.py index e2cea9b31a9d1..ecf868cb5646d 100644 --- a/apps/topi_recipe/rnn/matexp.py +++ b/apps/topi_recipe/rnn/matexp.py @@ -140,22 +140,22 @@ def check_device(target): } ): f = tvm.build(s, [s_scan, Whh], target) - ctx = tvm.gpu(0) if target == "cuda" else tvm.cl(0) + dev = tvm.gpu(0) if target == "cuda" else tvm.cl(0) # launch the kernel. res_np = np.zeros((n_num_step, n_batch_size, n_num_hidden)).astype("float32") Whh_np = np.zeros((n_num_hidden, n_num_hidden)).astype("float32") Whh_np[:] = 2.0 / n_num_hidden Whh_np[:, n_num_hidden // 2 :] = 0 - res_a = tvm.nd.array(res_np, ctx) - Whh_a = tvm.nd.array(Whh_np, ctx) + res_a = tvm.nd.array(res_np, dev) + Whh_a = tvm.nd.array(Whh_np, dev) # Skip first pass as it is compilation f(res_a, Whh_a) - ctx.sync() + dev.sync() # measure time cost of second step. tstart = time.time() f(res_a, Whh_a) - ctx.sync() + dev.sync() tgap = time.time() - tstart print("Time cost=%g" % tgap) # correctness diff --git a/apps/wasm-standalone/wasm-graph/src/types.rs b/apps/wasm-standalone/wasm-graph/src/types.rs index 9d4dff96d189b..a3761a758cffa 100644 --- a/apps/wasm-standalone/wasm-graph/src/types.rs +++ b/apps/wasm-standalone/wasm-graph/src/types.rs @@ -24,7 +24,7 @@ use std::{ }; pub use tvm_sys::ffi::DLTensor; use tvm_sys::ffi::{ - DLContext, DLDataType, DLDataTypeCode_kDLFloat, DLDataTypeCode_kDLInt, DLDeviceType_kDLCPU, + DLDevice, DLDataType, DLDataTypeCode_kDLFloat, DLDataTypeCode_kDLInt, DLDeviceType_kDLCPU, }; #[derive(Debug, PartialEq, Clone, Serialize, Deserialize)] @@ -114,7 +114,7 @@ impl Tensor { pub fn as_dltensor(&self) -> DLTensor { DLTensor { data: self.data.as_ptr() as *mut c_void, - ctx: DLContext { + device: DLDevice { device_type: DLDeviceType_kDLCPU, device_id: 0 as c_int, }, diff --git a/docs/contribute/code_guide.rst b/docs/contribute/code_guide.rst index f6de6c1580011..0ed2ce4ca9e1b 100644 --- a/docs/contribute/code_guide.rst +++ b/docs/contribute/code_guide.rst @@ -91,7 +91,7 @@ If you want your test to run over a variety of targets, use the :py:func:`tvm.te .. code:: python @tvm.testing.parametrize_targets - def test_mytest(target, ctx): + def test_mytest(target, dev): ... will run ``test_mytest`` with ``target="llvm"``, ``target="cuda"``, and few others. This also ensures that your test is run on the correct hardware by the CI. If you only want to test against a couple targets use ``@tvm.testing.parametrize_targets("target_1", "target_2")``. If you want to test on a single target, use the associated decorator from :py:func:`tvm.testing`. For example, CUDA tests use the ``@tvm.testing.requires_cuda`` decorator. diff --git a/docs/deploy/arm_compute_lib.rst b/docs/deploy/arm_compute_lib.rst index 5d11241c1a344..10a0d51e4f915 100644 --- a/docs/deploy/arm_compute_lib.rst +++ b/docs/deploy/arm_compute_lib.rst @@ -148,9 +148,9 @@ https://tvm.apache.org/docs/tutorials/get_started/cross_compilation_and_rpc.html .. 
code:: python - ctx = tvm.cpu(0) + dev = tvm.cpu(0) loaded_lib = tvm.runtime.load_module('lib_acl.so') - gen_module = tvm.contrib.graph_runtime.GraphModule(loaded_lib['default'](ctx)) + gen_module = tvm.contrib.graph_runtime.GraphModule(loaded_lib['default'](dev)) d_data = np.random.uniform(0, 1, data_shape).astype(data_type) map_inputs = {'data': d_data} gen_module.set_input(**map_inputs) diff --git a/docs/deploy/bnns.rst b/docs/deploy/bnns.rst index cb15a4f3bd545..6e20f3c2cdbb8 100644 --- a/docs/deploy/bnns.rst +++ b/docs/deploy/bnns.rst @@ -147,9 +147,9 @@ Load module and run inference on the target machine with TVM built with ``USE_B import numpy as np from tvm.contrib import graph_runtime - ctx = tvm.cpu(0) + dev = tvm.cpu(0) loaded_lib = tvm.runtime.load_module('compiled.dylib') - gen_module = tvm.contrib.graph_runtime.GraphModule(loaded_lib['default'](ctx)) + gen_module = tvm.contrib.graph_runtime.GraphModule(loaded_lib['default'](dev)) dtype = "float32" input_shape = (1, 3, 224, 224) diff --git a/docs/deploy/hls.rst b/docs/deploy/hls.rst index a8faf6453a0f2..3c735e829936f 100644 --- a/docs/deploy/hls.rst +++ b/docs/deploy/hls.rst @@ -71,12 +71,12 @@ We use two python scripts for this tutorial. fadd_dev = tvm.runtime.load_module("myadd.awsxclbin") fadd.import_module(fadd_dev) - ctx = tvm.context(tgt, 0) + dev = tvm.device(tgt, 0) n = 1024 - a = tvm.nd.array(np.random.uniform(size=n).astype("float32"), ctx) - b = tvm.nd.array(np.random.uniform(size=n).astype("float32"), ctx) - c = tvm.nd.array(np.zeros(n, dtype="float32"), ctx) + a = tvm.nd.array(np.random.uniform(size=n).astype("float32"), dev) + b = tvm.nd.array(np.random.uniform(size=n).astype("float32"), dev) + c = tvm.nd.array(np.zeros(n, dtype="float32"), dev) fadd(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) diff --git a/docs/deploy/tensorrt.rst b/docs/deploy/tensorrt.rst index 27f11e9b53775..0732a32c01bf8 100644 --- a/docs/deploy/tensorrt.rst +++ b/docs/deploy/tensorrt.rst @@ -124,9 +124,9 @@ have to be built. .. code:: python - ctx = tvm.gpu(0) + dev = tvm.gpu(0) loaded_lib = tvm.runtime.load_module('compiled.so') - gen_module = tvm.contrib.graph_runtime.GraphModule(loaded_lib['default'](ctx)) + gen_module = tvm.contrib.graph_runtime.GraphModule(loaded_lib['default'](dev)) input_data = np.random.uniform(0, 1, input_shape).astype(dtype) gen_module.run(data=input_data) diff --git a/docs/deploy/vitis_ai.rst b/docs/deploy/vitis_ai.rst index 7de8f58ce54fe..fc887dae968c1 100755 --- a/docs/deploy/vitis_ai.rst +++ b/docs/deploy/vitis_ai.rst @@ -696,7 +696,7 @@ as root (execute ``su`` in terminal to log into root). import tvm from tvm.contrib import graph_runtime - ctx = tvm.cpu() + dev = tvm.cpu() # input_name = ... # input_data = ... @@ -704,6 +704,6 @@ as root (execute ``su`` in terminal to log into root). 
# load the module into memory lib = tvm.runtime.load_module("tvm_dpu_arm.so") - module = graph_runtime.GraphModule(lib["default"](tvm.cpu())) + module = graph_runtime.GraphModule(lib["default"](dev)) module.set_input(input_name, input_data) module.run() diff --git a/docs/dev/codebase_walkthrough.rst b/docs/dev/codebase_walkthrough.rst index 0a21bb8909e72..90c0670e402f7 100644 --- a/docs/dev/codebase_walkthrough.rst +++ b/docs/dev/codebase_walkthrough.rst @@ -164,10 +164,10 @@ The returned module, which can be thought of as a combination of a compiled func :: - ctx = tvm.context(target, 0) - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + dev = tvm.device(target, 0) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) fadd(a, b, c) output = c.asnumpy() diff --git a/docs/dev/debugger.rst b/docs/dev/debugger.rst index 4c49e926a8dff..509cfd306a4af 100644 --- a/docs/dev/debugger.rst +++ b/docs/dev/debugger.rst @@ -140,7 +140,7 @@ How to use Debugger? :: from tvm.contrib.debugger import debug_runtime as graph_runtime - m = graph_runtime.create(graph, lib, ctx, dump_root="/tmp/tvmdbg") + m = graph_runtime.create(graph, lib, dev, dump_root="/tmp/tvmdbg") # set inputs m.set_input('data', tvm.nd.array(data.astype(dtype))) m.set_input(**params) diff --git a/docs/dev/index.rst b/docs/dev/index.rst index a098df12f1c1e..7ceed646087f4 100644 --- a/docs/dev/index.rst +++ b/docs/dev/index.rst @@ -144,7 +144,7 @@ The main goal of TVM's runtime is to provide a minimal API for loading and execu import tvm # Example runtime execution program in python, with type annotated mod: tvm.runtime.Module = tvm.runtime.load_module("compiled_artifact.so") - arr: tvm.runtime.NDArray = tvm.nd.array([1, 2, 3], ctx=tvm.gpu(0)) + arr: tvm.runtime.NDArray = tvm.nd.array([1, 2, 3], device=tvm.gpu(0)) fun: tvm.runtime.PackedFunc = mod["addone"] fun(a) print(a.asnumpy()) diff --git a/docs/dev/relay_bring_your_own_codegen.rst b/docs/dev/relay_bring_your_own_codegen.rst index 3fcd3365c82f1..b9f2337de2d41 100644 --- a/docs/dev/relay_bring_your_own_codegen.rst +++ b/docs/dev/relay_bring_your_own_codegen.rst @@ -757,10 +757,10 @@ Then, we implement ``ParseJson`` to parse a subgraph in ExampleJSON format and c entry.output = id; graph_[curr_subgraph].push_back(entry); // Note 2 } - DLContext ctx; - ctx.device_type = static_cast(1); - ctx.device_id = 0; - data_entry_[id] = NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, ctx); // Note 3 + DLDevice dev; + dev.device_type = static_cast(1); + dev.device_id = 0; + data_entry_[id] = NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, dev); // Note 3 } } diff --git a/golang/sample/simple.go b/golang/sample/simple.go index f9b29e9a14925..7bb503db45981 100644 --- a/golang/sample/simple.go +++ b/golang/sample/simple.go @@ -49,7 +49,7 @@ func main() { // Allocate Array for inputs and outputs. - // Allocation by explicit type and context. + // Allocation by explicit type and device. 
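The deploy-doc hunks above (arm_compute_lib, bnns, tensorrt, vitis_ai, and the codebase walkthrough) all rewrite the same load-and-run sequence. Pieced together, the post-rename flow looks like the sketch below; the artifact name compiled.so, the input name data, and the input shape are placeholders rather than anything these docs pin down. The Go binding hunks that follow mirror the same rename (Context becomes Device, GetCtx becomes GetDevice).

    import numpy as np
    import tvm
    from tvm.contrib import graph_runtime

    dev = tvm.cpu(0)   # was: ctx = tvm.cpu(0)
    loaded_lib = tvm.runtime.load_module("compiled.so")
    gen_module = graph_runtime.GraphModule(loaded_lib["default"](dev))

    data = np.random.uniform(0, 1, (1, 3, 224, 224)).astype("float32")
    gen_module.set_input("data", data)
    gen_module.run()
    out = gen_module.get_output(0).asnumpy()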
tshapeIn := []int64{4} inX, _ := gotvm.Empty(tshapeIn, "float32", gotvm.CPU(0)) diff --git a/golang/src/array_test.go b/golang/src/array_test.go index 0dbc81412a362..a2636a8b0f20d 100644 --- a/golang/src/array_test.go +++ b/golang/src/array_test.go @@ -113,8 +113,8 @@ func TestArrayShape(t *testing.T) { } } -// Create an array and check created Context. -func TestArrayCtx(t *testing.T) { +// Create an array and check created Device. +func TestArrayDevice(t *testing.T) { // TODO: Could some test cases for other targets arr, err := Empty([]int64{4}, CPU(0)) if err != nil { @@ -122,13 +122,13 @@ func TestArrayCtx(t *testing.T) { return } - ctx := arr.GetCtx() - if ctx.DeviceType != KDLCPU { - t.Errorf("Ctx DeviceType expected: %v Got :%v\n", KDLCPU, ctx.DeviceType) + dev := arr.GetDevice() + if dev.DeviceType != KDLCPU { + t.Errorf("Dev DeviceType expected: %v Got :%v\n", KDLCPU, dev.DeviceType) return } - if ctx.DeviceID != 0 { - t.Errorf("Ctx DeviceID expected: %v Got :%v\n", KDLCPU, ctx.DeviceID) + if dev.DeviceID != 0 { + t.Errorf("Dev DeviceID expected: %v Got :%v\n", KDLCPU, dev.DeviceID) return } @@ -138,13 +138,13 @@ func TestArrayCtx(t *testing.T) { return } - ctx = arr.GetCtx() - if ctx.DeviceType != KDLCPU { - t.Errorf("Ctx DeviceType expected: %v Got :%v\n", KDLCPU, ctx.DeviceType) + dev = arr.GetDevice() + if dev.DeviceType != KDLCPU { + t.Errorf("Dev DeviceType expected: %v Got :%v\n", KDLCPU, dev.DeviceType) return } - if ctx.DeviceID != 2 { - t.Errorf("Ctx DeviceID expected: %v Got :%v\n", KDLCPU, ctx.DeviceID) + if dev.DeviceID != 2 { + t.Errorf("Dev DeviceID expected: %v Got :%v\n", KDLCPU, dev.DeviceID) return } } diff --git a/golang/src/context.go b/golang/src/device.go similarity index 58% rename from golang/src/context.go rename to golang/src/device.go index cc0acbc88876f..6569e44bf1ef9 100644 --- a/golang/src/context.go +++ b/golang/src/device.go @@ -18,8 +18,8 @@ */ /*! - * \brief gotvm package source for TVMContext interface - * \file context.go + * \brief gotvm package source for Device interface + * \file device.go */ package gotvm @@ -50,58 +50,58 @@ var KOpenGL = int32(C.kOpenGL) // KExtDev is golang enum correspond to TVM device type kDLExtDev. 
var KExtDev = int32(C.kDLExtDev)

-// Context dtype corresponding to TVMContext aka DLContext
-type Context struct {
+// Device dtype corresponding to Device aka DLDevice
+type Device struct {
     DeviceType int32
     DeviceID   int32
 }

-// CPU returns the Context object for CPU target on given index
-func CPU(index int32) Context {
-    return Context{KDLCPU, index}
+// CPU returns the Device object for CPU target on given index
+func CPU(index int32) Device {
+    return Device{KDLCPU, index}
 }

-// GPU returns the Context object for GPU target on given index
-func GPU(index int32) Context {
-    return Context{KDLGPU, index}
+// GPU returns the Device object for GPU target on given index
+func GPU(index int32) Device {
+    return Device{KDLGPU, index}
 }

-// CPUPinned returns the Context object for CPUPinned target on given index
-func CPUPinned(index int32) Context {
-    return Context{KDLCPUPinned, index}
+// CPUPinned returns the Device object for CPUPinned target on given index
+func CPUPinned(index int32) Device {
+    return Device{KDLCPUPinned, index}
 }

-// OpenCL returns the Context object for OpenCL target on given index
-func OpenCL(index int32) Context {
-    return Context{KDLOpenCL, index}
+// OpenCL returns the Device object for OpenCL target on given index
+func OpenCL(index int32) Device {
+    return Device{KDLOpenCL, index}
 }

-// Metal returns the Context object for Metal target on given index
-func Metal(index int32) Context {
-    return Context{KDLMetal, index}
+// Metal returns the Device object for Metal target on given index
+func Metal(index int32) Device {
+    return Device{KDLMetal, index}
 }

-// VPI returns the Context object for VPI target on given index
-func VPI(index int32) Context {
-    return Context{KDLVPI, index}
+// VPI returns the Device object for VPI target on given index
+func VPI(index int32) Device {
+    return Device{KDLVPI, index}
 }

-// ROCM returns the Context object for ROCM target on given index
-func ROCM(index int32) Context {
-    return Context{KDLROCM, index}
+// ROCM returns the Device object for ROCM target on given index
+func ROCM(index int32) Device {
+    return Device{KDLROCM, index}
 }

-// SDAccel returns the Context object for SDAccel target on given index
-func SDAccel(index int32) Context {
-    return Context{KDLSDAccel, index}
+// SDAccel returns the Device object for SDAccel target on given index
+func SDAccel(index int32) Device {
+    return Device{KDLSDAccel, index}
 }

-// Vulkan returns the Context object for Vulkan target on given index
-func Vulkan(index int32) Context {
-    return Context{KDLVulkan, index}
+// Vulkan returns the Device object for Vulkan target on given index
+func Vulkan(index int32) Device {
+    return Device{KDLVulkan, index}
 }

-// OpenGL returns the Context object for OpenGL target on given index
-func OpenGL(index int32) Context {
-    return Context{KOpenGL, index}
+// OpenGL returns the Device object for OpenGL target on given index
+func OpenGL(index int32) Device {
+    return Device{KOpenGL, index}
 }
diff --git a/golang/src/ndarray.go b/golang/src/ndarray.go
index e7471347405c9..b1e71aef56bdd 100644
--- a/golang/src/ndarray.go
+++ b/golang/src/ndarray.go
@@ -243,10 +243,10 @@ func (parray Array) GetDType() (retVal string) {
     return
 }

-// GetCtx returns the number of dimentions in Array
-func (parray Array) GetCtx() (retVal Context) {
-    ret := ((*C.DLTensor)(unsafe.Pointer(parray))).ctx
-    retVal = *(*Context)(unsafe.Pointer(&ret))
+// GetDevice returns the Device of the Array
+func (parray Array) GetDevice() (retVal Device) {
+    ret := ((*C.DLTensor)(unsafe.Pointer(parray))).device
+    retVal = *(*Device)(unsafe.Pointer(&ret))
     return
 }

@@ -289,12 +289,12 @@ func nativeTVMArrayAlloc(shape []int64, ndim int32,
 //
 // `args[0]` is string for data type. Default value is 'float32'
 //
-// `args[1]` is Context. Default value is '{KDLCPU, 0}'
+// `args[1]` is Device. Default value is '{KDLCPU, 0}'
 //
 // returns pointer to Array on successful execution and error if any.
 func Empty(shape []int64, args ...interface{}) (parray *Array, err error) {
     typeName := "float32"
-    ctx := Context{KDLCPU, 0}
+    dev := Device{KDLCPU, 0}

     if len(shape) < 1 {
         err = fmt.Errorf("Invalid shape for Array creation: %v", len(shape))
@@ -305,8 +305,8 @@ func Empty(shape []int64, args ...interface{}) (parray *Array, err error) {
         switch val.(type) {
             case string:
                 typeName = args[i].(string)
-            case Context:
-                ctx = args[i].(Context)
+            case Device:
+                dev = args[i].(Device)
             default:
                 err = fmt.Errorf("Invalid Optional Argument Type: %T", val)
                 return
@@ -320,7 +320,7 @@ func Empty(shape []int64, args ...interface{}) (parray *Array, err error) {
     ndim := int32(len(shape))
     newArray, err := nativeTVMArrayAlloc(shape, ndim, int32(tvmType.code),
                                          int32(tvmType.bits), int32(tvmType.lanes),
-                                         ctx.DeviceType, ctx.DeviceID)
+                                         dev.DeviceType, dev.DeviceID)
     if err != nil {
         return
     }
diff --git a/golang/src/value.go b/golang/src/value.go
index a7db894374af9..450cf4866ab05 100644
--- a/golang/src/value.go
+++ b/golang/src/value.go
@@ -39,8 +39,8 @@ var KHandle = int32(C.kTVMOpaqueHandle)
 var KNull = int32(C.kTVMNullptr)
 // KTVMType is golang type code for TVM kTVMDataType.
 var KTVMType = int32(C.kTVMDataType)
-// KTVMContext is golang type code for TVM kTVMContext.
-var KTVMContext = int32(C.kTVMContext)
+// KDLDevice is golang type code for TVM kDLDevice.
+var KDLDevice = int32(C.kDLDevice)
 // KArrayHandle is golang type code for TVM kTVMDLTensorHandle.
 var KArrayHandle = int32(C.kTVMDLTensorHandle)
 // KObjectHandle is golang type code for TVM kTVMObjectHandle.
diff --git a/include/tvm/relay/analysis.h b/include/tvm/relay/analysis.h
index f88b049940996..264f2609a4b6b 100644
--- a/include/tvm/relay/analysis.h
+++ b/include/tvm/relay/analysis.h
@@ -272,12 +272,12 @@ TVM_DLL Map> GetCalibrateOutputMap(const IRModule& mod
  * \brief Analyze the device context of each IR node in a given relay module.
  *
  * \param mod The module for analysis.
- * \param default_context The default context used by unassigned IR nodes.
+ * \param default_device The default device used by unassigned IR nodes.
  *
- * \return The mapping between an IR node and its associated context.
+ * \return The mapping between an IR node and its associated device.
  */
-TVM_DLL std::unordered_map
-ContextAnalysis(const IRModule& mod, const TVMContext& default_context);
+TVM_DLL std::unordered_map
+ContextAnalysis(const IRModule& mod, const Device& default_device);

 } // namespace relay
 } // namespace tvm
diff --git a/include/tvm/relay/interpreter.h b/include/tvm/relay/interpreter.h
index 8a41ab74658fe..e3fd5ae77193b 100644
--- a/include/tvm/relay/interpreter.h
+++ b/include/tvm/relay/interpreter.h
@@ -58,11 +58,11 @@ namespace relay {
 * Relay's semantics, but a readable and clear one.
 *
 * \param mod The function module.
- * \param context The primary context that the interepreter runs on.
+ * \param device The primary device that the interpreter runs on.
 * \param target Compiler target flag to compile the functions on the context.
 * \return A function that takes in an expression and returns a value.
 */
-runtime::TypedPackedFunc CreateInterpreter(IRModule mod, DLContext context,
+runtime::TypedPackedFunc CreateInterpreter(IRModule mod, Device device,
                                            Target target);
 
 /*! \brief The container type of Closures used by the interpreter. */
diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h
index 59316a0bace05..44dba4d9c4632 100644
--- a/include/tvm/runtime/c_runtime_api.h
+++ b/include/tvm/runtime/c_runtime_api.h
@@ -108,7 +108,7 @@ typedef enum {
   kTVMOpaqueHandle = 3U,
   kTVMNullptr = 4U,
   kTVMDataType = 5U,
-  kTVMContext = 6U,
+  kDLDevice = 6U,
   kTVMDLTensorHandle = 7U,
   kTVMObjectHandle = 8U,
   kTVMModuleHandle = 9U,
@@ -129,11 +129,6 @@ typedef enum {
   kTVMExtEnd = 128U,
 } TVMArgTypeCode;
 
-/*!
- * \brief The Device information, abstract away common device types.
- */
-typedef DLContext TVMContext;
-
 /*! \brief the array handle */
 typedef DLTensor* TVMArrayHandle;
 
@@ -147,7 +142,7 @@ typedef union {
   void* v_handle;
   const char* v_str;
   DLDataType v_type;
-  TVMContext v_ctx;
+  DLDevice v_device;
 } TVMValue;
 
 /*!
@@ -382,8 +377,8 @@ TVM_DLL int TVMFuncRemoveGlobal(const char* name);
 * \param dtype_code The type code of the dtype
 * \param dtype_bits The number of bits of dtype
 * \param dtype_lanes The number of lanes in the dtype.
- * \param device_type The device type of context
- * \param device_id The device id of context.
+ * \param device_type The device type.
+ * \param device_id The device id.
 * \param out The output handle.
 * \return 0 when success, -1 when failure happens
 */
@@ -451,9 +446,9 @@ TVM_DLL void TVMDLManagedTensorCallDeleter(DLManagedTensor* dltensor);
 /*!
 * \brief Create a new runtime stream.
 *
- * \param device_type The device type of context
- * \param device_id The device id of context
- * \param out The new stream handle
+ * \param device_type The device type.
+ * \param device_id The device id.
+ * \param out The new stream handle.
 * \return 0 when success, -1 when failure happens
 */
 TVM_DLL int TVMStreamCreate(int device_type, int device_id, TVMStreamHandle* out);
 
@@ -461,9 +456,9 @@ TVM_DLL int TVMStreamCreate(int device_type, int device_id, TVMStreamHandle* out
 /*!
 * \brief Free a created stream handle.
 *
- * \param device_type The device type of context
- * \param device_id The device id of context
- * \param stream The stream to be freed
+ * \param device_type The device type.
+ * \param device_id The device id.
+ * \param stream The stream to be freed.
 * \return 0 when success, -1 when failure happens
 */
 TVM_DLL int TVMStreamFree(int device_type, int device_id, TVMStreamHandle stream);
 
@@ -474,8 +469,8 @@ TVM_DLL int TVMStreamFree(int device_type, int device_id, TVMStreamHandle stream
 * will use the set stream handle.
 * The specific type of stream is runtime device dependent.
 *
- * \param device_type The device type of context
- * \param device_id The device id of context.
+ * \param device_type The device type.
+ * \param device_id The device id.
 * \param handle The stream handle.
 * \return 0 when success, -1 when failure happens
 */
 TVM_DLL int TVMSetStream(int device_type, int device_id, TVMStreamHandle handle);
 
 /*!
 * \brief Wait until all computations on stream completes.
 *
- * \param device_type The device type of context
- * \param device_id The device id of context.
+ * \param device_type The device type.
+ * \param device_id The device id.
 * \param stream The stream to be synchronized.
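 *
 * For example, a host program might block until the default stream on a
 * device drains (a sketch; device id 0 and the NULL default stream are
 * illustrative):
 * \code{.c}
 * TVMSynchronize(kDLGPU, 0, NULL);
 * \endcode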
 * \return 0 when success, -1 when failure happens
 */
@@ -494,8 +489,8 @@ TVM_DLL int TVMSynchronize(int device_type, int device_id, TVMStreamHandle strea
 /*!
 * \brief Synchronize two streams of execution.
 *
- * \param device_type The device type of context
- * \param device_id The device id of context
+ * \param device_type The device type.
+ * \param device_id The device id.
 * \param src The source stream to synchronize.
 * \param dst The destination stream to synchronize.
 * \return 0 when success, -1 when failure happens
@@ -548,7 +543,7 @@ TVM_DLL int TVMByteArrayFree(TVMByteArray* arr);
 
 /*!
 * \brief Allocate a data space on device.
- * \param ctx The device context to perform operation.
+ * \param dev The device to perform operation.
 * \param nbytes The number of bytes in memory.
 * \param alignment The alignment of the memory.
 * \param type_hint The type of elements. Only needed by certain backends such
@@ -556,14 +551,14 @@ TVM_DLL int TVMByteArrayFree(TVMByteArray* arr);
 * \param out_data The allocated device pointer.
 * \return 0 when success, -1 when failure happens
 */
-TVM_DLL int TVMDeviceAllocDataSpace(DLContext ctx, size_t nbytes, size_t alignment,
+TVM_DLL int TVMDeviceAllocDataSpace(DLDevice dev, size_t nbytes, size_t alignment,
                                     DLDataType type_hint, void** out_data);
 
 /*!
 * \brief Allocate a data space on device with special memory scope.
 * \note The memory could use a special multi-dimensional memory layout.
 *       That is why we pass shape and dtype instead of raw number of bytes.
- * \param ctx The device context to perform operation.
+ * \param dev The device to perform operation.
 * \param ndim The number of dimension of the tensor.
 * \param shape The shape of the tensor.
 * \param dtype The type of elements.
@@ -572,17 +567,17 @@ TVM_DLL int TVMDeviceAllocDataSpace(DLContext ctx, size_t nbytes, size_t alignme
 * \param out_data The allocated device pointer.
 * \return 0 when success, -1 when failure happens
 */
-TVM_DLL int TVMDeviceAllocDataSpaceWithScope(DLContext ctx, int ndim, const int64_t* shape,
+TVM_DLL int TVMDeviceAllocDataSpaceWithScope(DLDevice dev, int ndim, const int64_t* shape,
                                              DLDataType dtype, const char* mem_scope,
                                              void** out_data);
 
 /*!
 * \brief Free a data space on device.
- * \param ctx The device context to perform operation.
+ * \param dev The device to perform operation.
 * \param ptr The data space.
 * \return 0 when success, -1 when failure happens
 */
-TVM_DLL int TVMDeviceFreeDataSpace(TVMContext ctx, void* ptr);
+TVM_DLL int TVMDeviceFreeDataSpace(DLDevice dev, void* ptr);
 
 /*!
 * \brief Copy data from one place to another.
diff --git a/include/tvm/runtime/crt/graph_runtime.h b/include/tvm/runtime/crt/graph_runtime.h
index 82d7ac4e7d182..02c7421d00e8b 100644
--- a/include/tvm/runtime/crt/graph_runtime.h
+++ b/include/tvm/runtime/crt/graph_runtime.h
@@ -62,12 +62,12 @@ typedef struct TVMGraphRuntime TVMGraphRuntime;
 *
 * \param sym_json JSON-encoded graph.
 * \param module_handle TVM Module that exposes the functions to call.
- * \param ctxs runtime execution context.
+ * \param devices runtime execution devices.
 * \param runtime Pointer which receives a pointer to the newly-created instance.
 * \return 0 if successful.
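 *
 * A hedged sketch of the call (sym_json and module_handle are assumed to
 * come from earlier loading steps not shown here):
 * \code{.c}
 * DLDevice dev = {kDLCPU, 0};
 * TVMGraphRuntime* runtime = NULL;
 * TVMGraphRuntime_Create(sym_json, module_handle, &dev, &runtime);
 * \endcode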
*/ int TVMGraphRuntime_Create(const char* sym_json, TVMModuleHandle module_handle, - const TVMContext* ctxs, TVMGraphRuntime** runtime); + const DLDevice* devices, TVMGraphRuntime** runtime); int TVMGraphRuntime_GetInputIndex(TVMGraphRuntime* runtime, const char* name); diff --git a/include/tvm/runtime/crt/memory.h b/include/tvm/runtime/crt/memory.h index 5b937fd7d4f46..c830116528e0a 100644 --- a/include/tvm/runtime/crt/memory.h +++ b/include/tvm/runtime/crt/memory.h @@ -42,11 +42,11 @@ struct MemoryManagerInterface { * \brief Allocate a chunk of memory. * \param interface Pointer to this structure. * \param num_bytes Number of bytes requested. - * \param ctx Execution context that will be used with the allocated memory. Must be {kDLCPU, 0}. + * \param dev Execution device that will be used with the allocated memory. Must be {kDLCPU, 0}. * \param out_ptr A pointer to which is written a pointer to the newly-allocated memory. * \return kTvmErrorNoError if successful; a descriptive error code otherwise. */ - tvm_crt_error_t (*Allocate)(MemoryManagerInterface* interface, size_t num_bytes, DLContext ctx, + tvm_crt_error_t (*Allocate)(MemoryManagerInterface* interface, size_t num_bytes, DLDevice dev, void** out_ptr); /*! @@ -54,10 +54,10 @@ struct MemoryManagerInterface { * * \param interface Pointer to this structure. * \param ptr A pointer returned from TVMPlatformMemoryAllocate which should be free'd. - * \param ctx Execution context passed to TVMPlatformMemoryAllocate. Fixed to {kDLCPU, 0}. + * \param dev Execution device passed to TVMPlatformMemoryAllocate. Fixed to {kDLCPU, 0}. * \return kTvmErrorNoError if successful; a descriptive error code otherwise. */ - tvm_crt_error_t (*Free)(MemoryManagerInterface* interface, void* ptr, DLContext ctx); + tvm_crt_error_t (*Free)(MemoryManagerInterface* interface, void* ptr, DLDevice dev); /*! \brief Used in testing; the number of allocated objects. */ int vleak_size; diff --git a/include/tvm/runtime/crt/platform.h b/include/tvm/runtime/crt/platform.h index d1226e388f733..c774aaeaa0db5 100644 --- a/include/tvm/runtime/crt/platform.h +++ b/include/tvm/runtime/crt/platform.h @@ -64,20 +64,20 @@ size_t TVMPlatformFormatMessage(char* out_buf, size_t out_buf_size_bytes, const * and the caller is not obligated to call TVMPlatformMemoryFree in order to avoid a memory leak. * * \param num_bytes Number of bytes requested. - * \param ctx Execution context that will be used with the allocated memory. Fixed to {kDLCPU, 0}. + * \param dev Execution device that will be used with the allocated memory. Fixed to {kDLCPU, 0}. * \param out_ptr A pointer to which is written a pointer to the newly-allocated memory. * \return kTvmErrorNoError if successful; a descriptive error code otherwise. */ -tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLContext ctx, void** out_ptr); +tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void** out_ptr); /*! * \brief Free memory used by TVM. * * \param ptr A pointer returned from TVMPlatformMemoryAllocate which should be free'd. - * \param ctx Execution context passed to TVMPlatformMemoryAllocate. Fixed to {kDLCPU, 0}. + * \param dev Execution device passed to TVMPlatformMemoryAllocate. Fixed to {kDLCPU, 0}. * \return kTvmErrorNoError if successful; a descriptive error code otherwise. */ -tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLContext ctx); +tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLDevice dev); /*! \brief Start a device timer. 
 *
diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h
index 1276663a2bc38..3caea1041f835 100644
--- a/include/tvm/runtime/device_api.h
+++ b/include/tvm/runtime/device_api.h
@@ -25,6 +25,7 @@
 #define TVM_RUNTIME_DEVICE_API_H_
 
 #include
+#include
 #include
 
 #include
@@ -67,46 +68,46 @@ class TVM_DLL DeviceAPI {
  /*! \brief virtual destructor */
  virtual ~DeviceAPI() {}
  /*!
-   * \brief Set the environment device id to ctx
-   * \param ctx The context to be set.
+   * \brief Set the environment device id to dev.
+   * \param dev The device to be set.
   */
-  virtual void SetDevice(TVMContext ctx) = 0;
+  virtual void SetDevice(Device dev) = 0;
  /*!
   * \brief Get attribute of specified device.
-   * \param ctx The device context
+   * \param dev The device
   * \param kind The result kind
   * \param rv The return value.
   * \sa DeviceAttrKind
   */
-  virtual void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) = 0;
+  virtual void GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) = 0;
  /*!
   * \brief Allocate a data space on device.
-   * \param ctx The device context to perform operation.
+   * \param dev The device to perform operation.
   * \param nbytes The number of bytes in memory.
   * \param alignment The alignment of the memory.
   * \param type_hint The type of elements. Only needed by certain backends such
   * as OpenGL, as nbytes & alignment are sufficient for most backends.
   * \return The allocated device pointer.
   */
-  virtual void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment,
+  virtual void* AllocDataSpace(Device dev, size_t nbytes, size_t alignment,
                                DLDataType type_hint) = 0;
  /*!
   * \brief Allocate a data space on device with memory scope support.
-   * \param ctx The device context to perform operation.
+   * \param dev The device to perform operation.
   * \param ndim The number of dimension of allocated tensor.
   * \param shape The shape of allocated tensor.
   * \param dtype The type of elements.
   * \param mem_scope The memory scope of allocated tensor.
   * \return The allocated device pointer.
   */
-  virtual void* AllocDataSpace(TVMContext ctx, int ndim, const int64_t* shape, DLDataType dtype,
+  virtual void* AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype,
                                Optional mem_scope = NullOpt);
  /*!
   * \brief Free a data space on device.
-   * \param ctx The device context to perform operation.
+   * \param dev The device to perform operation.
   * \param ptr The data space.
   */
-  virtual void FreeDataSpace(TVMContext ctx, void* ptr) = 0;
+  virtual void FreeDataSpace(Device dev, void* ptr) = 0;
  /*!
   * \brief copy data from one place to another
   * \note This API is designed to support special memory with shape dependent layout.
@@ -119,44 +120,43 @@ class TVM_DLL DeviceAPI {
  /*!
   * \brief Create a new stream of execution.
   *
-   * \param ctx The context of allocation.
+   * \param dev The device of allocation.
   */
-  virtual TVMStreamHandle CreateStream(TVMContext ctx);
+  virtual TVMStreamHandle CreateStream(Device dev);
  /*!
   * \brief Free a stream of execution
   *
-   * \param ctx The context of the stream
+   * \param dev The device of the stream
   * \param stream The pointer to be freed.
   */
-  virtual void FreeStream(TVMContext ctx, TVMStreamHandle stream);
+  virtual void FreeStream(Device dev, TVMStreamHandle stream);
  /*!
   * \brief Synchronize the stream
-   * \param ctx The context to perform operation.
+   * \param dev The device to perform operation.
   * \param stream The stream to be sync.
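   *
   * For example, a caller can block on an outstanding stream (a sketch; dev
   * and stream are assumed to be obtained elsewhere):
   * \code{.cpp}
   * DeviceAPI::Get(dev)->StreamSync(dev, stream);
   * \endcode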
   */
-  virtual void StreamSync(TVMContext ctx, TVMStreamHandle stream) = 0;
+  virtual void StreamSync(Device dev, TVMStreamHandle stream) = 0;
  /*!
   * \brief Set the stream
-   * \param ctx The context to set stream.
+   * \param dev The device to set stream.
   * \param stream The stream to be set.
   */
-  virtual void SetStream(TVMContext ctx, TVMStreamHandle stream) {}
+  virtual void SetStream(Device dev, TVMStreamHandle stream) {}
  /*!
   * \brief Synchronize 2 streams of execution.
   *
   * An event is created in event_src stream that the second then
   * stream waits on. Neither event_src nor event_dst need to be of
-   * the same device ID as the context, but they must be of the same
+   * the same device ID as the device, but they must be of the same
   * device type.
   *
-   * \param ctx The context of the streams.
+   * \param dev The device of the streams.
   * \param event_src The source stream to synchronize.
   * \param event_dst The destination stream to synchronize.
   */
-  virtual void SyncStreamFromTo(TVMContext ctx, TVMStreamHandle event_src,
-                                TVMStreamHandle event_dst);
+  virtual void SyncStreamFromTo(Device dev, TVMStreamHandle event_src, TVMStreamHandle event_dst);
  /*!
   * \brief Allocate temporal workspace for backend execution.
   *
   * - Repetitive pattern of same allocations over different runs.
   * - Workspace should not overlap between different threads(i.e. be threadlocal)
   *
-   * \param ctx The context of allocation.
+   * \param dev The device of allocation.
   * \param nbytes The size to be allocated.
   * \param type_hint The type of elements. Only needed by certain backends such
   * as OpenGL, as nbytes is sufficient for most backends.
   */
-  virtual void* AllocWorkspace(TVMContext ctx, size_t nbytes, DLDataType type_hint = {});
+  virtual void* AllocWorkspace(Device dev, size_t nbytes, DLDataType type_hint = {});
  /*!
   * \brief Free temporal workspace in backend execution.
   *
-   * \param ctx The context of allocation.
+   * \param dev The device of allocation.
   * \param ptr The pointer to be freed.
   */
-  virtual void FreeWorkspace(TVMContext ctx, void* ptr);
+  virtual void FreeWorkspace(Device dev, void* ptr);
 
  /*!
-   * \brief Get device API based on context.
-   * \param ctx The context
+   * \brief Get device API based on device.
+   * \param dev The device
   * \param allow_missing Whether allow missing
   * \return The corresponding device API.
   */
-  static DeviceAPI* Get(TVMContext ctx, bool allow_missing = false);
+  static DeviceAPI* Get(Device dev, bool allow_missing = false);
 
  /*!
-   * \brief Whether a certian device type requires set device context
+   * \brief Whether a certain device type requires SetDevice
   * before launching the kernel function.
   * \param device_type The device type.
   */
-  static bool NeedSetDeviceContext(int device_type) {
+  static bool NeedSetDevice(int device_type) {
    return device_type != kDLCPU && device_type != kDLMicroDev;
  }
 
@@ -207,14 +207,14 @@ class TVM_DLL DeviceAPI {
   * \param to The target array.
   * \param to_offset The byte offset in the to.
   * \param num_bytes The size of the memory in bytes
-   * \param ctx_from The source context
-   * \param ctx_to The target context
+   * \param dev_from The source device
+   * \param dev_to The target device
   * \param type_hint The type of elements, only needed by certain backends.
   * can be useful for cross device endian conversion.
   * \param stream Optional stream object.
   */
  virtual void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset,
-                              size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to,
+                              size_t num_bytes, Device dev_from, Device dev_to,
                              DLDataType type_hint, TVMStreamHandle stream);
 };
 
@@ -263,56 +263,57 @@ inline const char* DeviceName(int type) {
 }
 
 /*!
- * \brief Return true if a TVMContext is owned by an RPC session.
+ * \brief Return true if a Device is owned by an RPC session.
 */
-inline bool IsRPCSessionContext(TVMContext ctx) { return (ctx.device_type / kRPCSessMask) > 0; }
+inline bool IsRPCSessionDevice(Device dev) { return (dev.device_type / kRPCSessMask) > 0; }
 
 /*!
- * \brief Return the RPCSessTable index of the RPC Session that owns this context.
+ * \brief Return the RPCSessTable index of the RPC Session that owns this device.
 * \return the table index.
 */
-inline int GetRPCSessionIndex(TVMContext ctx) {
-  ICHECK(IsRPCSessionContext(ctx)) << "GetRPCSessionIndex: ctx has no RPC session";
-  return ctx.device_type / kRPCSessMask - 1;
+inline int GetRPCSessionIndex(Device dev) {
+  ICHECK(IsRPCSessionDevice(dev)) << "GetRPCSessionIndex: dev has no RPC session";
+  return dev.device_type / kRPCSessMask - 1;
 }
 
 /*!
- * \brief Remove the RPC session mask from a TVMContext.
- * RPC clients typically do this when encoding a TVMContext for transmission to an RPC remote.
- * On the wire, RPCContext are expected to be valid on the server without interpretation.
- * \param ctx A TVMContext with non-zero RPC Session mask, valid on the RPC client.
- * \return A TVMContext without any RPC Session mask, valid on the RPC server.
+ * \brief Remove the RPC session mask from a Device.
+ * RPC clients typically do this when encoding a Device for transmission to an RPC remote.
+ * On the wire, Devices are expected to be valid on the server without interpretation.
+ * \param dev A Device with non-zero RPC Session mask, valid on the RPC client.
+ * \return A Device without any RPC Session mask, valid on the RPC server.
 */
-inline TVMContext RemoveRPCSessionMask(TVMContext ctx) {
-  ctx.device_type = static_cast(ctx.device_type % kRPCSessMask);
-  return ctx;
+inline Device RemoveRPCSessionMask(Device dev) {
+  dev.device_type = static_cast(dev.device_type % kRPCSessMask);
+  return dev;
 }
 
-inline std::ostream& operator<<(std::ostream& os, DLContext ctx);
+inline std::ostream& operator<<(std::ostream& os, DLDevice dev);
 
 /*!
- * \brief Add a RPC session mask to a TVMContext.
- * RPC clients typically do this when decoding a TVMContext received from a RPC remote.
- * \param ctx A TVMContext without any RPC Session mask, valid on the RPC server.
+ * \brief Add a RPC session mask to a Device.
+ * RPC clients typically do this when decoding a Device received from a RPC remote.
+ * \param dev A Device without any RPC Session mask, valid on the RPC server.
 * \param session_table_index Numeric index of the RPC session in the session table.
- * \return A TVMContext with RPC session mask added, valid on the RPC client.
+ * \return A Device with RPC session mask added, valid on the RPC client.
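 *
 * Masking and unmasking are inverses; a sketch (session table index 0 is
 * an arbitrary example):
 * \code{.cpp}
 * Device local{kDLCPU, 0};
 * Device masked = AddRPCSessionMask(local, 0);
 * ICHECK(IsRPCSessionDevice(masked));
 * ICHECK(!IsRPCSessionDevice(RemoveRPCSessionMask(masked)));
 * \endcode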
*/ -inline TVMContext AddRPCSessionMask(TVMContext ctx, int session_table_index) { - CHECK(!IsRPCSessionContext(ctx)) - << "AddRPCSessionMask: ctx already non-zero RPCSessionIndex: " << ctx; - ctx.device_type = - static_cast(ctx.device_type | (kRPCSessMask * (session_table_index + 1))); - return ctx; +inline Device AddRPCSessionMask(Device dev, int session_table_index) { + CHECK(!IsRPCSessionDevice(dev)) << "AddRPCSessionMask: dev already non-zero RPCSessionIndex: " + << dev; + dev.device_type = + static_cast(dev.device_type | (kRPCSessMask * (session_table_index + 1))); + return dev; } -inline std::ostream& operator<<(std::ostream& os, DLContext ctx) { // NOLINT(*) - if (IsRPCSessionContext(ctx)) { - os << "remote[" << GetRPCSessionIndex(ctx) << "]-"; - ctx = RemoveRPCSessionMask(ctx); +inline std::ostream& operator<<(std::ostream& os, DLDevice dev) { // NOLINT(*) + if (IsRPCSessionDevice(dev)) { + os << "remote[" << GetRPCSessionIndex(dev) << "]-"; + dev = RemoveRPCSessionMask(dev); } - os << runtime::DeviceName(static_cast(ctx.device_type)) << "(" << ctx.device_id << ")"; + os << runtime::DeviceName(static_cast(dev.device_type)) << "(" << dev.device_id << ")"; return os; } } // namespace runtime } // namespace tvm + #endif // TVM_RUNTIME_DEVICE_API_H_ diff --git a/include/tvm/runtime/ndarray.h b/include/tvm/runtime/ndarray.h index a884b5c6838f5..ada9b74503bcd 100644 --- a/include/tvm/runtime/ndarray.h +++ b/include/tvm/runtime/ndarray.h @@ -31,12 +31,15 @@ #include #include +#include #include #include namespace tvm { namespace runtime { +typedef DLDevice Device; + /*! * \brief Managed NDArray. * The array is backed by reference counted blocks. @@ -101,11 +104,11 @@ class NDArray : public ObjectRef { */ TVM_DLL void CopyToBytes(void* data, size_t nbytes) const; /*! - * \brief Copy the data to another context. - * \param ctx The target context. - * \return The array under another context. + * \brief Copy the data to another device. + * \param dev The target device. + * \return The array under another device. */ - inline NDArray CopyTo(const DLContext& ctx) const; + inline NDArray CopyTo(const Device& dev) const; /*! * \brief Load NDArray from stream * \param stream The input data stream @@ -134,11 +137,11 @@ class NDArray : public ObjectRef { * \brief Create an empty NDArray. * \param shape The shape of the new array. * \param dtype The data type of the new array. - * \param ctx The context of the array. + * \param dev The device of the array. * \param mem_scope The memory scope of the array. * \return The created Array */ - TVM_DLL static NDArray Empty(std::vector shape, DLDataType dtype, DLContext ctx, + TVM_DLL static NDArray Empty(std::vector shape, DLDataType dtype, Device dev, Optional mem_scope = NullOpt); /*! * \brief Create a NDArray backed by a dlpack tensor. @@ -256,7 +259,7 @@ class NDArray::Container : public Object, public NDArray::ContainerBase { dl_tensor.byte_offset = 0; } - Container(void* data, std::vector shape, DLDataType dtype, DLContext ctx) { + Container(void* data, std::vector shape, DLDataType dtype, Device dev) { // Initialize the type index. type_index_ = Container::RuntimeTypeIndex(); dl_tensor.data = data; @@ -266,7 +269,7 @@ class NDArray::Container : public Object, public NDArray::ContainerBase { dl_tensor.dtype = dtype; dl_tensor.strides = nullptr; dl_tensor.byte_offset = 0; - dl_tensor.ctx = ctx; + dl_tensor.device = dev; } /*! * \brief Set the deleter field. 
@@ -349,11 +352,11 @@ inline void NDArray::CopyTo(const NDArray& other) const { CopyFromTo(&(get_mutable()->dl_tensor), &(other.get_mutable()->dl_tensor)); } -inline NDArray NDArray::CopyTo(const DLContext& ctx) const { +inline NDArray NDArray::CopyTo(const Device& dev) const { ICHECK(data_ != nullptr); const DLTensor* dptr = operator->(); NDArray ret = - Empty(std::vector(dptr->shape, dptr->shape + dptr->ndim), dptr->dtype, ctx); + Empty(std::vector(dptr->shape, dptr->shape + dptr->ndim), dptr->dtype, dev); this->CopyTo(ret); return ret; } @@ -401,12 +404,12 @@ inline bool SaveDLTensor(dmlc::Stream* strm, const DLTensor* tensor) { // This is used to prevent case when another user loads the parameters // back on machine that do not have GPU or related context. // - // We can always do array.CopyTo(target_ctx) to get a corresponding + // We can always do array.CopyTo(target_dev) to get a corresponding // array in the target context. - DLContext cpu_ctx; - cpu_ctx.device_type = kDLCPU; - cpu_ctx.device_id = 0; - strm->Write(cpu_ctx); + Device cpu_dev; + cpu_dev.device_type = kDLCPU; + cpu_dev.device_id = 0; + strm->Write(cpu_dev); strm->Write(tensor->ndim); strm->Write(tensor->dtype); int ndim = tensor->ndim; @@ -419,8 +422,8 @@ inline bool SaveDLTensor(dmlc::Stream* strm, const DLTensor* tensor) { int64_t data_byte_size = type_bytes * num_elems; strm->Write(data_byte_size); - if (DMLC_IO_NO_ENDIAN_SWAP && tensor->ctx.device_type == kDLCPU && tensor->strides == nullptr && - tensor->byte_offset == 0) { + if (DMLC_IO_NO_ENDIAN_SWAP && tensor->device.device_type == kDLCPU && + tensor->strides == nullptr && tensor->byte_offset == 0) { // quick path strm->Write(tensor->data, data_byte_size); } else { @@ -444,18 +447,18 @@ inline bool NDArray::Load(dmlc::Stream* strm) { ICHECK(strm->Read(&header)) << "Invalid DLTensor file format"; ICHECK(strm->Read(&reserved)) << "Invalid DLTensor file format"; ICHECK(header == kTVMNDArrayMagic) << "Invalid DLTensor file format"; - DLContext ctx; + Device dev; int ndim; DLDataType dtype; - ICHECK(strm->Read(&ctx)) << "Invalid DLTensor file format"; + ICHECK(strm->Read(&dev)) << "Invalid DLTensor file format"; ICHECK(strm->Read(&ndim)) << "Invalid DLTensor file format"; ICHECK(strm->Read(&dtype)) << "Invalid DLTensor file format"; - ICHECK_EQ(ctx.device_type, kDLCPU) << "Invalid DLTensor context: can only save as CPU tensor"; + ICHECK_EQ(dev.device_type, kDLCPU) << "Invalid DLTensor device: can only save as CPU tensor"; std::vector shape(ndim); if (ndim != 0) { ICHECK(strm->ReadArray(&shape[0], ndim)) << "Invalid DLTensor file format"; } - NDArray ret = NDArray::Empty(shape, dtype, ctx); + NDArray ret = NDArray::Empty(shape, dtype, dev); int64_t num_elems = 1; int elem_bytes = (ret->dtype.bits + 7) / 8; for (int i = 0; i < ret->ndim; ++i) { @@ -477,5 +480,26 @@ inline bool NDArray::Load(dmlc::Stream* strm) { } } // namespace runtime + +// alias Device +using tvm::runtime::Device; + } // namespace tvm + +namespace std { +template <> +struct hash { + std::size_t operator()(const tvm::runtime::Device& dev) const { + return ((dev.device_id << 8) | dev.device_type); + } +}; + +template <> +struct equal_to { + bool operator()(const tvm::runtime::Device& lhs, const tvm::runtime::Device& rhs) const { + return (lhs.device_type == rhs.device_type && lhs.device_id == rhs.device_id); + } +}; +} // namespace std + #endif // TVM_RUNTIME_NDARRAY_H_ diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h index 7113863a6fb35..65f7e76162742 100644 --- 
a/include/tvm/runtime/packed_func.h +++ b/include/tvm/runtime/packed_func.h @@ -547,9 +547,9 @@ class TVMPODValue_ { TVM_CHECK_TYPE_CODE(type_code_, kTVMModuleHandle); return Module(ObjectPtr(static_cast(value_.v_handle))); } - operator TVMContext() const { - TVM_CHECK_TYPE_CODE(type_code_, kTVMContext); - return value_.v_ctx; + operator Device() const { + TVM_CHECK_TYPE_CODE(type_code_, kDLDevice); + return value_.v_device; } int type_code() const { return type_code_; } /*! @@ -606,7 +606,7 @@ class TVMArgValue : public TVMPODValue_ { using TVMPODValue_::operator void*; using TVMPODValue_::operator DLTensor*; using TVMPODValue_::operator NDArray; - using TVMPODValue_::operator TVMContext; + using TVMPODValue_::operator Device; using TVMPODValue_::operator Module; using TVMPODValue_::AsObjectRef; using TVMPODValue_::IsObjectRef; @@ -666,7 +666,7 @@ class TVMMovableArgValue_ : public TVMPODValue_ { using TVMPODValue_::operator void*; using TVMPODValue_::operator DLTensor*; using TVMPODValue_::operator NDArray; - using TVMPODValue_::operator TVMContext; + using TVMPODValue_::operator Device; using TVMPODValue_::operator Module; // reuse conversion rule from ArgValue. operator std::string() const { return AsArgValue().operator std::string(); } @@ -760,7 +760,7 @@ class TVMRetValue : public TVMPODValue_ { using TVMPODValue_::operator bool; using TVMPODValue_::operator void*; using TVMPODValue_::operator DLTensor*; - using TVMPODValue_::operator TVMContext; + using TVMPODValue_::operator Device; using TVMPODValue_::operator NDArray; using TVMPODValue_::operator Module; using TVMPODValue_::AsObjectRef; @@ -827,9 +827,9 @@ class TVMRetValue : public TVMPODValue_ { value_.v_int64 = value; return *this; } - TVMRetValue& operator=(TVMContext value) { - this->SwitchToPOD(kTVMContext); - value_.v_ctx = value; + TVMRetValue& operator=(DLDevice value) { + this->SwitchToPOD(kDLDevice); + value_.v_device = value; return *this; } TVMRetValue& operator=(DLDataType t) { @@ -1180,8 +1180,8 @@ inline const char* ArgTypeCode2Str(int type_code) { return "ArrayHandle"; case kTVMDataType: return "DLDataType"; - case kTVMContext: - return "TVMContext"; + case kDLDevice: + return "DLDevice"; case kTVMPackedFuncHandle: return "FunctionHandle"; case kTVMModuleHandle: @@ -1295,9 +1295,9 @@ class TVMArgsSetter { values_[i].v_handle = value; type_codes_[i] = kTVMDLTensorHandle; } - TVM_ALWAYS_INLINE void operator()(size_t i, TVMContext value) const { - values_[i].v_ctx = value; - type_codes_[i] = kTVMContext; + TVM_ALWAYS_INLINE void operator()(size_t i, Device value) const { + values_[i].v_device = value; + type_codes_[i] = kDLDevice; } TVM_ALWAYS_INLINE void operator()(size_t i, DLDataType value) const { values_[i].v_type = value; diff --git a/include/tvm/runtime/profiling.h b/include/tvm/runtime/profiling.h index 45b60ea18acc6..fa16e7820b352 100644 --- a/include/tvm/runtime/profiling.h +++ b/include/tvm/runtime/profiling.h @@ -80,7 +80,7 @@ class Timer : public ObjectRef { public: /*! * \brief Get a device specific timer. - * \param ctx The device context to time. + * \param dev The device to time. * \return A `Timer` that has already been started. * * Use this function to time runtime of arbitrary regions of code on a specific @@ -95,7 +95,7 @@ class Timer : public ObjectRef { * * Example usage: * \code{.cpp} - * Timer t = Timer::Start(TVMContext::cpu()); + * Timer t = Timer::Start(Device::cpu()); * my_long_running_function(); * t->Stop(); * ... 
// some more computation @@ -104,7 +104,7 @@ class Timer : public ObjectRef { * * To add a new device-specific timer, register a new function * "profiler.timer.my_device" (where `my_device` is the `DeviceName` of your - * device). This function should accept a `TVMContext` and return a new `Timer` + * device). This function should accept a `Device` and return a new `Timer` * that has already been started. * * For example, this is how the CPU timer is implemented: @@ -125,24 +125,24 @@ class Timer : public ObjectRef { * }; * TVM_REGISTER_OBJECT_TYPE(CPUTimerNode); * - * TVM_REGISTER_GLOBAL("profiling.timer.cpu").set_body_typed([](TVMContext ctx) { + * TVM_REGISTER_GLOBAL("profiling.timer.cpu").set_body_typed([](Device dev) { * return Timer(make_object()); * }); * \endcode */ - static TVM_DLL Timer Start(TVMContext ctx); + static TVM_DLL Timer Start(Device dev); TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(Timer, ObjectRef, TimerNode); }; /*! - * \brief Default timer if one does not exist for the context. - * \param ctx The context to time on. + * \brief Default timer if one does not exist for the device. + * \param dev The device to time on. * * Note that this timer performs synchronization between the device and CPU, * which can lead to overhead in the reported results. */ -Timer DefaultTimer(TVMContext ctx); +Timer DefaultTimer(Device dev); } // namespace runtime } // namespace tvm diff --git a/include/tvm/runtime/serializer.h b/include/tvm/runtime/serializer.h index f40c87ee07ec7..b35cad368832f 100644 --- a/include/tvm/runtime/serializer.h +++ b/include/tvm/runtime/serializer.h @@ -20,7 +20,7 @@ /*! * \file tvm/runtime/serializer.h * \brief Serializer extension to support TVM data types - * Include this file to enable serialization of DLDataType, DLContext + * Include this file to enable serialization of DLDataType, DLDevice */ #ifndef TVM_RUNTIME_SERIALIZER_H_ #define TVM_RUNTIME_SERIALIZER_H_ @@ -49,17 +49,17 @@ struct Handler { }; template <> -struct Handler { - inline static void Write(Stream* strm, const DLContext& ctx) { - int32_t device_type = static_cast(ctx.device_type); +struct Handler { + inline static void Write(Stream* strm, const DLDevice& dev) { + int32_t device_type = static_cast(dev.device_type); Handler::Write(strm, device_type); - Handler::Write(strm, ctx.device_id); + Handler::Write(strm, dev.device_id); } - inline static bool Read(Stream* strm, DLContext* ctx) { + inline static bool Read(Stream* strm, DLDevice* dev) { int32_t device_type = 0; if (!Handler::Read(strm, &(device_type))) return false; - ctx->device_type = static_cast(device_type); - if (!Handler::Read(strm, &(ctx->device_id))) return false; + dev->device_type = static_cast(device_type); + if (!Handler::Read(strm, &(dev->device_id))) return false; return true; } }; diff --git a/include/tvm/runtime/vm/memory_manager.h b/include/tvm/runtime/vm/memory_manager.h index c983cb066ec90..fb2354bca4ec4 100644 --- a/include/tvm/runtime/vm/memory_manager.h +++ b/include/tvm/runtime/vm/memory_manager.h @@ -34,23 +34,6 @@ #include #include -namespace std { -template <> -struct hash { - std::size_t operator()(const TVMContext& ctx) const { - return ((ctx.device_id << 8) | ctx.device_type); - } -}; - -template <> -struct equal_to { - bool operator()(const TVMContext& lhs, const TVMContext& rhs) const { - return (lhs.device_type == rhs.device_type && lhs.device_id == rhs.device_id); - } -}; - -} // namespace std - namespace tvm { namespace runtime { namespace vm { @@ -61,7 +44,7 @@ struct Buffer { /*! 
\brief The size of the block. */ size_t size{0}; /*! \brief The context of the allocated buffers. */ - TVMContext ctx; + Device device; }; enum AllocatorType { @@ -76,10 +59,10 @@ class Allocator { /*! \brief Allocate an empty NDArray using from the allocator. * \param shape The shape of the NDArray. * \param dtype The datatype of the NDArray. - * \param ctx The context where the array is allocated. + * \param dev The device where the array is allocated. * \return The empty NDArray. */ - NDArray Empty(std::vector shape, DLDataType dtype, DLContext ctx); + NDArray Empty(std::vector shape, DLDataType dtype, Device dev); /*! \brief Return the allocator type. */ inline AllocatorType type() const { return type_; } /*! \brief Allocate a buffer given a size, alignment and type. @@ -107,24 +90,24 @@ class MemoryManager { static MemoryManager* Global(); /*! * \brief Get or create an allocator given the context and allocator type. - * \param ctx The TVM context + * \param dev The TVM device * \param type The allocator type * \return The memory allocator. */ - static Allocator* GetOrCreateAllocator(TVMContext ctx, AllocatorType type); + static Allocator* GetOrCreateAllocator(Device dev, AllocatorType type); /*! * \brief Get an allocator given the context. - * \param ctx The TVM context + * \param dev The TVM device * \return The memory allocator. */ - static Allocator* GetAllocator(TVMContext ctx); + static Allocator* GetAllocator(Device dev); private: MemoryManager() {} private: std::mutex mu_; - std::unordered_map> allocators_; + std::unordered_map> allocators_; }; /*! \brief An object representing a storage allocation. */ @@ -140,7 +123,7 @@ class StorageObj : public Object { static void Deleter(Object* ptr); ~StorageObj() { - auto alloc = MemoryManager::Global()->GetAllocator(buffer.ctx); + auto alloc = MemoryManager::Global()->GetAllocator(buffer.device); alloc->Free(buffer); } diff --git a/include/tvm/runtime/vm/vm.h b/include/tvm/runtime/vm/vm.h index e9f51de611b64..15de1df98a78e 100644 --- a/include/tvm/runtime/vm/vm.h +++ b/include/tvm/runtime/vm/vm.h @@ -239,17 +239,17 @@ class VirtualMachine : public runtime::ModuleNode { Index output_size, const std::vector& args); /*! - * \brief Initialize the virtual machine for a set of contexts. - * \param contexts The set of TVM contexts. - * \param alloc_types The allocator types for each context. + * \brief Initialize the virtual machine for a set of devices. + * \param devices The set of TVM devices. + * \param alloc_types The allocator types for each device. */ - void Init(const std::vector& contexts, const std::vector& alloc_types); + void Init(const std::vector& devices, const std::vector& alloc_types); /*! \brief Run VM dispatch loop. */ void RunLoop(); - /*! \brief Get context from the context list based on a given device type. */ - TVMContext GetContext(Index device_type) const; + /*! \brief Get device from the device list based on a given device type. */ + Device GetDevice(Index device_type) const; /*! * \brief Invoke a global setting up the VM state to execute. @@ -275,8 +275,8 @@ class VirtualMachine : public runtime::ModuleNode { const Executable* exec_; /*! \brief The function name to inputs mapping. */ std::unordered_map> inputs_; - /*! \brief The set of TVM contexts the VM is currently executing on. */ - std::vector ctxs_; + /*! \brief The set of TVM devices the VM is currently executing on. */ + std::vector devices_; /*! \brief The cached memory allocators. */ std::vector allocators_; /*! 
diff --git a/include/tvm/tir/stmt.h b/include/tvm/tir/stmt.h index 6445bb1fe73f7..84c27498740a1 100644 --- a/include/tvm/tir/stmt.h +++ b/include/tvm/tir/stmt.h @@ -1231,10 +1231,10 @@ constexpr const char* storage_scope = "storage_scope"; constexpr const char* storage_alignment = "storage_alignment"; /*! \brief Mark storage scope of realization */ constexpr const char* realize_scope = "realize_scope"; -/*! \brief The allocation context for global malloc in host. */ -constexpr const char* device_context_id = "device_context_id"; +/*! \brief The allocation device for global malloc in host. */ +constexpr const char* device_id = "device_id"; /*! \brief The device type. */ -constexpr const char* device_context_type = "device_context_type"; +constexpr const char* device_type = "device_type"; /*! \brief Mark of loop scope */ constexpr const char* loop_scope = "loop_scope"; /*! \brief Mark of reduce scope */ diff --git a/jvm/README.md b/jvm/README.md index e23c632fb04a0..6e71adf65683a 100644 --- a/jvm/README.md +++ b/jvm/README.md @@ -125,7 +125,7 @@ The following code snippet demonstrate how to load generated shared library (add ```java import org.apache.tvm.Module; import org.apache.tvm.NDArray; -import org.apache.tvm.TVMContext; +import org.apache.tvm.Device; import java.io.File; import java.util.Arrays; @@ -135,12 +135,12 @@ public class LoadAddFunc { String loadingDir = args[0]; Module fadd = Module.load(loadingDir + File.separator + "add_cpu.so"); - TVMContext ctx = TVMContext.cpu(); + Device dev = Device.cpu(); long[] shape = new long[]{2}; - NDArray arr = NDArray.empty(shape, ctx); + NDArray arr = NDArray.empty(shape, dev); arr.copyFrom(new float[]{3f, 4f}); - NDArray res = NDArray.empty(shape, ctx); + NDArray res = NDArray.empty(shape, dev); fadd.entryFunc().pushArg(arr).pushArg(arr).pushArg(res).invoke(); System.out.println(Arrays.toString(res.asFloatArray())); diff --git a/jvm/core/src/main/java/org/apache/tvm/ArgTypeCode.java b/jvm/core/src/main/java/org/apache/tvm/ArgTypeCode.java index b3b3da56e72f1..ed6d0f1a0e124 100644 --- a/jvm/core/src/main/java/org/apache/tvm/ArgTypeCode.java +++ b/jvm/core/src/main/java/org/apache/tvm/ArgTypeCode.java @@ -20,7 +20,7 @@ // Type code used in API calls public enum ArgTypeCode { INT(0), UINT(1), FLOAT(2), HANDLE(3), NULL(4), TVM_TYPE(5), - TVM_CONTEXT(6), ARRAY_HANDLE(7), NODE_HANDLE(8), MODULE_HANDLE(9), + DLDEVICE(6), ARRAY_HANDLE(7), NODE_HANDLE(8), MODULE_HANDLE(9), FUNC_HANDLE(10), STR(11), BYTES(12), NDARRAY_CONTAINER(13); public final int id; diff --git a/jvm/core/src/main/java/org/apache/tvm/TVMContext.java b/jvm/core/src/main/java/org/apache/tvm/Device.java similarity index 75% rename from jvm/core/src/main/java/org/apache/tvm/TVMContext.java rename to jvm/core/src/main/java/org/apache/tvm/Device.java index 76375636a6cab..3447c692b5ef5 100644 --- a/jvm/core/src/main/java/org/apache/tvm/TVMContext.java +++ b/jvm/core/src/main/java/org/apache/tvm/Device.java @@ -22,7 +22,7 @@ import java.util.HashMap; import java.util.Map; -public class TVMContext { +public class Device { private static final Map MASK2STR = new HashMap(); private static final Map STR2MASK = new HashMap(); @@ -49,103 +49,103 @@ public class TVMContext { /** * Construct a CPU device. 
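   * <p>For example (a sketch; the shape is illustrative):
   * <pre>{@code
   * Device dev = Device.cpu(0);
   * NDArray arr = NDArray.empty(new long[]{2}, dev);
   * }</pre>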
* @param devId The device id - * @return The created context + * @return The created device */ - public static TVMContext cpu(int devId) { - return new TVMContext(1, devId); + public static Device cpu(int devId) { + return new Device(1, devId); } - public static TVMContext cpu() { + public static Device cpu() { return cpu(0); } /** * Construct a GPU device. * @param devId The device id - * @return The created context + * @return The created device */ - public static TVMContext gpu(int devId) { - return new TVMContext(2, devId); + public static Device gpu(int devId) { + return new Device(2, devId); } - public static TVMContext gpu() { + public static Device gpu() { return gpu(0); } /** * Construct a OpenCL device. * @param devId The device id - * @return The created context + * @return The created device */ - public static TVMContext opencl(int devId) { - return new TVMContext(4, devId); + public static Device opencl(int devId) { + return new Device(4, devId); } - public static TVMContext opencl() { + public static Device opencl() { return opencl(0); } /** * Construct a Vulkan device. * @param devId The device id - * @return The created context + * @return The created device */ - public static TVMContext vulkan(int devId) { - return new TVMContext(7, devId); + public static Device vulkan(int devId) { + return new Device(7, devId); } - public static TVMContext vulkan() { + public static Device vulkan() { return vulkan(0); } /** * Construct a metal device. * @param devId The device id - * @return The created context + * @return The created device */ - public static TVMContext metal(int devId) { - return new TVMContext(8, devId); + public static Device metal(int devId) { + return new Device(8, devId); } - public static TVMContext metal() { + public static Device metal() { return metal(0); } /** * Construct a VPI simulated device. * @param devId The device id - * @return The created context + * @return The created device */ - public static TVMContext vpi(int devId) { - return new TVMContext(9, devId); + public static Device vpi(int devId) { + return new Device(9, devId); } - public static TVMContext vpi() { + public static Device vpi() { return vpi(0); } /** * Construct a Hexagon device. * @param devId The device id - * @return The created context + * @return The created device */ - public static TVMContext hexagon(int devId) { - return new TVMContext(14, devId); + public static Device hexagon(int devId) { + return new Device(14, devId); } - public static TVMContext hexagon() { + public static Device hexagon() { return hexagon(0); } public final int deviceType; public final int deviceId; - public TVMContext(int deviceType, int deviceId) { + public Device(int deviceType, int deviceId) { this.deviceType = deviceType; this.deviceId = deviceId; } - public TVMContext(String deviceType, int deviceId) { + public Device(String deviceType, int deviceId) { this(STR2MASK.get(deviceType), deviceId); } @@ -180,7 +180,7 @@ public long warpSize() { } /** - * Synchronize until jobs finished at the context. + * Synchronize until jobs finished at the device. 
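   * <p>A sketch (dev is any Device instance):
   * <pre>{@code
   * dev.sync(); // blocks until all queued work on this device completes
   * }</pre>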
*/ public void sync() { Base.checkCall(Base._LIB.tvmSynchronize(deviceType, deviceId)); @@ -191,8 +191,8 @@ public void sync() { } @Override public boolean equals(Object other) { - if (other != null && other instanceof TVMContext) { - TVMContext obj = (TVMContext) other; + if (other != null && other instanceof Device) { + Device obj = (Device) other; return deviceId == obj.deviceId && deviceType == obj.deviceType; } return false; diff --git a/jvm/core/src/main/java/org/apache/tvm/LibInfo.java b/jvm/core/src/main/java/org/apache/tvm/LibInfo.java index 4c5e0a66bf87f..62b8c901bd719 100644 --- a/jvm/core/src/main/java/org/apache/tvm/LibInfo.java +++ b/jvm/core/src/main/java/org/apache/tvm/LibInfo.java @@ -71,6 +71,6 @@ native int tvmArrayAlloc(long[] shape, int dtypeCode, int dtypeBits, int dtypeLa native int tvmArrayCopyToJArray(long from, byte[] to); - // TVMContext + // Device native int tvmSynchronize(int deviceType, int deviceId); } diff --git a/jvm/core/src/main/java/org/apache/tvm/NDArray.java b/jvm/core/src/main/java/org/apache/tvm/NDArray.java index 4e7386b123f85..a301d23dfbfa1 100644 --- a/jvm/core/src/main/java/org/apache/tvm/NDArray.java +++ b/jvm/core/src/main/java/org/apache/tvm/NDArray.java @@ -27,12 +27,12 @@ */ public class NDArray extends NDArrayBase { private final TVMType dtype; - private final TVMContext context; + private final Device device; - NDArray(long handle, boolean isView, TVMType dtype, TVMContext ctx) { + NDArray(long handle, boolean isView, TVMType dtype, Device dev) { super(handle, isView); this.dtype = dtype; - this.context = ctx; + this.device = dev; } @Override protected void finalize() throws Throwable { @@ -364,26 +364,26 @@ private byte[][] groupInternalBytes() { } /** - * Get the context of current array. - * @return the context. + * Get the device of current array. + * @return the device. */ - public TVMContext ctx() { - return context; + public Device device() { + return device; } /** * Create an empty array given shape, type and device. * @param shape The shape of the array. * @param dtype The data type of the array. - * @param ctx The context of the array. + * @param dev The device of the array. * @return The array tvm supported. */ - public static NDArray empty(long[] shape, TVMType dtype, TVMContext ctx) { + public static NDArray empty(long[] shape, TVMType dtype, Device dev) { Base.RefLong refHandle = new Base.RefLong(); Base.checkCall(Base._LIB.tvmArrayAlloc( shape, dtype.typeCode, dtype.bits, dtype.lanes, - ctx.deviceType, ctx.deviceId, refHandle)); - return new NDArray(refHandle.value, false, dtype, ctx); + dev.deviceType, dev.deviceId, refHandle)); + return new NDArray(refHandle.value, false, dtype, dev); } /** @@ -393,7 +393,7 @@ public static NDArray empty(long[] shape, TVMType dtype, TVMContext ctx) { * @return The array tvm supported. */ public static NDArray empty(long[] shape, TVMType dtype) { - return empty(shape, dtype, new TVMContext(1, 0)); + return empty(shape, dtype, new Device(1, 0)); } /** @@ -402,17 +402,17 @@ public static NDArray empty(long[] shape, TVMType dtype) { * @return The array tvm supported. */ public static NDArray empty(long[] shape) { - return empty(shape, new TVMType("float32", 1), new TVMContext(1, 0)); + return empty(shape, new TVMType("float32", 1), new Device(1, 0)); } /** * Create an empty float32 array given shape and device. * @param shape The shape of the array. - * @param ctx The context of the array. + * @param dev The device of the array. * @return The array tvm supported. 
*/ - public static NDArray empty(long[] shape, TVMContext ctx) { - return empty(shape, new TVMType("float32", 1), ctx); + public static NDArray empty(long[] shape, Device dev) { + return empty(shape, new TVMType("float32", 1), dev); } private static ByteBuffer wrapBytes(byte[] bytes) { diff --git a/jvm/core/src/main/java/org/apache/tvm/contrib/GraphModule.java b/jvm/core/src/main/java/org/apache/tvm/contrib/GraphModule.java index 64f089fb4eb13..0e2583553ed3f 100644 --- a/jvm/core/src/main/java/org/apache/tvm/contrib/GraphModule.java +++ b/jvm/core/src/main/java/org/apache/tvm/contrib/GraphModule.java @@ -19,10 +19,10 @@ package org.apache.tvm.contrib; +import org.apache.tvm.Device; import org.apache.tvm.Function; import org.apache.tvm.Module; import org.apache.tvm.NDArray; -import org.apache.tvm.TVMContext; /** * Wrapper runtime module. @@ -32,7 +32,7 @@ */ public class GraphModule { private Module module; - private TVMContext ctx; + private Device device; private Function fsetInput; private Function frun; @@ -41,9 +41,9 @@ public class GraphModule { private Function fdebugGetOutput; private Function floadParams; - GraphModule(Module module, TVMContext ctx) { + GraphModule(Module module, Device dev) { this.module = module; - this.ctx = ctx; + this.device = dev; fsetInput = module.getFunction("set_input"); frun = module.getFunction("run"); fgetInput = module.getFunction("get_input"); @@ -82,8 +82,8 @@ public void release() { */ public GraphModule setInput(String key, NDArray value) { NDArray input = value; - if (!value.ctx().equals(ctx)) { - input = NDArray.empty(value.shape(), ctx); + if (!value.device().equals(device)) { + input = NDArray.empty(value.shape(), device); value.copyTo(input); } fsetInput.pushArg(key).pushArg(input).invoke(); @@ -98,8 +98,8 @@ public GraphModule setInput(String key, NDArray value) { */ public GraphModule setInput(int key, NDArray value) { NDArray input = value; - if (!value.ctx().equals(ctx)) { - input = NDArray.empty(value.shape(), ctx); + if (!value.device().equals(device)) { + input = NDArray.empty(value.shape(), device); value.copyTo(input); } fsetInput.pushArg(key).pushArg(input).invoke(); diff --git a/jvm/core/src/main/java/org/apache/tvm/contrib/GraphRuntime.java b/jvm/core/src/main/java/org/apache/tvm/contrib/GraphRuntime.java index 61ff966eaf380..5bbd2beb4644d 100644 --- a/jvm/core/src/main/java/org/apache/tvm/contrib/GraphRuntime.java +++ b/jvm/core/src/main/java/org/apache/tvm/contrib/GraphRuntime.java @@ -17,13 +17,13 @@ package org.apache.tvm.contrib; +import org.apache.tvm.Device; import org.apache.tvm.Function; import org.apache.tvm.Module; -import org.apache.tvm.TVMContext; import org.apache.tvm.TVMValue; import org.apache.tvm.rpc.RPC; import org.apache.tvm.rpc.RPCSession; -import org.apache.tvm.rpc.TVMRemoteContext; +import org.apache.tvm.rpc.TVMRemoteDevice; import java.lang.reflect.Field; import java.lang.reflect.InvocationTargetException; @@ -34,20 +34,20 @@ public class GraphRuntime { * Create a runtime executor module given a graph and module. * @param graphJson The graph deployed in json format output by compiler. * @param libmod The module of the corresponding function. - * @param ctx The local or remote context to deploy the module. + * @param dev The local or remote device to deploy the module. * @return Runtime graph module that can be used to execute the graph. 
*/ - public static GraphModule create(String graphJson, Module libmod, TVMContext ctx) { + public static GraphModule create(String graphJson, Module libmod, Device dev) { Function fcreate = Function.getFunction("tvm.graph_runtime.create"); if (fcreate == null) { throw new RuntimeException("Cannot find global function tvm.graph_runtime.create." + "Did you compile tvm_runtime with correct version?"); } Module graphModule = fcreate.pushArg(graphJson) - .pushArg(libmod).pushArg(ctx.deviceType).pushArg(ctx.deviceId) + .pushArg(libmod).pushArg(dev.deviceType).pushArg(dev.deviceId) .invoke().asModule(); - return new GraphModule(graphModule, ctx); + return new GraphModule(graphModule, dev); } private static Object reflectionGetField(Object obj, String fieldName) { diff --git a/jvm/core/src/main/java/org/apache/tvm/rpc/RPCSession.java b/jvm/core/src/main/java/org/apache/tvm/rpc/RPCSession.java index b9f621473cf4d..1d3f38627926b 100644 --- a/jvm/core/src/main/java/org/apache/tvm/rpc/RPCSession.java +++ b/jvm/core/src/main/java/org/apache/tvm/rpc/RPCSession.java @@ -17,9 +17,9 @@ package org.apache.tvm.rpc; +import org.apache.tvm.Device; import org.apache.tvm.Function; import org.apache.tvm.Module; -import org.apache.tvm.TVMContext; import java.io.File; import java.io.FileInputStream; @@ -52,111 +52,111 @@ public Function getFunction(String name) { } /** - * Construct a remote context. + * Construct a remote device. * @param devType device type. * @param devId device id. - * @return The corresponding encoded remote context. + * @return The corresponding encoded remote device. */ - public TVMContext context(String devType, int devId) { - TVMContext ctx = new TVMContext(devType, devId); + public Device device(String devType, int devId) { + Device dev = new Device(devType, devId); int encode = (tblIndex + 1) * RPC.RPC_SESS_MASK; - return new TVMRemoteContext(ctx.deviceType + encode, devId, this); + return new TVMRemoteDevice(dev.deviceType + encode, devId, this); } /** - * Construct a remote context. + * Construct a remote device. * @param devType device type. - * @return The corresponding encoded remote context. + * @return The corresponding encoded remote device. */ - public TVMContext context(String devType) { - return context(devType, 0); + public Device device(String devType) { + return device(devType, 0); } /** - * Construct a remote context. + * Construct a remote device. * @param devType device type. * @param devId device id. - * @return The corresponding encoded remote context. + * @return The corresponding encoded remote device. */ - public TVMContext context(int devType, int devId) { + public Device device(int devType, int devId) { int encode = (tblIndex + 1) * RPC.RPC_SESS_MASK; - return new TVMRemoteContext(devType + encode, devId, this); + return new TVMRemoteDevice(devType + encode, devId, this); } /** - * Construct a remote context. + * Construct a remote device. * @param devType device type. - * @return The corresponding encoded remote context. + * @return The corresponding encoded remote device. */ - public TVMContext context(int devType) { - return context(devType, 0); + public Device device(int devType) { + return device(devType, 0); } /** * Construct remote CPU device. * @param devId device id. - * @return Remote CPU context. + * @return Remote CPU device. */ - public TVMContext cpu(int devId) { - return context(1, devId); + public Device cpu(int devId) { + return device(1, devId); } /** * Construct remote CPU device. - * @return Remote CPU context. + * @return Remote CPU device. 
   */
-  public TVMContext cpu() {
+  public Device cpu() {
    return cpu(0);
  }
 
  /**
   * Construct remote GPU device.
   * @param devId device id.
-   * @return Remote GPU context.
+   * @return Remote GPU device.
   */
-  public TVMContext gpu(int devId) {
-    return context(2, devId);
+  public Device gpu(int devId) {
+    return device(2, devId);
  }
 
  /**
   * Construct remote GPU device.
-   * @return Remote GPU context.
+   * @return Remote GPU device.
   */
-  public TVMContext gpu() {
+  public Device gpu() {
    return gpu(0);
  }
 
  /**
   * Construct remote OpenCL device.
   * @param devId device id.
-   * @return Remote OpenCL context.
+   * @return Remote OpenCL device.
   */
-  public TVMContext cl(int devId) {
-    return context(4, devId);
+  public Device cl(int devId) {
+    return device(4, devId);
  }
 
  /**
   * Construct remote OpenCL device.
-   * @return Remote OpenCL context.
+   * @return Remote OpenCL device.
   */
-  public TVMContext cl() {
+  public Device cl() {
    return cl(0);
  }
 
  /**
-   * Construct remote OpenCL device.
+   * Construct remote Vulkan device.
   * @param devId device id.
-   * @return Remote OpenCL context.
+   * @return Remote Vulkan device.
   */
-  public TVMContext vulkan(int devId) {
-    return context(7, devId);
+  public Device vulkan(int devId) {
+    return device(7, devId);
  }
 
  /**
-   * Construct remote OpenCL device.
-   * @return Remote OpenCL context.
+   * Construct remote Vulkan device.
+   * @return Remote Vulkan device.
   */
-  public TVMContext vulkan() {
+  public Device vulkan() {
    return vulkan(0);
  }
 
 
  /**
   * Construct remote Metal device.
   * @param devId device id.
-   * @return Remote metal context.
+   * @return Remote Metal device.
   */
-  public TVMContext metal(int devId) {
-    return context(8, devId);
+  public Device metal(int devId) {
+    return device(8, devId);
  }
 
  /**
   * Construct remote Metal device.
-   * @return Remote metal context.
+   * @return Remote Metal device.
   */
-  public TVMContext metal() {
+  public Device metal() {
    return metal(0);
  }
 
diff --git a/jvm/core/src/main/java/org/apache/tvm/rpc/TVMRemoteContext.java b/jvm/core/src/main/java/org/apache/tvm/rpc/TVMRemoteDevice.java
similarity index 86%
rename from jvm/core/src/main/java/org/apache/tvm/rpc/TVMRemoteContext.java
rename to jvm/core/src/main/java/org/apache/tvm/rpc/TVMRemoteDevice.java
index fad14eceb16b1..6f70fa9a29f6d 100644
--- a/jvm/core/src/main/java/org/apache/tvm/rpc/TVMRemoteContext.java
+++ b/jvm/core/src/main/java/org/apache/tvm/rpc/TVMRemoteDevice.java
@@ -17,13 +17,13 @@
 
 package org.apache.tvm.rpc;
 
-import org.apache.tvm.TVMContext;
+import org.apache.tvm.Device;
 
 // always related to RPCSession. Cannot construct by users.
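// A remote Device is normally obtained through an RPCSession, e.g. (a
// sketch; the host and port are illustrative):
//   RPCSession remote = Client.connect("localhost", 9090);
//   Device dev = remote.cpu(0);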
-public class TVMRemoteContext extends TVMContext { +public class TVMRemoteDevice extends Device { public final RPCSession rpcSession; - TVMRemoteContext(int deviceType, int deviceId, RPCSession rpcSession) { + TVMRemoteDevice(int deviceType, int deviceId, RPCSession rpcSession) { super(deviceType, deviceId); this.rpcSession = rpcSession; } diff --git a/jvm/core/src/test/java/org/apache/tvm/ModuleTest.java b/jvm/core/src/test/java/org/apache/tvm/ModuleTest.java index d675ee4b92025..666cbac6afeef 100644 --- a/jvm/core/src/test/java/org/apache/tvm/ModuleTest.java +++ b/jvm/core/src/test/java/org/apache/tvm/ModuleTest.java @@ -40,13 +40,13 @@ public static void beforeClass() { public void test_load_add_func_cpu() { Module fadd = Module.load(loadingDir + File.separator + "add_cpu.so"); - TVMContext ctx = new TVMContext("cpu", 0); + Device dev = new Device("cpu", 0); long[] shape = new long[]{2}; - NDArray arr = NDArray.empty(shape, ctx); + NDArray arr = NDArray.empty(shape, dev); arr.copyFrom(new float[]{3f, 4f}); - NDArray res = NDArray.empty(shape, ctx); + NDArray res = NDArray.empty(shape, dev); fadd.entryFunc().pushArg(arr).pushArg(arr).pushArg(res).invoke(); assertArrayEquals(new float[]{6f, 8f}, res.asFloatArray(), 1e-3f); @@ -64,8 +64,8 @@ public void test_load_add_func_cpu() { public void test_load_add_func_gpu() { final Random RND = new Random(0); - TVMContext ctx = new TVMContext("gpu", 0); - if (!ctx.exist()) { + Device dev = new Device("gpu", 0); + if (!dev.exist()) { logger.warn("GPU does not exist. Skip the test."); return; } @@ -76,7 +76,7 @@ public void test_load_add_func_gpu() { final int dim = 100; long[] shape = new long[]{dim}; - NDArray arr = NDArray.empty(shape, ctx); + NDArray arr = NDArray.empty(shape, dev); float[] data = new float[dim]; float[] dataX2 = new float[dim]; @@ -86,7 +86,7 @@ public void test_load_add_func_gpu() { } arr.copyFrom(data); - NDArray res = NDArray.empty(shape, ctx); + NDArray res = NDArray.empty(shape, dev); fadd.entryFunc().pushArg(arr).pushArg(arr).pushArg(res).invoke(); assertArrayEquals(dataX2, res.asFloatArray(), 1e-3f); diff --git a/jvm/core/src/test/java/org/apache/tvm/contrib/GraphRuntimeTest.java b/jvm/core/src/test/java/org/apache/tvm/contrib/GraphRuntimeTest.java index 82096c1aa5dc5..d1760454a468b 100644 --- a/jvm/core/src/test/java/org/apache/tvm/contrib/GraphRuntimeTest.java +++ b/jvm/core/src/test/java/org/apache/tvm/contrib/GraphRuntimeTest.java @@ -19,7 +19,7 @@ import org.apache.tvm.Module; import org.apache.tvm.NDArray; -import org.apache.tvm.TVMContext; +import org.apache.tvm.Device; import org.apache.tvm.TestUtils; import org.apache.tvm.rpc.Client; import org.apache.tvm.rpc.RPCSession; @@ -51,14 +51,14 @@ public void test_add_one_local() throws IOException { loadingDir + File.separator + "graph_addone.json")) .useDelimiter("\\Z").next(); - TVMContext ctx = TVMContext.cpu(); - GraphModule graph = GraphRuntime.create(graphJson, libmod, ctx); + Device dev = Device.cpu(); + GraphModule graph = GraphRuntime.create(graphJson, libmod, dev); long[] shape = new long[]{4}; - NDArray arr = NDArray.empty(shape, ctx); + NDArray arr = NDArray.empty(shape, dev); arr.copyFrom(new float[]{1f, 2f, 3f, 4f}); - NDArray out = NDArray.empty(shape, ctx); + NDArray out = NDArray.empty(shape, dev); graph.setInput("x", arr).run(); graph.getOutput(0, out); @@ -87,18 +87,18 @@ public void test_add_one_remote() throws IOException { try { server = TestUtils.startServer(port); RPCSession remote = Client.connect("localhost", port.value); - TVMContext ctx = 
remote.cpu(); + Device dev = remote.cpu(); remote.upload(new File(libPath)); Module mlib = remote.loadModule("graph_addone_lib.so"); - GraphModule graph = GraphRuntime.create(graphJson, mlib, ctx); + GraphModule graph = GraphRuntime.create(graphJson, mlib, dev); long[] shape = new long[]{4}; - NDArray arr = NDArray.empty(shape, ctx); + NDArray arr = NDArray.empty(shape, dev); arr.copyFrom(new float[]{1f, 2f, 3f, 4f}); - NDArray out = NDArray.empty(shape, ctx); + NDArray out = NDArray.empty(shape, dev); graph.setInput("x", arr).run(); graph.getOutput(0, out); diff --git a/jvm/native/src/main/native/jni_helper_func.h b/jvm/native/src/main/native/jni_helper_func.h index 0f202004f99d8..82165e9e04b1a 100644 --- a/jvm/native/src/main/native/jni_helper_func.h +++ b/jvm/native/src/main/native/jni_helper_func.h @@ -175,13 +175,13 @@ void fromJavaDType(JNIEnv* env, jobject jdtype, DLDataType* dtype) { env->DeleteLocalRef(tvmTypeClass); } -void fromJavaContext(JNIEnv* env, jobject jctx, TVMContext* ctx) { - jclass tvmContextClass = env->FindClass("org/apache/tvm/TVMContext"); - ctx->device_type = static_cast( - env->GetIntField(jctx, env->GetFieldID(tvmContextClass, "deviceType", "I"))); - ctx->device_id = - static_cast(env->GetIntField(jctx, env->GetFieldID(tvmContextClass, "deviceId", "I"))); - env->DeleteLocalRef(tvmContextClass); +void fromJavaDevice(JNIEnv* env, jobject jdev, DLDevice* dev) { + jclass deviceClass = env->FindClass("org/apache/tvm/Device"); + dev->device_type = static_cast( + env->GetIntField(jdev, env->GetFieldID(deviceClass, "deviceType", "I"))); + dev->device_id = + static_cast(env->GetIntField(jdev, env->GetFieldID(deviceClass, "deviceId", "I"))); + env->DeleteLocalRef(deviceClass); } jobject tvmRetValueToJava(JNIEnv* env, TVMValue value, int tcode) { diff --git a/jvm/native/src/main/native/org_apache_tvm_native_c_api.cc b/jvm/native/src/main/native/org_apache_tvm_native_c_api.cc index e3ea4b9c37667..ada714c2ec48c 100644 --- a/jvm/native/src/main/native/org_apache_tvm_native_c_api.cc +++ b/jvm/native/src/main/native/org_apache_tvm_native_c_api.cc @@ -453,7 +453,7 @@ JNIEXPORT jint JNICALL Java_org_apache_tvm_LibInfo_tvmArrayCopyToJArray(JNIEnv* return ret; } -// Context +// Device JNIEXPORT jint JNICALL Java_org_apache_tvm_LibInfo_tvmSynchronize(JNIEnv* env, jint deviceType, jint deviceId) { return TVMSynchronize(static_cast(deviceType), static_cast(deviceId), NULL); diff --git a/python/tvm/__init__.py b/python/tvm/__init__.py index 7a5f553ccdd50..4643062ea8e81 100644 --- a/python/tvm/__init__.py +++ b/python/tvm/__init__.py @@ -30,7 +30,7 @@ # top-level alias # tvm.runtime from .runtime.object import Object -from .runtime.ndarray import context, cpu, gpu, opencl, cl, vulkan, metal, mtl +from .runtime.ndarray import device, cpu, gpu, opencl, cl, vulkan, metal, mtl from .runtime.ndarray import vpi, rocm, ext_dev, micro_dev, hexagon from .runtime import ndarray as nd diff --git a/python/tvm/_ffi/_ctypes/packed_func.py b/python/tvm/_ffi/_ctypes/packed_func.py index fd82b263e2ddf..6cfa3e5c286a6 100644 --- a/python/tvm/_ffi/_ctypes/packed_func.py +++ b/python/tvm/_ffi/_ctypes/packed_func.py @@ -23,12 +23,12 @@ from ..base import _LIB, get_last_ffi_error, py2cerror, check_call from ..base import c_str, string_types -from ..runtime_ctypes import DataType, TVMByteArray, TVMContext, ObjectRValueRef +from ..runtime_ctypes import DataType, TVMByteArray, Device, ObjectRValueRef from . 
import ndarray as _nd from .ndarray import NDArrayBase, _make_array from .types import TVMValue, ArgTypeCode from .types import TVMPackedCFunc, TVMCFuncFinalizer -from .types import RETURN_SWITCH, C_TO_PY_ARG_SWITCH, _wrap_arg_func, _ctx_to_int64 +from .types import RETURN_SWITCH, C_TO_PY_ARG_SWITCH, _wrap_arg_func, _device_to_int64 from .object import ObjectBase, PyNativeObject, _set_class_object from . import object as _object @@ -141,9 +141,9 @@ def _make_tvm_args(args, temp_args): elif isinstance(arg, DataType): values[i].v_str = c_str(str(arg)) type_codes[i] = ArgTypeCode.STR - elif isinstance(arg, TVMContext): - values[i].v_int64 = _ctx_to_int64(arg) - type_codes[i] = ArgTypeCode.TVM_CONTEXT + elif isinstance(arg, Device): + values[i].v_int64 = _device_to_int64(arg) + type_codes[i] = ArgTypeCode.DLDEVICE elif isinstance(arg, (bytearray, bytes)): # from_buffer only taeks in bytearray. if isinstance(arg, bytes): diff --git a/python/tvm/_ffi/_ctypes/types.py b/python/tvm/_ffi/_ctypes/types.py index 4b6d66957b253..38d3cd72b55dc 100644 --- a/python/tvm/_ffi/_ctypes/types.py +++ b/python/tvm/_ffi/_ctypes/types.py @@ -19,7 +19,7 @@ import ctypes import struct from ..base import py_str, check_call, _LIB -from ..runtime_ctypes import TVMByteArray, ArgTypeCode, TVMContext +from ..runtime_ctypes import TVMByteArray, ArgTypeCode, Device class TVMValue(ctypes.Union): @@ -68,13 +68,13 @@ def _return_bytes(x): return res -def _return_context(value): - """return TVMContext""" +def _return_device(value): + """return Device""" # use bit unpacking from int64 view # We use this to get around ctypes issue on Union of Structure data = struct.pack("=q", value.v_int64) arr = struct.unpack("=ii", data) - return TVMContext(arr[0], arr[1]) + return Device(arr[0], arr[1]) def _wrap_arg_func(return_f, type_code): @@ -86,9 +86,9 @@ def _wrap_func(x): return _wrap_func -def _ctx_to_int64(ctx): +def _device_to_int64(dev): """Pack context into int64 in native endian""" - data = struct.pack("=ii", ctx.device_type, ctx.device_id) + data = struct.pack("=ii", dev.device_type, dev.device_id) return struct.unpack("=q", data)[0] @@ -99,7 +99,7 @@ def _ctx_to_int64(ctx): ArgTypeCode.NULL: lambda x: None, ArgTypeCode.STR: lambda x: py_str(x.v_str), ArgTypeCode.BYTES: _return_bytes, - ArgTypeCode.TVM_CONTEXT: _return_context, + ArgTypeCode.DLDEVICE: _return_device, } C_TO_PY_ARG_SWITCH = { @@ -109,5 +109,5 @@ def _ctx_to_int64(ctx): ArgTypeCode.NULL: lambda x: None, ArgTypeCode.STR: lambda x: py_str(x.v_str), ArgTypeCode.BYTES: _return_bytes, - ArgTypeCode.TVM_CONTEXT: _return_context, + ArgTypeCode.DLDEVICE: _return_device, } diff --git a/python/tvm/_ffi/_cython/base.pxi b/python/tvm/_ffi/_cython/base.pxi index 8c9e413813b9b..bf4d6b933a4f9 100644 --- a/python/tvm/_ffi/_cython/base.pxi +++ b/python/tvm/_ffi/_cython/base.pxi @@ -29,7 +29,7 @@ cdef enum TVMArgTypeCode: kTVMOpaqueHandle = 3 kTVMNullptr = 4 kTVMDataType = 5 - kTVMContext = 6 + kDLDevice = 6 kTVMDLTensorHandle = 7 kTVMObjectHandle = 8 kTVMModuleHandle = 9 @@ -46,13 +46,13 @@ cdef extern from "tvm/runtime/c_runtime_api.h": uint8_t bits uint16_t lanes - ctypedef struct DLContext: + ctypedef struct DLDevice: int device_type int device_id ctypedef struct DLTensor: void* data - DLContext ctx + DLDevice device int ndim DLDataType dtype int64_t* shape @@ -70,7 +70,7 @@ cdef extern from "tvm/runtime/c_runtime_api.h": void* v_handle const char* v_str DLDataType v_type - DLContext v_ctx + DLDevice v_device ctypedef int64_t tvm_index_t ctypedef DLTensor* DLTensorHandle @@ 
-118,7 +118,7 @@ cdef extern from "tvm/runtime/c_runtime_api.h":
     int TVMArrayAlloc(tvm_index_t* shape,
                       tvm_index_t ndim,
                       DLDataType dtype,
-                      DLContext ctx,
+                      DLDevice dev,
                       DLTensorHandle* out)
     int TVMArrayFree(DLTensorHandle handle)
     int TVMArrayCopyFromTo(DLTensorHandle src,
diff --git a/python/tvm/_ffi/_cython/packed_func.pxi b/python/tvm/_ffi/_cython/packed_func.pxi
index 00585659ab76f..30b879de80ed7 100644
--- a/python/tvm/_ffi/_cython/packed_func.pxi
+++ b/python/tvm/_ffi/_cython/packed_func.pxi
@@ -20,7 +20,7 @@ import traceback
 from cpython cimport Py_INCREF, Py_DECREF
 from numbers import Number, Integral
 from ..base import string_types, py2cerror
-from ..runtime_ctypes import DataType, TVMContext, TVMByteArray, ObjectRValueRef
+from ..runtime_ctypes import DataType, Device, TVMByteArray, ObjectRValueRef

 cdef void tvm_callback_finalize(void* fhandle) with gil:
@@ -139,10 +139,10 @@ cdef inline int make_arg(object arg,
         value[0].v_str = tstr
         tcode[0] = kTVMStr
         temp_args.append(tstr)
-    elif isinstance(arg, TVMContext):
-        value[0].v_ctx = ((
+    elif isinstance(arg, Device):
+        value[0].v_device = ((
             ctypes.addressof(arg)))[0]
-        tcode[0] = kTVMContext
+        tcode[0] = kDLDevice
     elif isinstance(arg, (bytes, bytearray)):
         # from_buffer only taeks in bytearray.
         if isinstance(arg, bytes):
@@ -220,8 +220,8 @@ cdef inline object make_ret(TVMValue value, int tcode):
         return make_ret_bytes(value.v_handle)
     elif tcode == kTVMOpaqueHandle:
         return ctypes_handle(value.v_handle)
-    elif tcode == kTVMContext:
-        return TVMContext(value.v_ctx.device_type, value.v_ctx.device_id)
+    elif tcode == kDLDevice:
+        return Device(value.v_device.device_type, value.v_device.device_id)
     elif tcode == kTVMModuleHandle:
         return _CLASS_MODULE(ctypes_handle(value.v_handle))
     elif tcode == kTVMPackedFuncHandle:
diff --git a/python/tvm/_ffi/runtime_ctypes.py b/python/tvm/_ffi/runtime_ctypes.py
index 3a874ebb1208e..59dc652aeb0bc 100644
--- a/python/tvm/_ffi/runtime_ctypes.py
+++ b/python/tvm/_ffi/runtime_ctypes.py
@@ -33,7 +33,7 @@ class ArgTypeCode(object):
     HANDLE = 3
     NULL = 4
     TVM_TYPE = 5
-    TVM_CONTEXT = 6
+    DLDEVICE = 6
     DLTENSOR_HANDLE = 7
     OBJECT_HANDLE = 8
     MODULE_HANDLE = 9
@@ -149,8 +149,8 @@ def __ne__(self, other):
 RPC_SESS_MASK = 128

-class TVMContext(ctypes.Structure):
-    """TVM context strucure."""
+class Device(ctypes.Structure):
+    """TVM device structure."""

     _fields_ = [("device_type", ctypes.c_int), ("device_id", ctypes.c_int)]
     MASK2STR = {
@@ -192,7 +192,7 @@ class TVMContext(ctypes.Structure):
     }

     def __init__(self, device_type, device_id):
-        super(TVMContext, self).__init__()
+        super(Device, self).__init__()
         self.device_type = int(device_type)
         self.device_id = device_id
@@ -268,7 +268,7 @@ def sync(self):
     def __eq__(self, other):
         return (
-            isinstance(other, TVMContext)
+            isinstance(other, Device)
             and self.device_id == other.device_id
             and self.device_type == other.device_type
         )
@@ -283,8 +283,8 @@ def __repr__(self):
         if self.device_type >= RPC_SESS_MASK:
             tbl_id = self.device_type / RPC_SESS_MASK - 1
             dev_type = self.device_type % RPC_SESS_MASK
-            return "remote[%d]:%s(%d)" % (tbl_id, TVMContext.MASK2STR[dev_type], self.device_id)
-        return "%s(%d)" % (TVMContext.MASK2STR[self.device_type], self.device_id)
+            return "remote[%d]:%s(%d)" % (tbl_id, Device.MASK2STR[dev_type], self.device_id)
+        return "%s(%d)" % (Device.MASK2STR[self.device_type], self.device_id)

 class TVMArray(ctypes.Structure):
@@ -292,7 +292,7 @@ class TVMArray(ctypes.Structure):
     """TVMValue in C API"""

     _fields_ = [
         ("data", ctypes.c_void_p),
-        ("ctx", TVMContext),
+        ("device", Device),
("ndim", ctypes.c_int), ("dtype", DataType), ("shape", ctypes.POINTER(tvm_shape_index_t)), diff --git a/python/tvm/auto_scheduler/measure.py b/python/tvm/auto_scheduler/measure.py index d02dcff3bba09..039914bea7d33 100644 --- a/python/tvm/auto_scheduler/measure.py +++ b/python/tvm/auto_scheduler/measure.py @@ -544,9 +544,9 @@ def __init__( from tvm.rpc.tracker import Tracker from tvm.rpc.server import Server - ctx = tvm.context("cuda", 0) - if ctx.exist: - cuda_arch = "sm_" + "".join(ctx.compute_version.split(".")) + dev = tvm.device("cuda", 0) + if dev.exist: + cuda_arch = "sm_" + "".join(dev.compute_version.split(".")) set_cuda_target_arch(cuda_arch) host = "0.0.0.0" self.tracker = Tracker(host, port=9000, port_end=10000, silent=True) @@ -835,7 +835,7 @@ def _timed_eval_func( error_msg = None try: func = module.load_module(build_res.filename) - ctx = ndarray.context(str(inp.task.target), 0) + dev = ndarray.device(str(inp.task.target), 0) # Limitation: # We can not get PackFunction directly in the remote mode as it is wrapped # under the std::function. We could lift the restriction later once we fold @@ -844,7 +844,7 @@ def _timed_eval_func( f_prepare = "cache_flush_cpu_non_first_arg" if enable_cpu_cache_flush else "" time_f = func.time_evaluator( func.entry_name, - ctx, + dev, number=number, repeat=repeat, min_repeat_ms=min_repeat_ms, @@ -870,7 +870,7 @@ def _timed_eval_func( if tensor_name in task_input_names: args.append( ndarray.array( - get_task_input_buffer(inp.task.workload_key, tensor_name), ctx + get_task_input_buffer(inp.task.workload_key, tensor_name), dev ) ) task_inputs_count += 1 @@ -880,14 +880,14 @@ def _timed_eval_func( + "should provide with `SearchTask(..., task_inputs={...})`" ) else: - empty_array = ndarray.empty(get_const_tuple(arg.shape), arg.dtype, ctx) + empty_array = ndarray.empty(get_const_tuple(arg.shape), arg.dtype, dev) random_fill(empty_array) args.append(empty_array) if task_inputs_count != len(task_input_names): logger.warning( "task_inputs not fully matched, check if there's any unexpected error" ) - ctx.sync() + dev.sync() costs = time_f(*args).results # pylint: disable=broad-except except Exception: @@ -1048,7 +1048,7 @@ def _timed_rpc_run( remote = request_remote(key, host, port, priority, timeout) remote.upload(build_res.filename) func = remote.load_module(os.path.split(build_res.filename)[1]) - ctx = remote.context(str(inp.task.target), 0) + dev = remote.device(str(inp.task.target), 0) # Limitation: # We can not get PackFunction directly in the remote mode as it is wrapped # under the std::function. 
We could lift the restriction later once we fold @@ -1057,7 +1057,7 @@ def _timed_rpc_run( f_prepare = "cache_flush_cpu_non_first_arg" if enable_cpu_cache_flush else "" time_f = func.time_evaluator( func.entry_name, - ctx, + dev, number=number, repeat=repeat, min_repeat_ms=min_repeat_ms, @@ -1085,7 +1085,7 @@ def _timed_rpc_run( if tensor_name in task_input_names: args.append( ndarray.array( - get_task_input_buffer(inp.task.workload_key, tensor_name), ctx + get_task_input_buffer(inp.task.workload_key, tensor_name), dev ) ) task_inputs_count += 1 @@ -1095,14 +1095,14 @@ def _timed_rpc_run( + "should provide with `SearchTask(..., task_inputs={...})`" ) else: - empty_array = ndarray.empty(get_const_tuple(arg.shape), arg.dtype, ctx) + empty_array = ndarray.empty(get_const_tuple(arg.shape), arg.dtype, dev) random_fill(empty_array) args.append(empty_array) if task_inputs_count != len(task_input_names): logger.warning( "task_inputs not fully matched, check if there's any unexpected error" ) - ctx.sync() + dev.sync() costs = time_f(*args).results # clean up remote files diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index b68767bd05280..261ecabe49cda 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -261,18 +261,18 @@ def get_build_kwargs(self): or "vulkan" in self.task.target.keys ): remote = request_remote(self.key, self.host, self.port) - ctx = remote.context(str(self.task.target), 0) - max_dims = ctx.max_thread_dimensions + dev = remote.device(str(self.task.target), 0) + max_dims = dev.max_thread_dimensions kwargs["check_gpu"] = { - "max_shared_memory_per_block": ctx.max_shared_memory_per_block, - "max_threads_per_block": ctx.max_threads_per_block, + "max_shared_memory_per_block": dev.max_shared_memory_per_block, + "max_threads_per_block": dev.max_threads_per_block, "max_thread_x": max_dims[0], "max_thread_y": max_dims[1], "max_thread_z": max_dims[2], } if "cuda" in self.task.target.keys: - kwargs["cuda_arch"] = "sm_" + "".join(ctx.compute_version.split(".")) + kwargs["cuda_arch"] = "sm_" + "".join(dev.compute_version.split(".")) if self.task.target.device_name == "micro_dev": kwargs.setdefault("build_option", {})["tir.disable_vectorize"] = True @@ -555,7 +555,7 @@ def run_through_rpc( try: # upload built module with module_loader(remote_kwargs, build_result) as (remote, mod): - ctx = remote.context(str(measure_input.target), 0) + dev = remote.device(str(measure_input.target), 0) # Limitation: # We can not get PackFunction directly in the remote mode as it is wrapped @@ -565,7 +565,7 @@ def run_through_rpc( f_prepare = "cache_flush_cpu_non_first_arg" if enable_cpu_cache_flush else "" time_f = mod.time_evaluator( mod.entry_name, - ctx, + dev, number=number, repeat=repeat, min_repeat_ms=min_repeat_ms, @@ -578,12 +578,12 @@ def run_through_rpc( raise AttributeError( "Please make sure USE_RANDOM is ON in the config.cmake " "on the remote devices" ) - args = [nd.array(np.zeros(x[0], dtype=x[1]), ctx=ctx) for x in build_result.arg_info] + args = [nd.array(np.zeros(x[0], dtype=x[1]), device=dev) for x in build_result.arg_info] if "scatter" not in measure_input.task.name: # the index tensor of scatter op cannot be randomly initialized for arg in args: random_fill(arg) - ctx.sync() + dev.sync() costs = time_f(*args).results @@ -698,8 +698,8 @@ def check_remote(target, device_key, host=None, port=None, priority=100, timeout def _check(): remote = request_remote(device_key, host, 
port, priority) - ctx = remote.context(str(target)) - while not ctx.exist: # wait until we get an available device + dev = remote.device(str(target)) + while not dev.exist: # wait until we get an available device pass t = threading.Thread( diff --git a/python/tvm/contrib/coreml_runtime.py b/python/tvm/contrib/coreml_runtime.py index 4ef3593cf44b7..b2555572ed423 100644 --- a/python/tvm/contrib/coreml_runtime.py +++ b/python/tvm/contrib/coreml_runtime.py @@ -19,7 +19,7 @@ from ..rpc import base as rpc_base -def create(symbol, compiled_model_path, ctx): +def create(symbol, compiled_model_path, device): """Create a runtime executor module given a coreml model and context. Parameters ---------- @@ -27,19 +27,19 @@ def create(symbol, compiled_model_path, ctx): The symbol that represents the Core ML model. compiled_model_path : str The path of the compiled model to be deployed. - ctx : TVMContext - The context to deploy the module. It can be local or remote when there - is only one TVMContext. + device : Device + The device to deploy the module. It can be local or remote when there + is only one Device. Returns ------- coreml_runtime : CoreMLModule Runtime coreml module that can be used to execute the coreml model. """ - device_type = ctx.device_type + device_type = device.device_type runtime_func = "tvm.coreml_runtime.create" if device_type >= rpc_base.RPC_SESS_MASK: - fcreate = ctx._rpc_sess.get_function(runtime_func) + fcreate = device._rpc_sess.get_function(runtime_func) else: fcreate = tvm._ffi.get_global_func(runtime_func) diff --git a/python/tvm/contrib/cuda_graph/cuda_graph_runtime.py b/python/tvm/contrib/cuda_graph/cuda_graph_runtime.py index 45ec89d37b3d8..c2f4455d881b9 100644 --- a/python/tvm/contrib/cuda_graph/cuda_graph_runtime.py +++ b/python/tvm/contrib/cuda_graph/cuda_graph_runtime.py @@ -21,7 +21,7 @@ from tvm.contrib import graph_runtime -def create(graph_json_str, libmod, ctx): +def create(graph_json_str, libmod, device): """Create a runtime executor module given a graph and module. Parameters @@ -34,8 +34,8 @@ def create(graph_json_str, libmod, ctx): libmod : tvm.runtime.Module The module of the corresponding function - ctx : TVMContext - The context to deploy the module, only supports CUDA GPU + device : Device + The device to deploy the module, only supports CUDA GPU Returns ------- @@ -50,9 +50,9 @@ def create(graph_json_str, libmod, ctx): """ assert isinstance(graph_json_str, string_types) try: - ctx, num_rpc_ctx, device_type_id = graph_runtime.get_device_ctx(libmod, ctx) - if num_rpc_ctx == len(ctx): - fcreate = ctx[0]._rpc_sess.get_function("tvm.graph_runtime_cuda_graph.create") + dev, num_rpc_dev, device_type_id = graph_runtime.get_device(libmod, device) + if num_rpc_dev == len(dev): + fcreate = dev[0]._rpc_sess.get_function("tvm.graph_runtime_cuda_graph.create") else: fcreate = tvm._ffi.get_global_func("tvm.graph_runtime_cuda_graph.create") except ValueError: diff --git a/python/tvm/contrib/debugger/debug_runtime.py b/python/tvm/contrib/debugger/debug_runtime.py index 289ac4c467e07..f0e299728fbc5 100644 --- a/python/tvm/contrib/debugger/debug_runtime.py +++ b/python/tvm/contrib/debugger/debug_runtime.py @@ -30,7 +30,7 @@ _DUMP_PATH_PREFIX = "_tvmdbg_" -def create(graph_json_str, libmod, ctx, dump_root=None): +def create(graph_json_str, libmod, device, dump_root=None): """Create a runtime executor module given a graph and module. 
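A minimal usage sketch of the renamed debug-runtime entry point; the library and graph file names here are assumed artifacts of a prior build:

    import tvm
    from tvm.contrib.debugger import debug_runtime

    dev = tvm.cpu(0)
    lib = tvm.runtime.load_module("compiled_lib.so")    # assumed prebuilt library
    graph_json = open("compiled_graph.json").read()     # assumed graph JSON
    m = debug_runtime.create(graph_json, lib, dev, dump_root="/tmp/tvmdbg")
    m.run()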
Parameters @@ -43,8 +43,8 @@ def create(graph_json_str, libmod, ctx, dump_root=None): libmod : tvm.Module The module of the corresponding function. - ctx : TVMContext - The context to deploy the module, can be local or remote. + device : Device + The device to deploy the module, can be local or remote. dump_root : str To select which folder the outputs should be kept. @@ -57,9 +57,9 @@ def create(graph_json_str, libmod, ctx, dump_root=None): assert isinstance(graph_json_str, string_types) try: - ctx, num_rpc_ctx, device_type_id = graph_runtime.get_device_ctx(libmod, ctx) - if num_rpc_ctx == len(ctx): - fcreate = ctx[0]._rpc_sess.get_function("tvm.graph_runtime_debug.create") + dev, num_rpc_dev, device_type_id = graph_runtime.get_device(libmod, device) + if num_rpc_dev == len(dev): + fcreate = dev[0]._rpc_sess.get_function("tvm.graph_runtime_debug.create") else: fcreate = tvm._ffi.get_global_func("tvm.graph_runtime_debug.create") except ValueError: @@ -68,7 +68,7 @@ def create(graph_json_str, libmod, ctx, dump_root=None): "config.cmake and rebuild TVM to enable debug mode" ) func_obj = fcreate(graph_json_str, libmod, *device_type_id) - return GraphModuleDebug(func_obj, ctx, graph_json_str, dump_root) + return GraphModuleDebug(func_obj, dev, graph_json_str, dump_root) class GraphModuleDebug(graph_runtime.GraphModule): @@ -84,8 +84,8 @@ class GraphModuleDebug(graph_runtime.GraphModule): module : Module The internal tvm module that holds the actual graph functions. - ctx : TVMContext - The context this module is under. + device : Device + The device that this module is under. graph_json_str : str or graph class Content of graph json file in string format @@ -95,16 +95,16 @@ class GraphModuleDebug(graph_runtime.GraphModule): None will make a temp folder in /tmp/tvmdbg and does the dumping """ - def __init__(self, module, ctx, graph_json_str, dump_root): + def __init__(self, module, device, graph_json_str, dump_root): self._dump_root = dump_root self._dump_path = None self._get_output_by_layer = module["get_output_by_layer"] self._run_individual = module["run_individual"] graph_runtime.GraphModule.__init__(self, module) - self._create_debug_env(graph_json_str, ctx) + self._create_debug_env(graph_json_str, device) - def _format_context(self, ctx): - return str(ctx[0]).upper().replace("(", ":").replace(")", "") + def _format_device(self, device): + return str(device[0]).upper().replace("(", ":").replace(")", "") def _ensure_dir(self, directory): """Create a directory if not exists @@ -118,13 +118,13 @@ def _ensure_dir(self, directory): if not os.path.exists(directory): os.makedirs(directory, 0o700) - def _get_dump_path(self, ctx): + def _get_dump_path(self, device): """Make the graph and tensor dump folder and return the path. Parameters ---------- - ctx : TVMContext - The context this module is under. + device : Device + The device that this module is under. Returns ------- @@ -132,8 +132,8 @@ def _get_dump_path(self, ctx): Directory path where the graph and node outputs will be stored. 
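In practice this also changes the dump directory naming: a local CPU dump folder is now created as "_tvmdbg_device_CPU_0" where it was previously "_tvmdbg_ctx_CPU_0".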
""" # save to file - folder_name = _DUMP_PATH_PREFIX + "ctx_" - folder_name = folder_name + ctx.replace(":", "_") + folder_name = _DUMP_PATH_PREFIX + "device_" + folder_name = folder_name + device.replace(":", "_") path = os.path.join(self._dump_root, folder_name) self._ensure_dir(path) return path @@ -142,7 +142,7 @@ def _remove_dump_root(self): if os.path.isdir(self._dump_root): shutil.rmtree(self._dump_root) - def _create_debug_env(self, graph_json, ctx): + def _create_debug_env(self, graph_json, device): """Create UI wrapper framework to handle multiple UI frontends for tvmdbg Parameters @@ -153,18 +153,18 @@ def _create_debug_env(self, graph_json, ctx): nodes_list : list List of all the nodes presented in the graph - ctx : TVMContext - The context this module is under. + device : Device + The device that this module is under. """ # make the dump folder if not given if not self._dump_root: self._dump_root = tempfile.mkdtemp(prefix=_DUMP_ROOT_PREFIX) - # format the context - ctx = self._format_context(ctx) + # format the device + device = self._format_device(device) # updates the dumping directories - self._dump_path = self._get_dump_path(ctx) + self._dump_path = self._get_dump_path(device) # init the debug dumping environment self.debug_datum = debug_result.DebugResult(graph_json, self._dump_path) diff --git a/python/tvm/contrib/graph_runtime.py b/python/tvm/contrib/graph_runtime.py index 59db716e917cb..2eea188be9779 100644 --- a/python/tvm/contrib/graph_runtime.py +++ b/python/tvm/contrib/graph_runtime.py @@ -21,10 +21,10 @@ from tvm.rpc import _ffi_api as _rpc_ffi_api from tvm.rpc import base as rpc_base from tvm._ffi.base import string_types -from tvm._ffi.runtime_ctypes import TVMContext +from tvm._ffi.runtime_ctypes import Device -def create(graph_json_str, libmod, ctx): +def create(graph_json_str, libmod, device): """Create a runtime executor module given a graph and module. Parameters @@ -37,10 +37,10 @@ def create(graph_json_str, libmod, ctx): libmod : tvm.runtime.Module The module of the corresponding function - ctx : TVMContext or list of TVMContext - The context to deploy the module. It can be local or remote when there - is only one TVMContext. Otherwise, the first context in the list will - be used as this purpose. All context should be given for heterogeneous + device : Device or list of Device + The device to deploy the module. It can be local or remote when there + is only one Device. Otherwise, the first device in the list will + be used as this purpose. All device should be given for heterogeneous execution. Returns @@ -56,59 +56,59 @@ def create(graph_json_str, libmod, ctx): """ assert isinstance(graph_json_str, string_types) - ctx, num_rpc_ctx, device_type_id = get_device_ctx(libmod, ctx) + dev, num_rpc_dev, device_type_id = get_device(libmod, device) - if num_rpc_ctx == len(ctx): - fcreate = ctx[0]._rpc_sess.get_function("tvm.graph_runtime.create") + if num_rpc_dev == len(dev): + fcreate = dev[0]._rpc_sess.get_function("tvm.graph_runtime.create") else: fcreate = tvm._ffi.get_global_func("tvm.graph_runtime.create") return GraphModule(fcreate(graph_json_str, libmod, *device_type_id)) -def get_device_ctx(libmod, ctx): - """Parse and validate all the device context(s). +def get_device(libmod, device): + """Parse and validate all the device(s). 
Parameters
    ----------
    libmod : tvm.runtime.Module
        The module of the corresponding function

-    ctx : TVMContext or list of TVMContext
+    device : Device or list of Device

    Returns
    -------
-    ctx : list of TVMContext
-    num_rpc_ctx : Number of rpc contexts
+    device : list of Device
+    num_rpc_dev : Number of rpc devices
    device_type_id : List of device type and device id
    """

-    if isinstance(ctx, TVMContext):
-        ctx = [ctx]
-    elif not isinstance(ctx, (list, tuple)):
-        raise ValueError("ctx has to be the type of TVMContext or a list of " "TVMContext")
-    for cur_ctx in ctx:
-        if not isinstance(cur_ctx, TVMContext):
-            raise ValueError("ctx has to be the type of TVMContext or a list " "of TVMContext")
+    if isinstance(device, Device):
+        device = [device]
+    elif not isinstance(device, (list, tuple)):
+        raise ValueError("dev has to be the type of Device or a list of Device")
+    for cur_dev in device:
+        if not isinstance(cur_dev, Device):
+            raise ValueError("dev has to be the type of Device or a list of Device")

    # device_type_id[0], device_type_id[1] are used as the primary/fallback
-    # context type and id. All other ones are used as device context for
+    # device type and id. All other ones are used as devices for
    # heterogeneous execution.
-    num_rpc_ctx = 0
+    num_rpc_dev = 0
    device_type_id = []
-    for cur_ctx in ctx:
-        device_type = cur_ctx.device_type
+    for cur_dev in device:
+        device_type = cur_dev.device_type
        if device_type >= rpc_base.RPC_SESS_MASK:
            assert libmod.type_key == "rpc"
-            assert _rpc_ffi_api.SessTableIndex(libmod) == cur_ctx._rpc_sess._tbl_index
-            num_rpc_ctx += 1
-            device_type = cur_ctx.device_type % rpc_base.RPC_SESS_MASK
+            assert _rpc_ffi_api.SessTableIndex(libmod) == cur_dev._rpc_sess._tbl_index
+            num_rpc_dev += 1
+            device_type = cur_dev.device_type % rpc_base.RPC_SESS_MASK
        device_type_id.append(device_type)
-        device_type_id.append(cur_ctx.device_id)
+        device_type_id.append(cur_dev.device_id)

-    if 0 < num_rpc_ctx < len(ctx):
-        raise ValueError("Either all or none of the contexts should be rpc.")
-    return ctx, num_rpc_ctx, device_type_id
+    if 0 < num_rpc_dev < len(device):
+        raise ValueError("Either all or none of the devices should be rpc.")
+    return device, num_rpc_dev, device_type_id

class GraphModule(object):
@@ -144,7 +144,7 @@ class GraphModule(object):
        lib: tvm.runtime.Module = tvm.runtime.load_module("compiled_lib.so")
        # Call the library factory function for default and create
        # a new runtime.Module, wrap with graph module.
-        gmod = graph_runtime.GraphModule(lib["default"](ctx))
+        gmod = graph_runtime.GraphModule(lib["default"](dev))
        # use the graph module.
        gmod.set_input("x", data)
        gmod.run()
diff --git a/python/tvm/contrib/peak.py b/python/tvm/contrib/peak.py
index 62ee9fea400b9..833a505f6d425 100644
--- a/python/tvm/contrib/peak.py
+++ b/python/tvm/contrib/peak.py
@@ -45,7 +45,7 @@ def measure_bandwidth_sum(
    target,
    target_host,
    remote,
-    ctx,
+    dev,
    n_times,
):
    """measure memory bandwidth of gpu by product reduction for a given type
@@ -74,8 +74,8 @@ def measure_bandwidth_sum(
       the target and option of the compilation.
target_host : str or :any:`tvm.target.Target` host compilation target - ctx: TVMcontext - the context of array + dev: Device + the device of array remote: tvm.rpc.RPCSession remote rpc session n_times: int @@ -109,11 +109,11 @@ def measure_bandwidth_sum( try: func = tvm.build(s, [x, y], target, target_host=target_host) - x = tvm.nd.empty((n,), dtype=dtype, ctx=ctx) - y = tvm.nd.empty((n // m,), dtype=dtype, ctx=ctx) + x = tvm.nd.empty((n,), dtype=dtype, device=dev) + y = tvm.nd.empty((n // m,), dtype=dtype, device=dev) func = _convert_to_remote(func, remote) - time_f = func.time_evaluator(func.entry_name, ctx, number=n_times) + time_f = func.time_evaluator(func.entry_name, dev, number=n_times) time = time_f(x, y).mean except tvm._ffi.base.TVMError: # build error (occur when device does not support half) @@ -123,7 +123,7 @@ def measure_bandwidth_sum( def measure_bandwidth_all_types( - total_item, item_per_thread, n_times, target, target_host, remote, ctx, verbose=True + total_item, item_per_thread, n_times, target, target_host, remote, dev, verbose=True ): """measure memory bandwidth for all types @@ -141,8 +141,8 @@ def measure_bandwidth_all_types( host compilation target remote: tvm.rpc.RPCSession remote rpc session - ctx: TVMcontext - the context of array + dev: Device + the device of array verbose: bool whether outputs immediate result @@ -170,7 +170,7 @@ def measure_bandwidth_all_types( target, target_host, remote, - ctx, + dev, n_times, ) max_speed = max(max_speed, speed) @@ -182,7 +182,7 @@ def measure_bandwidth_all_types( def measure_compute_mad( - total_item, item_per_thread, base_type, bits, lanes, target, target_host, remote, ctx, n_times + total_item, item_per_thread, base_type, bits, lanes, target, target_host, remote, dev, n_times ): """measure peak compute speed by computing mad for a type @@ -211,8 +211,8 @@ def measure_compute_mad( host compilation target remote: tvm.rpc.RPCSession if it is not None, use remote rpc session - ctx: TVMcontext - the context of array + dev: Device + the device of array n_times: int number of runs for taking mean @@ -274,8 +274,8 @@ def mad_func(x, y): try: func = tvm.build(s, [y], target, target_host=target_host) func = _convert_to_remote(func, remote) - time_f = func.time_evaluator(func.entry_name, ctx, number=n_times) - y = tvm.nd.empty((n,), dtype=dtype, ctx=ctx) + time_f = func.time_evaluator(func.entry_name, dev, number=n_times) + y = tvm.nd.empty((n,), dtype=dtype, device=dev) time = time_f(y).mean except tvm._ffi.base.TVMError: # build error (occur when device does not support half) @@ -285,7 +285,7 @@ def mad_func(x, y): def measure_compute_all_types( - total_item, item_per_thread, n_times, target, target_host, remote, ctx, verbose=True + total_item, item_per_thread, n_times, target, target_host, remote, dev, verbose=True ): """measure peak flops for all types @@ -303,8 +303,8 @@ def measure_compute_all_types( host compilation target remote: tvm.rpc.RPCSession remote rpc session - ctx: TVMcontext - the context of array + dev: Device + the device of array verbose: bool whether outputs immediate result @@ -331,7 +331,7 @@ def measure_compute_all_types( target, target_host, remote, - ctx, + dev, n_times, ) max_speed = max(max_speed, speed) @@ -368,20 +368,20 @@ def measure_peak_all(target, target_host, host, port): compute_item_per_thread = 4096 if str(target).startswith("opencl"): - ctx = remote.cl() + dev = remote.cl() elif str(target).startswith("cuda"): - ctx = remote.gpu() + dev = remote.gpu() elif str(target).startswith("metal"): - 
ctx = remote.metal() + dev = remote.metal() else: raise RuntimeError("Unsupported target") logging.info("========== measure memory bandwidth ==========") measure_bandwidth_all_types( - bandwidth_total_item, bandwidth_item_per_thread, n_times, target, target_host, remote, ctx + bandwidth_total_item, bandwidth_item_per_thread, n_times, target, target_host, remote, dev ) logging.info("========== measure peak compute ==========") measure_compute_all_types( - compute_total_item, compute_item_per_thread, n_times, target, target_host, remote, ctx + compute_total_item, compute_item_per_thread, n_times, target, target_host, remote, dev ) diff --git a/python/tvm/contrib/sparse.py b/python/tvm/contrib/sparse.py index c1263c43b476d..bee9b835a98a1 100644 --- a/python/tvm/contrib/sparse.py +++ b/python/tvm/contrib/sparse.py @@ -30,7 +30,7 @@ class CSRNDArray(object): """Sparse tensor object in CSR format.""" - def __init__(self, arg1, ctx=None, shape=None): + def __init__(self, arg1, device=None, shape=None): """Construct a sparse matrix in CSR format. Parameters @@ -39,8 +39,8 @@ def __init__(self, arg1, ctx=None, shape=None): The corresponding a dense numpy array, or a tuple for constructing a sparse matrix directly. - ctx: tvmContext - The corresponding context. + device: Device + The corresponding device. shape : tuple of int The shape of the array @@ -53,14 +53,14 @@ def __init__(self, arg1, ctx=None, shape=None): source_array = arg1 ridx, cidx = _np.nonzero(source_array) data = source_array[ridx, cidx] - self.data = _nd.array(data, ctx) + self.data = _nd.array(data, device) indices = _np.nonzero(source_array)[1].astype(itype) - self.indices = _nd.array(indices, ctx) + self.indices = _nd.array(indices, device) indptr = [0] + _np.apply_along_axis( _np.count_nonzero, axis=1, arr=source_array ).tolist() indptr = _np.cumsum(_np.array(indptr, itype)).astype(itype) - self.indptr = _nd.array(indptr, ctx) + self.indptr = _nd.array(indptr, device) self.shape = source_array.shape else: raise RuntimeError( @@ -89,11 +89,11 @@ def asnumpy(self): return full -def array(source_array, ctx=None, shape=None, stype="csr"): +def array(source_array, device=None, shape=None, stype="csr"): """Construct a sparse NDArray from numpy.ndarray""" ret = None if stype == "csr": - ret = CSRNDArray(source_array, shape=shape, ctx=ctx) + ret = CSRNDArray(source_array, shape=shape, device=device) else: raise NotImplementedError("stype=%s is not supported yet." % (stype,)) return ret diff --git a/python/tvm/contrib/target/coreml.py b/python/tvm/contrib/target/coreml.py index 0f4bb6675611e..18a53bdffd864 100644 --- a/python/tvm/contrib/target/coreml.py +++ b/python/tvm/contrib/target/coreml.py @@ -244,5 +244,5 @@ def coreml_compiler(func): shutil.rmtree(mlmodelc_path) builder.compile(model_dir) - ctx = tvm.cpu(0) - return coreml_runtime.create(name, mlmodelc_path, ctx).module + dev = tvm.cpu(0) + return coreml_runtime.create(name, mlmodelc_path, dev).module diff --git a/python/tvm/contrib/tflite_runtime.py b/python/tvm/contrib/tflite_runtime.py index 3b0e268e2a443..1558e36d51af4 100644 --- a/python/tvm/contrib/tflite_runtime.py +++ b/python/tvm/contrib/tflite_runtime.py @@ -19,15 +19,15 @@ from ..rpc import base as rpc_base -def create(tflite_model_bytes, ctx, runtime_target="cpu"): - """Create a runtime executor module given a tflite model and context. +def create(tflite_model_bytes, device, runtime_target="cpu"): + """Create a runtime executor module given a tflite model and device. 
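A usage sketch of the renamed TFLite runtime creation; the model path is hypothetical, and TFLite runtime support must be enabled in the TVM build:

    import tvm
    from tvm.contrib import tflite_runtime

    with open("model.tflite", "rb") as f:    # assumed TFLite model file
        tflite_model = f.read()
    runtime = tflite_runtime.create(tflite_model, tvm.cpu(0))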
Parameters ---------- tflite_model_byte : bytes The tflite model to be deployed in bytes string format. - ctx : TVMContext - The context to deploy the module. It can be local or remote when there - is only one TVMContext. + device : Device + The device to deploy the module. It can be local or remote when there + is only one Device. runtime_target: str Execution target of TFLite runtime: either `cpu` or `edge_tpu`. Returns @@ -35,7 +35,7 @@ def create(tflite_model_bytes, ctx, runtime_target="cpu"): tflite_runtime : TFLiteModule Runtime tflite module that can be used to execute the tflite model. """ - device_type = ctx.device_type + device_type = device.device_type if runtime_target == "edge_tpu": runtime_func = "tvm.edgetpu_runtime.create" @@ -43,11 +43,11 @@ def create(tflite_model_bytes, ctx, runtime_target="cpu"): runtime_func = "tvm.tflite_runtime.create" if device_type >= rpc_base.RPC_SESS_MASK: - fcreate = ctx._rpc_sess.get_function(runtime_func) + fcreate = device._rpc_sess.get_function(runtime_func) else: fcreate = tvm._ffi.get_global_func(runtime_func) - return TFLiteModule(fcreate(bytearray(tflite_model_bytes), ctx)) + return TFLiteModule(fcreate(bytearray(tflite_model_bytes), device)) class TFLiteModule(object): diff --git a/python/tvm/driver/build_module.py b/python/tvm/driver/build_module.py index 5eaecb422163a..79c9cef801c30 100644 --- a/python/tvm/driver/build_module.py +++ b/python/tvm/driver/build_module.py @@ -233,7 +233,7 @@ def _build_for_device(input_mod, target, target_host): """ target = Target(target) target_host = Target(target_host) - device_type = ndarray.context(target.kind.name, 0).device_type + device_type = ndarray.device(target.kind.name, 0).device_type mod_mixed = input_mod mod_mixed = tvm.tir.transform.Apply(lambda f: f.with_attr("target", target))(mod_mixed) @@ -402,7 +402,7 @@ def build(inputs, args=None, target=None, target_host=None, name="default_functi if not target_host: for tar, _ in target_input_mod.items(): tar = Target(tar) - device_type = ndarray.context(tar.kind.name, 0).device_type + device_type = ndarray.device(tar.kind.name, 0).device_type if device_type == ndarray.cpu(0).device_type: target_host = tar break diff --git a/python/tvm/driver/tvmc/runner.py b/python/tvm/driver/tvmc/runner.py index 1d23ccfb0c007..252647557ab5d 100644 --- a/python/tvm/driver/tvmc/runner.py +++ b/python/tvm/driver/tvmc/runner.py @@ -361,19 +361,19 @@ def run_module( # TODO expand to other supported devices, as listed in tvm.rpc.client (@leandron) logger.debug("device is %s", device) if device == "gpu": - ctx = session.gpu() + dev = session.gpu() elif device == "cl": - ctx = session.cl() + dev = session.cl() else: assert device == "cpu" - ctx = session.cpu() + dev = session.cpu() if profile: logger.debug("creating runtime with profiling enabled") - module = debug_runtime.create(graph, lib, ctx, dump_root="./prof") + module = debug_runtime.create(graph, lib, dev, dump_root="./prof") else: logger.debug("creating runtime with profiling disabled") - module = runtime.create(graph, lib, ctx) + module = runtime.create(graph, lib, dev) logger.debug("load params into the runtime module") module.load_params(params) @@ -390,7 +390,7 @@ def run_module( module.run() # create the module time evaluator (returns a function) - timer = module.module.time_evaluator("run", ctx, 1, repeat=repeat) + timer = module.module.time_evaluator("run", dev, 1, repeat=repeat) # call the evaluator function to invoke the module and save execution times prof_result = timer() # collect a list of 
execution times from the profiling results diff --git a/python/tvm/micro/session.py b/python/tvm/micro/session.py index 717b6e4806712..8987883abafbc 100644 --- a/python/tvm/micro/session.py +++ b/python/tvm/micro/session.py @@ -143,7 +143,7 @@ def __enter__(self): int(timeouts.session_established_timeout_sec * 1e6), ) ) - self.context = self._rpc.cpu(0) + self.device = self._rpc.cpu(0) return self except: @@ -155,7 +155,7 @@ def __exit__(self, exc_type, exc_value, exc_traceback): self.transport.__exit__(exc_type, exc_value, exc_traceback) -def lookup_remote_linked_param(mod, storage_id, template_tensor, ctx): +def lookup_remote_linked_param(mod, storage_id, template_tensor, device): """Lookup a parameter that has been pre-linked into a remote (i.e. over RPC) Module. This function signature matches the signature built by @@ -170,8 +170,8 @@ def lookup_remote_linked_param(mod, storage_id, template_tensor, ctx): A DLTensor containing metadata that should be filled-in to the returned NDArray. This function should mostly not inspect this, and just pass it along to NDArrayFromRemoteOpaqueHandle. - ctx : TVMContext - The remote CPU context to be used with the returned NDArray. + device : Device + The remote CPU device to be used with the returned NDArray. Returns ------- @@ -188,12 +188,12 @@ def lookup_remote_linked_param(mod, storage_id, template_tensor, ctx): return None return get_global_func("tvm.rpc.NDArrayFromRemoteOpaqueHandle")( - mod, remote_data, template_tensor, ctx, None + mod, remote_data, template_tensor, device, None ) -def create_local_graph_runtime(graph_json_str, mod, ctx): - """Create a local graph runtime driving execution on the remote CPU context given. +def create_local_graph_runtime(graph_json_str, mod, device): + """Create a local graph runtime driving execution on the remote CPU device given. Parameters ---------- @@ -203,23 +203,23 @@ def create_local_graph_runtime(graph_json_str, mod, ctx): mod : tvm.runtime.Module The remote module containing functions in graph_json_str. - ctx : tvm.Context - The remote CPU execution context. + device : tvm.runtime.Device + The remote CPU execution device. Returns ------- tvm.contrib.GraphRuntime : A local graph runtime instance that executes on the remote device. """ - device_type_id = [ctx.device_type, ctx.device_id] + device_type_id = [device.device_type, device.device_id] fcreate = get_global_func("tvm.graph_runtime.create") return graph_runtime.GraphModule( fcreate(graph_json_str, mod, lookup_remote_linked_param, *device_type_id) ) -def create_local_debug_runtime(graph_json_str, mod, ctx, dump_root=None): - """Create a local debug runtime driving execution on the remote CPU context given. +def create_local_debug_runtime(graph_json_str, mod, device, dump_root=None): + """Create a local debug runtime driving execution on the remote CPU device given. Parameters ---------- @@ -229,8 +229,8 @@ def create_local_debug_runtime(graph_json_str, mod, ctx, dump_root=None): mod : tvm.runtime.Module The remote module containing functions in graph_json_str. - ctx : tvm.Context - The remote CPU execution context. + device : tvm.runtime.Device + The remote CPU execution device. dump_root : Optional[str] If given, passed as dump_root= to GraphModuleDebug. @@ -240,11 +240,11 @@ def create_local_debug_runtime(graph_json_str, mod, ctx, dump_root=None): tvm.contrib.GraphRuntime : A local graph runtime instance that executes on the remote device. 
""" - device_type_id = [ctx.device_type, ctx.device_id] + device_type_id = [device.device_type, device.device_id] fcreate = get_global_func("tvm.graph_runtime_debug.create") return debug_runtime.GraphModuleDebug( fcreate(graph_json_str, mod, lookup_remote_linked_param, *device_type_id), - [ctx], + [device], graph_json_str, dump_root=dump_root, ) diff --git a/python/tvm/relay/analysis/analysis.py b/python/tvm/relay/analysis/analysis.py index 48e9ce0643a97..3928f77b2607f 100644 --- a/python/tvm/relay/analysis/analysis.py +++ b/python/tvm/relay/analysis/analysis.py @@ -28,7 +28,7 @@ from .feature import Feature -def context_analysis(mod, default_context): +def context_analysis(mod, default_device): """Analyze the device context information of each IR node in a Relay program. @@ -37,10 +37,10 @@ def context_analysis(mod, default_context): mod : tvm.IRModule The input module. - default_context : tvm.runtime.TVMContext + default_device : tvm.runtime.Device The default context allocated to an IR node. """ - return _ffi_api.ContextAnalysis(mod, default_context) + return _ffi_api.ContextAnalysis(mod, default_device) def post_order_visit(expr, fvisit): @@ -433,7 +433,7 @@ def get_calibration_data(mod, data): mod = _ffi_api.get_calibrate_module(mod) mod = transform.Inline()(mod) - ref_ex = build_module.create_executor("graph", mod=mod, ctx=cpu(0)) + ref_ex = build_module.create_executor("graph", mod=mod, device=cpu(0)) ref_res = ref_ex.evaluate()(**data) calib_data = {} diff --git a/python/tvm/relay/backend/graph_runtime_codegen.py b/python/tvm/relay/backend/graph_runtime_codegen.py index 81ab4cb4de250..ec679aee894ca 100644 --- a/python/tvm/relay/backend/graph_runtime_codegen.py +++ b/python/tvm/relay/backend/graph_runtime_codegen.py @@ -87,7 +87,7 @@ def codegen(self, func): params = {} for key in param_names: arr = self._get_param_by_name(key) - param = empty(arr.shape, dtype=arr.dtype, ctx=arr.ctx) + param = empty(arr.shape, dtype=arr.dtype, device=arr.device) arr.copyto(param) params[key] = param return graph_json, lowered_func, params diff --git a/python/tvm/relay/backend/interpreter.py b/python/tvm/relay/backend/interpreter.py index ba09094afca1f..b62fca86668d5 100644 --- a/python/tvm/relay/backend/interpreter.py +++ b/python/tvm/relay/backend/interpreter.py @@ -191,16 +191,16 @@ class Interpreter(Executor): mod : tvm.IRModule The module to support the execution. - ctx : tvmContext - The runtime context to run the code on. + device : Device + The runtime device to run the code on. target : tvm.Target The target option to build the function. 
""" - def __init__(self, mod, ctx, target): + def __init__(self, mod, device, target): self.mod = mod - self.ctx = ctx + self.device = device self.target = target def optimize(self): @@ -253,7 +253,7 @@ def _interp_wrapper(*args, **kwargs): mod = self.optimize() opt_expr = Call(mod["main"], relay_args) - _intrp = _backend.CreateInterpreter(mod, self.ctx, self.target) + _intrp = _backend.CreateInterpreter(mod, self.device, self.target) return _intrp(opt_expr) return _interp_wrapper diff --git a/python/tvm/relay/backend/vm.py b/python/tvm/relay/backend/vm.py index 0f7875a9202e6..7e0d4acc5453b 100644 --- a/python/tvm/relay/backend/vm.py +++ b/python/tvm/relay/backend/vm.py @@ -188,11 +188,11 @@ def _update_target(self, target): raise ValueError("Target is not set in env or passed as argument.") tgts = {} if isinstance(target, (str, tvm.target.Target)): - dev_type = tvm.tir.IntImm("int32", tvm.nd.context(str(target)).device_type) + dev_type = tvm.tir.IntImm("int32", tvm.nd.device(str(target)).device_type) tgts[dev_type] = tvm.target.Target(target) elif isinstance(target, dict): for dev, tgt in target.items(): - dev_type = tvm.tir.IntImm("int32", tvm.nd.context(dev).device_type) + dev_type = tvm.tir.IntImm("int32", tvm.nd.device(dev).device_type) tgts[dev_type] = tvm.target.Target(tgt) else: raise TypeError( @@ -241,21 +241,21 @@ class VMExecutor(Executor): mod : :py:class:`~tvm.IRModule` The module to support the execution. - ctx : :py:class:`~tvmContext` - The runtime context to run the code on. + device : :py:class:`~tvm.runtime.Device` + The runtime device to run the code on. target : :py:class:`Target` The target option to build the function. """ - def __init__(self, mod, ctx, target): + def __init__(self, mod, device, target): if mod is None: raise RuntimeError("Must provide module to get VM executor.") self.mod = mod - self.ctx = ctx + self.device = device self.target = target self.executable = compile(mod, target) - self.vm = vm_rt.VirtualMachine(self.executable, ctx) + self.vm = vm_rt.VirtualMachine(self.executable, device) def _make_executor(self, expr=None): main = self.mod["main"] diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index 8e69d288df12d..efe495e816a22 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -45,11 +45,11 @@ def _update_target(target): tgts = {} if isinstance(target, (str, Target)): - dev_type = tvm_expr.IntImm("int32", _nd.context(str(target)).device_type) + dev_type = tvm_expr.IntImm("int32", _nd.device(str(target)).device_type) tgts[dev_type] = Target(target) elif isinstance(target, dict): for dev, tgt in target.items(): - dev_type = tvm_expr.IntImm("int32", _nd.context(dev).device_type) + dev_type = tvm_expr.IntImm("int32", _nd.device(dev).device_type) tgts[dev_type] = Target(tgt) else: raise TypeError( @@ -373,17 +373,17 @@ class GraphExecutor(_interpreter.Executor): mod : :py:class:`~tvm.IRModule` The module to support the execution. - ctx : :py:class:`TVMContext` - The runtime context to run the code on. + device : :py:class:`Device` + The runtime device to run the code on. target : :py:class:`Target` The target option to build the function. 
""" - def __init__(self, mod, ctx, target): + def __init__(self, mod, device, target): assert mod is not None self.mod = mod - self.ctx = ctx + self.device = device self.target = target def _make_executor(self, expr=None): @@ -394,7 +394,7 @@ def _make_executor(self, expr=None): if _ty.is_dynamic(ret_type): raise ValueError("Graph Runtime only supports static graphs, got output type", ret_type) mod = build(self.mod, target=self.target) - gmodule = _graph_rt.GraphModule(mod["default"](self.ctx)) + gmodule = _graph_rt.GraphModule(mod["default"](self.device)) def _unflatten(flat_iter, cur_type): if isinstance(cur_type, _ty.TensorType): @@ -423,7 +423,7 @@ def _graph_wrapper(*args, **kwargs): return _graph_wrapper -def create_executor(kind="debug", mod=None, ctx=None, target="llvm"): +def create_executor(kind="debug", mod=None, device=None, target="llvm"): """Factory function to create an executor. Example @@ -450,8 +450,8 @@ def create_executor(kind="debug", mod=None, ctx=None, target="llvm"): mod : :py:class:`~tvm.IRModule` The Relay module containing collection of functions - ctx : :py:class:`tvmContext` - The context to execute the code. + device : :py:class:`Device` + The device to execute the code. target : :py:class:`tvm.Target` The corresponding context @@ -462,17 +462,17 @@ def create_executor(kind="debug", mod=None, ctx=None, target="llvm"): """ if mod is None: mod = IRModule() - if ctx is not None: - assert ctx.device_type == _nd.context(str(target), 0).device_type + if device is not None: + assert device.device_type == _nd.device(str(target), 0).device_type else: - ctx = _nd.context(str(target), 0) + device = _nd.device(str(target), 0) if isinstance(target, str): target = Target(target) if kind == "debug": - return _interpreter.Interpreter(mod, ctx, target) + return _interpreter.Interpreter(mod, device, target) if kind == "graph": - return GraphExecutor(mod, ctx, target) + return GraphExecutor(mod, device, target) if kind == "vm": - return VMExecutor(mod, ctx, target) + return VMExecutor(mod, device, target) raise RuntimeError("unknown execution strategy: {0}".format(kind)) diff --git a/python/tvm/relay/frontend/common.py b/python/tvm/relay/frontend/common.py index 2db420a409924..7bb70cec5ee5f 100644 --- a/python/tvm/relay/frontend/common.py +++ b/python/tvm/relay/frontend/common.py @@ -535,8 +535,8 @@ def infer_value(input_val, params, mod=None): func = _function.Function(analysis.free_vars(input_val), input_val) with tvm.transform.PassContext(opt_level=0): lib = tvm.relay.build(func, target="llvm", params=params) - ctx = tvm.cpu(0) - m = graph_runtime.GraphModule(lib["default"](ctx)) + dev = tvm.cpu(0) + m = graph_runtime.GraphModule(lib["default"](dev)) m.run() return m.get_output(0) except Exception: @@ -544,7 +544,7 @@ def infer_value(input_val, params, mod=None): mod["main"] = _function.Function(analysis.free_vars(input_val), input_val) else: mod = IRModule.from_expr(input_val) - exc = tvm.relay.create_executor("debug", mod=mod, ctx=tvm.cpu(), target="llvm") + exc = tvm.relay.create_executor("debug", mod=mod, device=tvm.cpu(), target="llvm") inputs = [] for param in mod["main"].params: inputs.append(params[param.name_hint]) diff --git a/python/tvm/relay/op/annotation/annotation.py b/python/tvm/relay/op/annotation/annotation.py index 0ab1a0ba9d688..5c75688b909df 100644 --- a/python/tvm/relay/op/annotation/annotation.py +++ b/python/tvm/relay/op/annotation/annotation.py @@ -16,7 +16,7 @@ # under the License. 
"""Annotation operations.""" from tvm.runtime import ndarray as _nd -from tvm.runtime import TVMContext as _TVMContext +from tvm.runtime import Device as _Device from . import _make from .. import op as reg @@ -30,7 +30,7 @@ def on_device(data, device): data : tvm.relay.Expr The expression to be annotated. - device : Union[:py:class:`TVMContext`, str] + device : Union[:py:class:`Device`, str] The device type to annotate. Returns @@ -38,13 +38,13 @@ def on_device(data, device): result : tvm.relay.Expr The annotated expression. """ - if isinstance(device, _TVMContext): + if isinstance(device, _Device): device = device.device_type elif isinstance(device, str): - device = _nd.context(device).device_type + device = _nd.device(device).device_type else: raise ValueError( - "device is expected to be the type of TVMContext or " + "device is expected to be the type of Device or " "str, but received %s" % (type(device)) ) return _make.on_device(data, device) diff --git a/python/tvm/relay/op/memory/memory.py b/python/tvm/relay/op/memory/memory.py index 57aa7e4231477..9dae23d5b65eb 100644 --- a/python/tvm/relay/op/memory/memory.py +++ b/python/tvm/relay/op/memory/memory.py @@ -47,7 +47,7 @@ def alloc_tensor(storage, offset, shape, dtype="float32", assert_shape=None): return _make.alloc_tensor(storage, offset, shape, dtype, assert_shape) -def alloc_storage(size, alignment, ctx, dtype_hint="float32"): +def alloc_storage(size, alignment, device, dtype_hint="float32"): """Allocate a piece of tensor storage. Parameters @@ -56,15 +56,17 @@ def alloc_storage(size, alignment, ctx, dtype_hint="float32"): The size of the allocation. alignment : tvm.relay.Expr The alignment of the allocation. - dtype : str - The dtype_hint of the allocation. + device : tvm.runtime.Device + The device of the allocation. + dtype_hint : str + The dtype hint of the allocation. Returns ------- result : tvm.relay.Expr The alloc_storage expression. """ - return _make.alloc_storage(size, alignment, ctx, dtype_hint) + return _make.alloc_storage(size, alignment, device, dtype_hint) def flatten_tuple_type(ty): diff --git a/python/tvm/relay/op/tensor.py b/python/tvm/relay/op/tensor.py index 5b011043f5883..6b9ac30d3a3aa 100644 --- a/python/tvm/relay/op/tensor.py +++ b/python/tvm/relay/op/tensor.py @@ -17,7 +17,7 @@ """Basic tensor operations.""" # pylint: disable=redefined-builtin, unused-argument from tvm.runtime import ndarray as _nd -from tvm.runtime import TVMContext as _TVMContext +from tvm.runtime import Device as _Device from tvm.te.hybrid import script from . import _make @@ -1160,7 +1160,7 @@ def copy_shape_func(attrs, inputs, _): def device_copy(data, src_dev, dst_dev): """Copy data from the source device to the destination device. This - operator helps data transferring between difference contexts for + operator helps data transferring between difference devices for heterogeneous execution. Parameters @@ -1168,10 +1168,10 @@ def device_copy(data, src_dev, dst_dev): data : tvm.relay.Expr The tensor to be copied. - src_dev : Union[:py:class:`TVMContext`, str] + src_dev : Union[:py:class:`Device`, str] The source device where the data is copied from. - dst_dev : Union[:py:class:`TVMContext`, str] + dst_dev : Union[:py:class:`Device`, str] The destination device where the data is copied to. Returns @@ -1179,23 +1179,23 @@ def device_copy(data, src_dev, dst_dev): result : tvm.relay.Expr The copied result. 
""" - if isinstance(src_dev, _TVMContext): + if isinstance(src_dev, _Device): src_dev = src_dev.device_type elif isinstance(src_dev, str): - src_dev = _nd.context(src_dev).device_type + src_dev = _nd.device(src_dev).device_type else: raise ValueError( - "src_dev is expected to be the type of TVMContext or " + "src_dev is expected to be the type of Device or " "str, but received %s" % (type(src_dev)) ) - if isinstance(dst_dev, _TVMContext): + if isinstance(dst_dev, _Device): dst_dev = dst_dev.device_type elif isinstance(dst_dev, str): - dst_dev = _nd.context(dst_dev).device_type + dst_dev = _nd.device(dst_dev).device_type else: raise ValueError( - "dst_dev is expected to be the type of TVMContext or " + "dst_dev is expected to be the type of Device or " "str, but received %s" % (type(dst_dev)) ) return _make.device_copy(data, src_dev, dst_dev) diff --git a/python/tvm/relay/quantize/_calibrate.py b/python/tvm/relay/quantize/_calibrate.py index 8461daa0eb5a8..20afa1909ac96 100644 --- a/python/tvm/relay/quantize/_calibrate.py +++ b/python/tvm/relay/quantize/_calibrate.py @@ -39,14 +39,14 @@ def _get_profile_runtime(mod): if tvm.target.Target.current(): target = tvm.target.Target.current() - ctx = tvm.context(target.kind.name) + dev = tvm.device(target.kind.name) else: target = "llvm" - ctx = tvm.context(target) + dev = tvm.device(target) with tvm.transform.PassContext(opt_level=3): lib = _build_module.build(func, target=target) - runtime = graph_runtime.GraphModule(lib["default"](ctx)) + runtime = graph_runtime.GraphModule(lib["default"](dev)) return runtime diff --git a/python/tvm/relay/testing/__init__.py b/python/tvm/relay/testing/__init__.py index f0c79bed12188..e889e9078a848 100644 --- a/python/tvm/relay/testing/__init__.py +++ b/python/tvm/relay/testing/__init__.py @@ -133,8 +133,8 @@ def check_grad( if test_inputs is None: test_inputs = inputs - for target, ctx in enabled_targets(): - intrp = relay.create_executor(ctx=ctx, target=target) + for target, dev in enabled_targets(): + intrp = relay.create_executor(device=dev, target=target) # Get analytic gradients. _, grads = intrp.evaluate(bwd_func)(*inputs) diff --git a/python/tvm/relay/testing/init.py b/python/tvm/relay/testing/init.py index 1d4d8d9e311c0..f275712c77d1d 100644 --- a/python/tvm/relay/testing/init.py +++ b/python/tvm/relay/testing/init.py @@ -176,5 +176,5 @@ def create_workload(net, initializer=None, seed=0): continue init_value = np.zeros(v.concrete_shape).astype(v.dtype) initializer(k, init_value) - params[k] = tvm.nd.array(init_value, ctx=tvm.cpu(0)) + params[k] = tvm.nd.array(init_value, device=tvm.cpu(0)) return mod, params diff --git a/python/tvm/relay/transform/memory_plan.py b/python/tvm/relay/transform/memory_plan.py index 7c7685d58f95e..c97053f976aa2 100644 --- a/python/tvm/relay/transform/memory_plan.py +++ b/python/tvm/relay/transform/memory_plan.py @@ -26,7 +26,7 @@ from .. import op, expr from ..function import Function from ... import register_func, ir, cpu -from ..._ffi.runtime_ctypes import TVMContext +from ..._ffi.runtime_ctypes import Device from ... import IRModule from .. import transform from . 
import function_pass @@ -54,7 +54,7 @@ class Region: size: expr.Expr alignment: Optional[expr.Expr] dtype: Optional[str] - ctx: TVMContext + device: Device offsets: Dict[expr.Var, Tuple[expr.Expr, expr.Expr]] @staticmethod @@ -69,7 +69,7 @@ def grow( old_storage: expr.Var, size: expr.Expr, alignment: expr.Expr, - ctx: TVMContext, + dev: Device, dtype: str, ) -> None: """Grow the region by a given allocation as well as track the old storage @@ -87,13 +87,14 @@ def grow( else: self.alignment = alignment - if self.ctx: + if self.device: assert ( - self.ctx.device_type == ctx.device_type and self.ctx.device_id == ctx.device_id - ), "must have matching context" + self.device.device_type == dev.device_type + and self.device.device_id == dev.device_id + ), "must have matching device" else: - assert ctx - self.ctx = ctx + assert dev + self.device = dev new_size = ( (size + self.alignment - expr.const(1, "int64")) / self.alignment * self.alignment @@ -116,8 +117,8 @@ def to_expr(self, body: expr.Expr) -> expr.Expr: all offset computations. """ - if self.ctx is None: - self.ctx = cpu(0) + if self.device is None: + self.device = cpu(0) # Generate bindings for each and every size computation # we must do this to maintain ANF. @@ -128,7 +129,7 @@ def to_expr(self, body: expr.Expr) -> expr.Expr: bindings.append((total_size, self.size)) # Allocate the entire region with a single call. - alloc = op.memory.alloc_storage(total_size, self.alignment, self.ctx, self.dtype) + alloc = op.memory.alloc_storage(total_size, self.alignment, self.device, self.dtype) bindings.append((self.var, alloc)) # Generate variables which contain all of the offset math. @@ -279,21 +280,21 @@ def process_alloc_storage(self, dynamic_regions, lhs, call): """Process alloc_storage""" size, alignment = call.args dtype = call.attrs.dtype - ctx = TVMContext(call.attrs.device_type, call.attrs.device_id) + dev = Device(call.attrs.device_type, call.attrs.device_id) if not isinstance(size, expr.Constant): self.enter_scope() dynamic_regions.append(lhs) else: # A new scope is created when entering a new region with different - # device context. + # device device. region = self.current_region(dtype) - if region.ctx and region.ctx.device_type != ctx.device_type: + if region.device and region.device.device_type != dev.device_type: self.enter_scope() dynamic_regions.append(lhs) region = self.current_region(dtype) - region.grow(lhs, size, alignment, ctx, dtype) + region.grow(lhs, size, alignment, dev, dtype) return lhs, region.var def process_alloc_tensor(self, lhs, call): diff --git a/python/tvm/rpc/client.py b/python/tvm/rpc/client.py index a50f3b856800d..7196f209712ee 100644 --- a/python/tvm/rpc/client.py +++ b/python/tvm/rpc/client.py @@ -72,8 +72,8 @@ def get_function(self, name): """ return self._sess.get_function(name) - def context(self, dev_type, dev_id=0): - """Construct a remote context. + def device(self, dev_type, dev_id=0): + """Construct a remote device. Parameters ---------- @@ -83,14 +83,14 @@ def context(self, dev_type, dev_id=0): Returns ------- - ctx: TVMContext - The corresponding encoded remote context. + dev: Device + The corresponding encoded remote device. 
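        For illustration (the host and port below are placeholders for a
        running RPC server):

        .. code-block:: python

            from tvm import rpc

            remote = rpc.connect("127.0.0.1", 9090)
            dev = remote.device("cpu", 0)  # formerly remote.context("cpu", 0)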
""" - ctx = nd.context(dev_type, dev_id) + dev = nd.device(dev_type, dev_id) encode = (self._tbl_index + 1) * base.RPC_SESS_MASK - ctx.device_type += encode - ctx._rpc_sess = self - return ctx + dev.device_type += encode + dev._rpc_sess = self + return dev def upload(self, data, target=None): """Upload file to remote runtime temp folder @@ -199,35 +199,35 @@ def download_linked_module(self, path): def cpu(self, dev_id=0): """Construct CPU device.""" - return self.context(1, dev_id) + return self.device(1, dev_id) def gpu(self, dev_id=0): """Construct GPU device.""" - return self.context(2, dev_id) + return self.device(2, dev_id) def cl(self, dev_id=0): """Construct OpenCL device.""" - return self.context(4, dev_id) + return self.device(4, dev_id) def vulkan(self, dev_id=0): """Construct Vulkan device.""" - return self.context(7, dev_id) + return self.device(7, dev_id) def metal(self, dev_id=0): """Construct Metal device.""" - return self.context(8, dev_id) + return self.device(8, dev_id) def ext_dev(self, dev_id=0): """Construct extension device.""" - return self.context(12, dev_id) + return self.device(12, dev_id) def hexagon(self, dev_id=0): """Construct Hexagon device.""" - return self.context(14, dev_id) + return self.device(14, dev_id) def webgpu(self, dev_id=0): """Construct WebGPU device.""" - return self.context(15, dev_id) + return self.device(15, dev_id) class LocalSession(RPCSession): diff --git a/python/tvm/runtime/__init__.py b/python/tvm/runtime/__init__.py index 7d58af70afe1c..fe2ea2d593b58 100644 --- a/python/tvm/runtime/__init__.py +++ b/python/tvm/runtime/__init__.py @@ -20,12 +20,12 @@ from .packed_func import PackedFunc from .object import Object from .object_generic import ObjectGeneric, ObjectTypes -from .ndarray import NDArray, DataType, DataTypeCode, TVMContext +from .ndarray import NDArray, DataType, DataTypeCode, Device from .module import Module # function exposures from .object_generic import convert_to_object, convert, const -from .ndarray import context, cpu, gpu, opencl, cl, vulkan, metal, mtl +from .ndarray import device, cpu, gpu, opencl, cl, vulkan, metal, mtl from .ndarray import vpi, rocm, ext_dev, micro_dev from .module import load_module, enabled, system_lib from .container import String diff --git a/python/tvm/runtime/module.py b/python/tvm/runtime/module.py index 09bef9ecbd6af..41d60683aa3b8 100644 --- a/python/tvm/runtime/module.py +++ b/python/tvm/runtime/module.py @@ -168,7 +168,7 @@ def save(self, file_name, fmt=""): """ _ffi_api.ModuleSaveToFile(self, file_name, fmt) - def time_evaluator(self, func_name, ctx, number=10, repeat=1, min_repeat_ms=0, f_preproc=""): + def time_evaluator(self, func_name, dev, number=10, repeat=1, min_repeat_ms=0, f_preproc=""): """Get an evaluator that measures time cost of running function. Parameters @@ -176,8 +176,8 @@ def time_evaluator(self, func_name, ctx, number=10, repeat=1, min_repeat_ms=0, f func_name: str The name of the function in the module. - ctx: TVMContext - The context we should run this function on. + dev: Device + The device we should run this function on. number: int The number of times to run this function for taking average. 
@@ -215,8 +215,8 @@ def time_evaluator(self, func_name, ctx, number=10, repeat=1, min_repeat_ms=0, f
         feval = _ffi_api.RPCTimeEvaluator(
             self,
             func_name,
-            ctx.device_type,
-            ctx.device_id,
+            dev.device_type,
+            dev.device_id,
             number,
             repeat,
             min_repeat_ms,
diff --git a/python/tvm/runtime/ndarray.py b/python/tvm/runtime/ndarray.py
index 5c60515e34485..980f70d8dc536 100644
--- a/python/tvm/runtime/ndarray.py
+++ b/python/tvm/runtime/ndarray.py
@@ -14,14 +14,14 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-# pylint: disable=invalid-name, unused-import
+# pylint: disable=invalid-name, unused-import, redefined-outer-name
 """Runtime NDArray API"""
 import ctypes
 import numpy as np
 import tvm._ffi

 from tvm._ffi.base import _LIB, check_call, c_array, string_types, _FFI_MODE
-from tvm._ffi.runtime_ctypes import DataType, TVMContext, TVMArray, TVMArrayHandle
+from tvm._ffi.runtime_ctypes import DataType, Device, TVMArray, TVMArrayHandle
 from tvm._ffi.runtime_ctypes import DataTypeCode, tvm_shape_index_t

 from . import _ffi_api
@@ -58,14 +58,9 @@ def dtype(self):
         return str(self.handle.contents.dtype)

     @property
-    def ctx(self):
-        """context of this array"""
-        return self.handle.contents.ctx
-
-    @property
-    def context(self):
-        """context of this array"""
-        return self.ctx
+    def device(self):
+        """Device of this array"""
+        return self.handle.contents.device

    def __hash__(self):
         return ctypes.cast(self.handle, ctypes.c_void_p).value
@@ -158,7 +153,7 @@ def copyfrom(self, source_array):
         return self

     def __repr__(self):
-        res = "<tvm.nd.NDArray shape={0}, {1}>\n".format(self.shape, self.context)
+        res = "<tvm.nd.NDArray shape={0}, {1}>\n".format(self.shape, self.device)
         res += self.asnumpy().__repr__()
         return res

@@ -196,14 +191,14 @@ def copyto(self, target):
         """
         if isinstance(target, NDArrayBase):
             return self._copyto(target)
-        if isinstance(target, TVMContext):
+        if isinstance(target, Device):
             res = empty(self.shape, self.dtype, target)
             return self._copyto(res)
         raise ValueError("Unsupported target type %s" % str(type(target)))


-def context(dev_type, dev_id=0):
-    """Construct a TVM context with given device type and id.
+def device(dev_type, dev_id=0):
+    """Construct a TVM device with given device type and id.

     Parameters
     ----------
@@ -215,29 +210,29 @@ def device(dev_type, dev_id=0):

     Returns
     -------
-    ctx: tvm.runtime.TVMContext
-        The corresponding context.
+    dev: tvm.runtime.Device
+        The corresponding device.

     Examples
     --------
-    Context can be used to create reflection of context by
+    A Device can be constructed from the
     string representation of the device type.

     .. code-block:: python

-      assert tvm.context("cpu", 1) == tvm.cpu(1)
-      assert tvm.context("gpu", 0) == tvm.gpu(0)
-      assert tvm.context("cuda", 0) == tvm.gpu(0)
+      assert tvm.device("cpu", 1) == tvm.cpu(1)
+      assert tvm.device("gpu", 0) == tvm.gpu(0)
+      assert tvm.device("cuda", 0) == tvm.gpu(0)
     """
     if isinstance(dev_type, string_types):
         if "-device=micro_dev" in dev_type:
-            dev_type = TVMContext.STR2MASK["micro_dev"]
+            dev_type = Device.STR2MASK["micro_dev"]
         else:
             dev_type = dev_type.split()[0]
-            if dev_type not in TVMContext.STR2MASK:
+            if dev_type not in Device.STR2MASK:
                 raise ValueError("Unknown device type %s" % dev_type)
-            dev_type = TVMContext.STR2MASK[dev_type]
-    return TVMContext(dev_type, dev_id)
+            dev_type = Device.STR2MASK[dev_type]
+    return Device(dev_type, dev_id)


 def numpyasarray(np_data):
@@ -252,11 +247,11 @@ def numpyasarray(np_data):
     arr.dtype = DataType(np.dtype(data.dtype).name)
     arr.ndim = data.ndim
     # CPU device
-    arr.ctx = context(1, 0)
+    arr.device = device(1, 0)
     return arr, shape


-def empty(shape, dtype="float32", ctx=context(1, 0), mem_scope=None):
+def empty(shape, dtype="float32", device=device(1, 0), mem_scope=None):
     """Create an empty array given shape and device

     Parameters
@@ -267,8 +262,8 @@ def empty(shape, dtype="float32", device=device(1, 0), mem_scope=None):
     dtype : type or str
         The data type of the array.

-    ctx : TVMContext
-        The context of the array.
+    device : Device
+        The device of the array.

     mem_scope : Optional[str]
         The memory scope of the array.
@@ -289,7 +284,7 @@ def empty(shape, dtype="float32", device=device(1, 0), mem_scope=None):
     shape_ptr = ctypes.cast(ptr, ctypes.c_void_p)
     ndim = len(shape_imm)
     dtype = DataType(dtype)
-    arr = _ffi_api.TVMArrayAllocWithScope(shape_ptr, ndim, dtype, ctx, mem_scope)
+    arr = _ffi_api.TVMArrayAllocWithScope(shape_ptr, ndim, dtype, device, mem_scope)
     return arr


@@ -322,10 +317,10 @@ def cpu(dev_id=0):

     Returns
     -------
-    ctx : TVMContext
-        The created context
+    dev : Device
+        The created device
     """
-    return TVMContext(1, dev_id)
+    return Device(1, dev_id)


 def gpu(dev_id=0):
@@ -338,10 +333,10 @@ def gpu(dev_id=0):

     Returns
     -------
-    ctx : TVMContext
-        The created context
+    dev : Device
+        The created device
     """
-    return TVMContext(2, dev_id)
+    return Device(2, dev_id)


 def rocm(dev_id=0):
@@ -354,10 +349,10 @@ def rocm(dev_id=0):

     Returns
     -------
-    ctx : TVMContext
-        The created context
+    dev : Device
+        The created device
     """
-    return TVMContext(10, dev_id)
+    return Device(10, dev_id)


 def opencl(dev_id=0):
@@ -370,10 +365,10 @@ def opencl(dev_id=0):

     Returns
     -------
-    ctx : TVMContext
-        The created context
+    dev : Device
+        The created device
     """
-    return TVMContext(4, dev_id)
+    return Device(4, dev_id)


 def metal(dev_id=0):
@@ -386,10 +381,10 @@ def metal(dev_id=0):

     Returns
     -------
-    ctx : TVMContext
-        The created context
+    dev : Device
+        The created device
     """
-    return TVMContext(8, dev_id)
+    return Device(8, dev_id)


 def vpi(dev_id=0):
@@ -402,10 +397,10 @@ def vpi(dev_id=0):

     Returns
     -------
-    ctx : TVMContext
-        The created context
+    dev : Device
+        The created device
     """
-    return TVMContext(9, dev_id)
+    return Device(9, dev_id)


 def vulkan(dev_id=0):
@@ -418,10 +413,10 @@ def vulkan(dev_id=0):

     Returns
     -------
-    ctx : TVMContext
-        The created context
+    dev : Device
+        The created device
     """
-    return TVMContext(7, dev_id)
+    return Device(7, dev_id)


 def ext_dev(dev_id=0):
@@ -434,15 +429,15 @@ def ext_dev(dev_id=0):

     Returns
     -------
-    ctx : TVMContext
-        The created context
+    dev : Device
+        The created device

     Note
     ----
     This API is reserved
     for quick testing of new device by plugin device API as ext_dev.
     """
-    return TVMContext(12, dev_id)
+    return Device(12, dev_id)


 def micro_dev(dev_id=0):
@@ -455,10 +450,10 @@ def micro_dev(dev_id=0):

     Returns
     -------
-    ctx : TVMContext
-        The created context
+    dev : Device
+        The created device
     """
-    return TVMContext(13, dev_id)
+    return Device(13, dev_id)


 def hexagon(dev_id=0):
@@ -471,10 +466,10 @@ def hexagon(dev_id=0):

     Returns
     -------
-    ctx : TVMContext
-        The created context
+    dev : Device
+        The created device
     """
-    return TVMContext(14, dev_id)
+    return Device(14, dev_id)


 def webgpu(dev_id=0):
@@ -487,17 +482,17 @@ def webgpu(dev_id=0):

     Returns
     -------
-    ctx : TVMContext
-        The created context
+    dev : Device
+        The created device
     """
-    return TVMContext(15, dev_id)
+    return Device(15, dev_id)


 cl = opencl
 mtl = metal


-def array(arr, ctx=cpu(0)):
+def array(arr, device=cpu(0)):
     """Create an array from source arr.

     Parameters
@@ -505,8 +500,8 @@ def array(arr, device=cpu(0)):
     arr : numpy.ndarray
         The array to be copied from

-    ctx : TVMContext, optional
-        The device context to create the array
+    device : Device, optional
+        The device on which to create the array

     Returns
     -------
@@ -515,7 +510,7 @@ def array(arr, device=cpu(0)):
     """
     if not isinstance(arr, (np.ndarray, NDArray)):
         arr = np.array(arr)
-    return empty(arr.shape, arr.dtype, ctx).copyfrom(arr)
+    return empty(arr.shape, arr.dtype, device).copyfrom(arr)


 # Register back to FFI
diff --git a/python/tvm/runtime/profiler_vm.py b/python/tvm/runtime/profiler_vm.py
index 5df10e55f8487..e70ef320289dd 100644
--- a/python/tvm/runtime/profiler_vm.py
+++ b/python/tvm/runtime/profiler_vm.py
@@ -32,15 +32,15 @@ def enabled():
 class VirtualMachineProfiler(vm.VirtualMachine):
     """Relay profile VM runtime."""

-    def __init__(self, exe, ctx, memory_cfg=None):
-        super(VirtualMachineProfiler, self).__init__(exe, ctx, memory_cfg)
+    def __init__(self, exe, device, memory_cfg=None):
+        super(VirtualMachineProfiler, self).__init__(exe, device, memory_cfg)
         self.module = _ffi_api._VirtualMachineDebug(exe.module)
         self._init = self.module["init"]
         self._invoke = self.module["invoke"]
         self._get_stat = self.module["get_stat"]
         self._set_input = self.module["set_input"]
         self._reset = self.module["reset"]
-        self._setup_ctx(ctx, memory_cfg)
+        self._setup_device(device, memory_cfg)

     def get_stat(self, sort_by_time=True):
         """Get the statistics of executed ops.
diff --git a/python/tvm/runtime/vm.py b/python/tvm/runtime/vm.py
index d641e52d71842..a503da53c465e 100644
--- a/python/tvm/runtime/vm.py
+++ b/python/tvm/runtime/vm.py
@@ -33,7 +33,7 @@ def _convert(arg, cargs):
     if isinstance(arg, Object):
         cargs.append(arg)
     elif isinstance(arg, np.ndarray):
-        nd_arr = tvm.nd.array(arg, ctx=tvm.cpu(0))
+        nd_arr = tvm.nd.array(arg, device=tvm.cpu(0))
         cargs.append(nd_arr)
     elif isinstance(arg, tvm.runtime.NDArray):
         cargs.append(arg)
@@ -44,7 +44,7 @@ def _convert(arg, cargs):
         cargs.append(container.tuple_object(field_args))
     elif isinstance(arg, (_base.numeric_types, bool)):
         dtype = "int32" if isinstance(arg, (int, bool)) else "float32"
-        value = tvm.nd.array(np.array(arg, dtype=dtype), ctx=tvm.cpu(0))
+        value = tvm.nd.array(np.array(arg, dtype=dtype), device=tvm.cpu(0))
         cargs.append(value)
     else:
         raise TypeError("Unsupported type: %s" % (type(arg)))
@@ -115,7 +115,7 @@ def save(self):
         f = relay.Function([x], x + x)
         mod = tvm.IRModule({"main": f})
         # create a Relay VM.
-        ctx = tvm.cpu()
+        dev = tvm.cpu()
         target = "llvm"
         executable = relay.vm.compile(mod, target)
         code, lib = executable.save()
@@ -131,7 +131,7 @@ def save(self):
         des_exec = tvm.runtime.vm.Executable.load_exec(loaded_code, loaded_lib)
         # execute the deserialized executable.
         x_data = np.random.rand(10, 10).astype('float32')
-        des_vm = tvm.runtime.vm.VirtualMachine(des_exec, ctx)
+        des_vm = tvm.runtime.vm.VirtualMachine(des_exec, dev)
         res = des_vm.run(x_data)
         print(res.asnumpy())
     """
@@ -283,14 +283,14 @@ class VirtualMachine(object):
     exe : Executable
         The VM executable.

-    ctx : tvm.runtime.TVMContext or List[tvm.runtime.TVMContext]
-        The context to deploy the module
+    device : tvm.runtime.Device or List[tvm.runtime.Device]
+        The device to deploy the module

-    memory_cfg : str or Dict[tvm.runtime.TVMContext, str], optional
+    memory_cfg : str or Dict[tvm.runtime.Device, str], optional
         Configure the type of memory allocator. The allocator type can be ["naive",
-        "pooled"]. If memory_cfg is None, all contexts will use pooled allocator
-        by default. If memory_cfg is string, all contexts will use the specified
-        allocator type. If memory_cfg is a dict, each context uses the allocator
+        "pooled"]. If memory_cfg is None, all devices will use pooled allocator
+        by default. If memory_cfg is a string, all devices will use the specified
+        allocator type. If memory_cfg is a dict, each device uses the allocator
         type specified in the dict, or pooled allocator if not specified in the
         dict.
     """
@@ -298,7 +298,7 @@ class VirtualMachine(object):
     NAIVE_ALLOCATOR = 1
     POOLED_ALLOCATOR = 2

-    def __init__(self, exe, ctx, memory_cfg=None):
+    def __init__(self, exe, device, memory_cfg=None):
         if not isinstance(exe, Executable):
             raise TypeError(
                 "exe is expected to be the type of Executable, "
@@ -309,22 +309,22 @@ def __init__(self, exe, device, memory_cfg=None):
         self._init = self.module["init"]
         self._invoke = self.module["invoke"]
         self._set_input = self.module["set_input"]
-        self._setup_ctx(ctx, memory_cfg)
+        self._setup_device(device, memory_cfg)

-    def _setup_ctx(self, ctx, memory_cfg):
-        """Init context and allocators."""
-        ctxs = ctx
-        if not isinstance(ctx, (list, tuple)):
-            if not isinstance(ctx, tvm.runtime.TVMContext):
+    def _setup_device(self, dev, memory_cfg):
+        """Init devices and allocators."""
+        devs = dev
+        if not isinstance(dev, (list, tuple)):
+            if not isinstance(dev, tvm.runtime.Device):
                 raise TypeError(
-                    "ctx is expected to be TVMContext or \
-                    List[TVMContext]"
+                    "dev is expected to be Device or \
+                    List[Device]"
                 )
-            ctxs = [ctx]
+            devs = [dev]

         # CPU is required for executing shape functions
-        if not any(c.device_type == tvm.cpu().device_type for c in ctxs):
-            ctxs.append(tvm.cpu())
+        if not any(c.device_type == tvm.cpu().device_type for c in devs):
+            devs.append(tvm.cpu())

         default_alloc_type = VirtualMachine.POOLED_ALLOCATOR
         if memory_cfg is None:
@@ -340,10 +340,10 @@ def _setup_device(self, dev, memory_cfg):
                 + "but received {}".format(type(memory_cfg))
             )
         init_args = []
-        for context in ctxs:
-            init_args.append(context.device_type)
-            init_args.append(context.device_id)
-            alloc_type = memory_cfg[context] if context in memory_cfg else default_alloc_type
+        for device in devs:
+            init_args.append(device.device_type)
+            init_args.append(device.device_id)
+            alloc_type = memory_cfg[device] if device in memory_cfg else default_alloc_type
             init_args.append(alloc_type)
         self._init(*init_args)
diff --git a/python/tvm/testing.py b/python/tvm/testing.py
index 1cb43b29c521d..df0f2afc583f4 100644
--- a/python/tvm/testing.py
+++ b/python/tvm/testing.py
@@ -375,7 +375,7 @@ def _get_targets():
         if len(dev) == 0:
             continue
         target_kind = dev.split()[0]
-        if tvm.runtime.enabled(target_kind) and tvm.context(target_kind, 0).exist:
+        if tvm.runtime.enabled(target_kind) and tvm.device(target_kind, 0).exist:
             targets.add(dev)
     if len(targets) == 0:
         logging.warning(
@@ -450,7 +450,7 @@ def enabled_targets():
     targets: list
         A list of pairs of all enabled devices and the associated device
     """
-    return [(tgt, tvm.context(tgt)) for tgt in _get_targets()]
+    return [(tgt, tvm.device(tgt)) for tgt in _get_targets()]


 def _compose(args, decs):
@@ -703,7 +703,7 @@ def parametrize_targets(*args):
     Parameters
     ----------
     f : function
-        Function to parametrize. Must be of the form `def test_xxxxxxxxx(target, ctx)`:,
+        Function to parametrize. Must be of the form `def test_xxxxxxxxx(target, dev)`,
         where `xxxxxxxxx` is any name.
     targets : list[str], optional
         Set of targets to run against. If not supplied,
@@ -712,23 +712,23 @@ def parametrize_targets(*args):

     Example
     -------
     >>> @tvm.testing.parametrize_targets
-    >>> def test_mytest(target, ctx):
+    >>> def test_mytest(target, dev):
     >>>     ...  # do something

     Or

     >>> @tvm.testing.parametrize_targets("llvm", "cuda")
-    >>> def test_mytest(target, ctx):
+    >>> def test_mytest(target, dev):
     >>>     ...  # do something
     """

     def wrap(targets):
         def func(f):
             params = [
-                pytest.param(target, tvm.context(target, 0), marks=_target_to_requirement(target))
+                pytest.param(target, tvm.device(target, 0), marks=_target_to_requirement(target))
                 for target in targets
             ]
-            return pytest.mark.parametrize("target,ctx", params)(f)
+            return pytest.mark.parametrize("target,dev", params)(f)

         return func
diff --git a/python/tvm/tir/buffer.py b/python/tvm/tir/buffer.py
index 95966a5050e1d..d7067a5bdd940 100644
--- a/python/tvm/tir/buffer.py
+++ b/python/tvm/tir/buffer.py
@@ -217,10 +217,10 @@ def decl_buffer(
         Bb = tvm.tir.decl_buffer(B.shape, B.dtype, name="Bb", buffer_type="auto_broadcast")
         s = te.create_schedule(C.op)
         fadd = tvm.build(s, [A, B, C], target='llvm', name='bcast_add', binds={A:Ab, B:Bb})
-        ctx = tvm.cpu(0)
-        a = tvm.nd.array(np.random.uniform(size=(2, 4, 3)).astype(A.dtype), ctx)
-        b = tvm.nd.array(np.random.uniform(size=(2, 1, 3)).astype(B.dtype), ctx)
-        c = tvm.nd.array(np.zeros((2, 4, 3), dtype=C.dtype), ctx)
+        dev = tvm.cpu(0)
+        a = tvm.nd.array(np.random.uniform(size=(2, 4, 3)).astype(A.dtype), dev)
+        b = tvm.nd.array(np.random.uniform(size=(2, 1, 3)).astype(B.dtype), dev)
+        c = tvm.nd.array(np.zeros((2, 4, 3), dtype=C.dtype), dev)
         fadd(a, b, c)
         tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())

diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py
index cc29d4fb54ad6..c83dae0d3b96d 100644
--- a/python/tvm/topi/cuda/nms.py
+++ b/python/tvm/topi/cuda/nms.py
@@ -871,10 +871,10 @@ def non_max_suppression(
         np_valid_count = np.array([4])
         s = topi.generic.schedule_nms(out)
         f = tvm.build(s, [data, valid_count, out], "cuda")
-        ctx = tvm.gpu(0)
-        tvm_data = tvm.nd.array(np_data, ctx)
-        tvm_valid_count = tvm.nd.array(np_valid_count, ctx)
-        tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx)
+        dev = tvm.gpu(0)
+        tvm_data = tvm.nd.array(np_data, dev)
+        tvm_valid_count = tvm.nd.array(np_valid_count, dev)
+        tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), dev)
         f(tvm_data, tvm_valid_count, tvm_out)
     """
     data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8)
diff --git a/python/tvm/topi/random/kernel.py b/python/tvm/topi/random/kernel.py
index a09a5f3f4ae32..b6c0b3fa59306 100644
--- a/python/tvm/topi/random/kernel.py
+++ b/python/tvm/topi/random/kernel.py
@@ -430,14 +430,14 @@ def gen_ir(gen_ptr, out_left_ptr, out_right_ptr):
     )


-def threefry_test_wrapping(target, ctx):
+def threefry_test_wrapping(target, device):
     """Test that unsigned arithmetic wraps on overflow.

     Parameters
     ----------
     target : tvm.target.Target
         Target to run against
-    ctx : tvm.runtime.TVMContext
+    device : tvm.runtime.Device
         Device to run the test on

     Returns
@@ -463,6 +463,6 @@ def gen_ir(out_ptr):
         [out.shape], [], lambda ins, outs: gen_ir(outs[0]), dtype="uint64", out_buffers=[out]
     )
     s = tvm.te.create_schedule([f.op])
-    out_ary = tvm.nd.array(np.ones((1,), "uint64"), ctx)
+    out_ary = tvm.nd.array(np.ones((1,), "uint64"), device)
     tvm.build(s, [f], target=target)(out_ary)
     return out_ary.asnumpy()[0] == 0
diff --git a/python/tvm/topi/sort.py b/python/tvm/topi/sort.py
index 8964e363b06f4..5b8e33413d65e 100644
--- a/python/tvm/topi/sort.py
+++ b/python/tvm/topi/sort.py
@@ -104,9 +104,9 @@ def argsort(data, valid_count=None, axis=-1, is_ascend=1, dtype="float32"):
         np_data = np.random.uniform(dshape)
         s = topi.generic.schedule_argsort(out)
         f = tvm.build(s, [data, out], "llvm")
-        ctx = tvm.cpu()
-        tvm_data = tvm.nd.array(np_data, ctx)
-        tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx)
+        dev = tvm.cpu()
+        tvm_data = tvm.nd.array(np_data, dev)
+        tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), dev)
         f(tvm_data, tvm_out)
     """
     data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8)
diff --git a/python/tvm/topi/testing/common.py b/python/tvm/topi/testing/common.py
index e4e5e811ab18e..69ffc1482ba1a 100644
--- a/python/tvm/topi/testing/common.py
+++ b/python/tvm/topi/testing/common.py
@@ -81,7 +81,7 @@ def get_conv2d_nchw_implement(target):
     return dispatch(target, _conv2d_nchw_implement)


-def compare_numpy_tvm(inputs, output, target, ctx, compute, schedule):
+def compare_numpy_tvm(inputs, output, target, device, compute, schedule):
     """Compare numpy inputs and output of a function to the results of the TVM version.

     Parameters
@@ -92,7 +92,7 @@ def compare_numpy_tvm(inputs, output, target, device, compute, schedule):
         Verified correct function output.
     target : tvm.target.Target
         Target to run on.
-    ctx : tvm.TVMContext
+    device : tvm.runtime.Device
         Device to run on.
     compute : callable
         Topi compute function to test against.
@@ -100,11 +100,11 @@
         Topi scheduling function to test against.
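    For example, an element-wise check might look like this sketch (the shapes
    and the use of `topi.add` / `topi.generic.schedule_injective` are
    illustrative):

    .. code-block:: python

        a = np.random.uniform(size=(16,)).astype("float32")
        b = np.random.uniform(size=(16,)).astype("float32")
        compare_numpy_tvm(
            [a, b],
            a + b,
            "llvm",
            tvm.cpu(0),
            lambda x, y: topi.add(x, y),
            topi.generic.schedule_injective,
        )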
""" te_inputs = [tvm.te.placeholder(shape=i.shape, dtype=str(i.dtype)) for i in inputs] - te_out = tvm.nd.array(np.zeros(output.shape).astype(output.dtype), ctx=ctx) + te_out = tvm.nd.array(np.zeros(output.shape).astype(output.dtype), device=device) with tvm.target.Target(target): out = compute(*te_inputs) s = schedule([out]) func = tvm.build(s, te_inputs + [out]) - arys = [tvm.nd.array(x, ctx=ctx) for x in inputs] + arys = [tvm.nd.array(x, device=device) for x in inputs] func(*(arys + [te_out])) assert_allclose(te_out.asnumpy(), output, atol=1e-4, rtol=1e-4) diff --git a/python/tvm/topi/vision/nms.py b/python/tvm/topi/vision/nms.py index cbf136a5552c1..8be62a73c09ef 100644 --- a/python/tvm/topi/vision/nms.py +++ b/python/tvm/topi/vision/nms.py @@ -545,10 +545,10 @@ def non_max_suppression( np_valid_count = np.array([4]) s = topi.generic.schedule_nms(out) f = tvm.build(s, [data, valid_count, out], "llvm") - ctx = tvm.cpu() - tvm_data = tvm.nd.array(np_data, ctx) - tvm_valid_count = tvm.nd.array(np_valid_count, ctx) - tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx) + dev = tvm.cpu() + tvm_data = tvm.nd.array(np_data, dev) + tvm_valid_count = tvm.nd.array(np_valid_count, dev) + tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), dev) f(tvm_data, tvm_valid_count, tvm_out) """ batch_size = data.shape[0] diff --git a/rust/tvm-graph-rt/src/array.rs b/rust/tvm-graph-rt/src/array.rs index deacf11bec04e..8ae716a3266f1 100644 --- a/rust/tvm-graph-rt/src/array.rs +++ b/rust/tvm-graph-rt/src/array.rs @@ -20,7 +20,7 @@ use std::{convert::TryFrom, mem, os::raw::c_void, ptr, slice}; use ndarray; -use tvm_sys::{ffi::DLTensor, Context, DataType}; +use tvm_sys::{ffi::DLTensor, DataType, Device}; use crate::allocator::Allocation; use crate::errors::ArrayError; @@ -151,7 +151,7 @@ impl<'d, 's, T> From<&'d [T]> for Storage<'s> { pub struct Tensor<'a> { /// The bytes which contain the data this `Tensor` represents. 
     pub(crate) data: Storage<'a>,
-    pub(crate) ctx: Context,
+    pub(crate) device: Device,
     pub(crate) dtype: DataType,
     pub(crate) shape: Vec<i64>,
     // ^ not usize because `typedef int64_t tvm_index_t` in c_runtime_api.h
@@ -243,7 +243,7 @@ impl<'a> Tensor<'a> {
     pub fn to_owned(&self) -> Tensor<'static> {
         let t = Tensor {
             data: self.data.to_owned(),
-            ctx: self.ctx,
+            device: self.device,
             dtype: self.dtype,
             size: self.size,
             shape: self.shape.clone(),
@@ -262,7 +262,7 @@ impl<'a> Tensor<'a> {

         Tensor {
             data: storage,
-            ctx: Context::default(),
+            device: Device::default(),
             dtype: dtype_fn(8 * type_width, 1),
             size: arr.len(),
             shape: arr.shape().iter().map(|&v| v as i64).collect(),
@@ -275,7 +275,7 @@ impl<'a> Tensor<'a> {
         assert!(!flatten || self.is_contiguous());
         DLTensor {
             data: unsafe { self.data.as_mut_ptr().offset(self.byte_offset) } as *mut c_void,
-            ctx: self.ctx.into(),
+            device: self.device.into(),
             ndim: if flatten { 1 } else { self.shape.len() } as i32,
             dtype: self.dtype.into(),
             shape: if flatten {
@@ -356,7 +356,7 @@ impl<'a> From<DLTensor> for Tensor<'a> {
         ));
         Self {
             data: storage,
-            ctx: Context::default(),
+            device: Device::default(),
             dtype,
             size,
             shape,
diff --git a/rust/tvm-graph-rt/src/graph.rs b/rust/tvm-graph-rt/src/graph.rs
index 83fe37ea7970b..0174b0cacf3ee 100644
--- a/rust/tvm-graph-rt/src/graph.rs
+++ b/rust/tvm-graph-rt/src/graph.rs
@@ -33,7 +33,7 @@ use serde_json;

 use tvm_sys::ffi::{DLDataTypeCode_kDLFloat, DLDataTypeCode_kDLInt, DLDataTypeCode_kDLUInt};

-use tvm_sys::{ffi::DLTensor, ArgValue, Context, DataType, DeviceType};
+use tvm_sys::{ffi::DLTensor, ArgValue, DataType, Device, DeviceType};

 use crate::{errors::*, Module, Storage, Tensor};

@@ -240,7 +240,7 @@ impl<'m, 't> GraphExecutor<'m, 't> {
             let storage = storages[storage_id].view();
             Tensor {
                 data: mem::replace(&mut storages[storage_id], storage),
-                ctx: Context::default(),
+                device: Device::default(),
                 dtype,
                 size: shape.iter().product::<i64>() as usize,
                 shape,
@@ -418,14 +418,14 @@ named! {
     )
 }

-// Parses a Context
+// Parses a Device
 named! {
-    tvm_ctx<&[u8], Context>,
+    tvm_device<&[u8], Device>,
     do_parse!(
         device_type: le_u32 >>
         device_id: le_i32 >>
         (
-            Context {
+            Device {
                 device_type: DeviceType::from(device_type),
                 device_id: device_id as usize,
             }
@@ -449,7 +449,7 @@ named! {
     do_parse!(
         take!(8) >>
         le_u64 >>
-        ctx: tvm_ctx >>
+        device: tvm_device >>
         ndim: le_u32 >>
         dtype: data_type >>
         shape: count!(map!(le_i64, |sz| sz as i64), ndim as usize) >>
         (
             Tensor {
                 data: Storage::from(data),
-                ctx: ctx,
+                device: device,
                 dtype: dtype,
                 size: shape.iter().product::<i64>() as usize,
                 shape: shape,
diff --git a/rust/tvm-rt/src/context.rs b/rust/tvm-rt/src/device.rs
similarity index 91%
rename from rust/tvm-rt/src/context.rs
rename to rust/tvm-rt/src/device.rs
index b0fea33c6c612..b1cb58cd54cff 100644
--- a/rust/tvm-rt/src/context.rs
+++ b/rust/tvm-rt/src/device.rs
@@ -24,10 +24,10 @@
 use crate::errors::Error;
 use tvm_sys::ffi;

-pub use tvm_sys::context::*;
+pub use tvm_sys::device::*;

-trait ContextExt {
-    /// Checks whether the context exists or not.
+trait DeviceExt {
+    /// Checks whether the device exists or not.
     fn exist(&self) -> bool;
     fn sync(&self) -> Result<(), Error>;
     fn max_threads_per_block(&self) -> isize;
@@ -57,7 +57,7 @@ crate::external! {
     fn get_device_attr(device_type: i32, device_id: i32, device_kind: i32) -> i32;
 }

-impl ContextExt for Context {
+impl DeviceExt for Device {
     fn exist(&self) -> bool {
         let exists = get_device_attr(self.device_type as i32, self.device_id as i32, 0)
             .expect("should not fail");
@@ -65,7 +65,7 @@ impl DeviceExt for Device {
         exists != 0
     }

-    /// Synchronize the context stream.
+    /// Synchronize the device stream.
     fn sync(&self) -> Result<(), Error> {
         check_call!(ffi::TVMSynchronize(
             self.device_type as i32,
@@ -91,7 +91,7 @@ mod tests {

     #[test]
     fn sync() {
-        let ctx = Context::cpu(0);
-        assert!(ctx.sync().is_ok())
+        let dev = Device::cpu(0);
+        assert!(dev.sync().is_ok())
     }
 }
diff --git a/rust/tvm-rt/src/lib.rs b/rust/tvm-rt/src/lib.rs
index 5f9ab16173782..c43264da9e5b1 100644
--- a/rust/tvm-rt/src/lib.rs
+++ b/rust/tvm-rt/src/lib.rs
@@ -38,7 +38,7 @@ use std::{
 };

 pub use crate::{
-    context::{Context, DeviceType},
+    device::{Device, DeviceType},
     errors::*,
     function::Function,
     module::Module,
@@ -92,7 +92,7 @@ pub(crate) fn set_last_error(err: &E) {
 }

 pub mod array;
-pub mod context;
+pub mod device;
 pub mod errors;
 pub mod function;
 pub mod map;
@@ -111,7 +111,7 @@ pub fn version() -> &'static str {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::{ByteArray, Context, DataType};
+    use crate::{ByteArray, DataType, Device};
     use std::{convert::TryInto, str::FromStr};

     #[test]
@@ -148,9 +148,9 @@ mod tests {
     }

     #[test]
-    fn ctx() {
-        let c = Context::from_str("gpu").unwrap();
-        let tvm: Context = RetValue::from(c).try_into().unwrap();
+    fn device() {
+        let c = Device::from_str("gpu").unwrap();
+        let tvm: Device = RetValue::from(c).try_into().unwrap();
         assert_eq!(tvm, c);
     }
 }
diff --git a/rust/tvm-rt/src/ndarray.rs b/rust/tvm-rt/src/ndarray.rs
index 07f783f0ef438..4c48ce50b4f34 100644
--- a/rust/tvm-rt/src/ndarray.rs
+++ b/rust/tvm-rt/src/ndarray.rs
@@ -20,16 +20,16 @@
 //! This module implements the [`NDArray`] type for working with *TVM tensors* or
 //! converting from a Rust ndarray to TVM `NDArray`.
 //!
-//! One can create an empty NDArray given the shape, context and dtype using [`empty`].
+//! One can create an empty NDArray given the shape, device and dtype using [`empty`].
 //! To create an NDArray from a mutable buffer in cpu use [`copy_from_buffer`].
-//! To copy an NDArray to different context use [`copy_to_ctx`].
+//! To copy an NDArray to a different device use [`copy_to_device`].
 //!
 //! Given a [`Rust's dynamic ndarray`], one can convert it to TVM NDArray as follows:
 //!
 //! # Example
 //!
 //! ```
-//! # use tvm_rt::{NDArray, Context, DataType};
+//! # use tvm_rt::{NDArray, DataType, Device};
 //! # use ndarray::{Array, ArrayD};
 //! # use std::str::FromStr;
 //! use std::convert::TryFrom;
@@ -37,7 +37,7 @@
 //! let a = Array::from_shape_vec((2, 2), vec![1f32, 2., 3., 4.])
 //!     .unwrap()
 //!     .into_dyn(); // Rust's ndarray
-//! let nd = NDArray::from_rust_ndarray(&a, Context::cpu(0), DataType::from_str("float32").unwrap()).unwrap();
+//! let nd = NDArray::from_rust_ndarray(&a, Device::cpu(0), DataType::from_str("float32").unwrap()).unwrap();
 //! assert_eq!(nd.shape(), &[2, 2]);
 //! let rnd: ArrayD<f32> = ArrayD::try_from(&nd).unwrap();
 //! assert!(rnd.all_close(&a, 1e-8f32));
 //! ```
 //!
 //! [`Rust's dynamic ndarray`]:https://docs.rs/ndarray/0.12.1/ndarray/
 //! [`copy_from_buffer`]:struct.NDArray.html#method.copy_from_buffer
-//! [`copy_to_ctx`]:struct.NDArray.html#method.copy_to_ctx
+//! [`copy_to_device`]:struct.NDArray.html#method.copy_to_device

 use std::ffi::c_void;
 use std::{borrow::Cow, convert::TryInto};
@@ -54,7 +54,7 @@ use std::{convert::TryFrom, mem, os::raw::c_int, ptr, slice, str::FromStr};
 use mem::size_of;
 use tvm_macros::Object;
 use tvm_sys::ffi::DLTensor;
-use tvm_sys::{ffi, ByteArray, Context, DataType};
+use tvm_sys::{ffi, ByteArray, DataType, Device};

 use ndarray::{Array, ArrayD};
 use num_traits::Num;
@@ -189,9 +189,9 @@ impl NDArray {
         self.len() * self.dtype().itemsize()
     }

-    /// Returns the context which the NDArray was defined.
-    pub fn ctx(&self) -> Context {
-        self.as_dltensor().ctx.into()
+    /// Returns the device on which the NDArray was defined.
+    pub fn device(&self) -> Device {
+        self.as_dltensor().device.into()
     }

     /// Returns the type of the entries of the NDArray.
@@ -239,12 +239,12 @@ impl NDArray {
     /// ## Example
     ///
     /// ```
-    /// # use tvm_rt::{Context, DataType, NDArray};
+    /// # use tvm_rt::{Device, DataType, NDArray};
     /// # use std::str::FromStr;
     /// let mut shape = [4];
     /// let mut data = vec![1i32, 2, 3, 4];
-    /// let ctx = Context::cpu(0);
-    /// let mut ndarray = NDArray::empty(&mut shape, ctx, DataType::from_str("int32").unwrap());
+    /// let dev = Device::cpu(0);
+    /// let mut ndarray = NDArray::empty(&mut shape, dev, DataType::from_str("int32").unwrap());
     /// ndarray.copy_from_buffer(&mut data);
     /// assert_eq!(ndarray.shape(), shape);
     /// assert_eq!(ndarray.to_vec::<i32>().unwrap(), data);
@@ -272,12 +272,12 @@ impl NDArray {
     /// ## Example
     ///
     /// ```
-    /// # use tvm_rt::{Context, DataType, NDArray};
+    /// # use tvm_rt::{Device, DataType, NDArray};
     /// # use std::str::FromStr;
     /// let shape = &mut [2];
     /// let mut data = vec![1f32, 2.0];
-    /// let ctx = Context::cpu(0);
-    /// let mut ndarray = NDArray::empty(shape, ctx, DataType::from_str("int32").unwrap());
+    /// let dev = Device::cpu(0);
+    /// let mut ndarray = NDArray::empty(shape, dev, DataType::from_str("int32").unwrap());
     /// ndarray.copy_from_buffer(&mut data);
     /// ```
     ///
@@ -332,8 +332,8 @@ impl NDArray {
         Ok(target)
     }

-    /// Copies the NDArray to a target context.
-    pub fn copy_to_ctx(&self, target: &Context) -> Result<NDArray> {
+    /// Copies the NDArray to a target device.
+    pub fn copy_to_device(&self, target: &Device) -> Result<NDArray> {
         let tmp = NDArray::empty(self.shape(), *target, self.dtype());
         let copy = self.copy_to_ndarray(tmp)?;
         Ok(copy)
@@ -342,17 +342,17 @@ impl NDArray {
     /// Converts a Rust's ndarray to TVM NDArray.
     pub fn from_rust_ndarray<T: Num32 + Copy>(
         input_nd: &ArrayD<T>,
-        ctx: Context,
+        dev: Device,
         dtype: DataType,
     ) -> Result<Self> {
         let shape: Vec<i64> = input_nd.shape().iter().map(|&x| x as i64).collect();
-        let mut nd = NDArray::empty(&shape, ctx, dtype);
+        let mut nd = NDArray::empty(&shape, dev, dtype);
         nd.fill_from_iter(input_nd.iter().copied());
         Ok(nd)
     }

-    /// Allocates and creates an empty NDArray given the shape, context and dtype.
-    pub fn empty(shape: &[i64], ctx: Context, dtype: DataType) -> NDArray {
+    /// Allocates and creates an empty NDArray given the shape, device and dtype.
+    pub fn empty(shape: &[i64], dev: Device, dtype: DataType) -> NDArray {
         let mut handle = ptr::null_mut() as ffi::TVMArrayHandle;
         let dtype: tvm_sys::ffi::DLDataType = dtype.into();
         check_call!(ffi::TVMArrayAlloc(
@@ -361,8 +361,8 @@ impl NDArray {
             i32::from(dtype.code) as c_int,
             i32::from(dtype.bits) as c_int,
             i32::from(dtype.lanes) as c_int,
-            ctx.device_type as c_int,
-            ctx.device_id as c_int,
+            dev.device_type as c_int,
+            dev.device_id as c_int,
             &mut handle as *mut _,
         ));
         let ptr = NDArrayContainer::from_raw(handle)
@@ -441,9 +441,9 @@ mod tests {
     #[test]
     fn basics() {
         let shape = &[1, 2, 3];
-        let ctx = Context::cpu(0);
+        let dev = Device::cpu(0);
         println!("before empty");
-        let ndarray = NDArray::empty(shape, ctx, DataType::from_str("int32").unwrap());
+        let ndarray = NDArray::empty(shape, dev, DataType::from_str("int32").unwrap());
         println!("after empty");
         assert_eq!(ndarray.shape(), shape);
         assert_eq!(ndarray.len(), shape.iter().product::<i64>() as usize);
@@ -456,8 +456,8 @@ mod tests {
     fn copy() {
         let shape = &[4];
         let data = vec![1i32, 2, 3, 4];
-        let ctx = Context::cpu(0);
-        let mut ndarray = NDArray::empty(shape, ctx, DataType::int(32, 1)).zeroed();
+        let dev = Device::cpu(0);
+        let mut ndarray = NDArray::empty(shape, dev, DataType::int(32, 1)).zeroed();
         assert_eq!(ndarray.to_vec::<i32>().unwrap(), vec![0, 0, 0, 0]);
         ndarray.copy_from_buffer(&data);
         assert_eq!(ndarray.shape(), shape);
         assert_eq!(ndarray.to_vec::<i32>().unwrap(), data);
         assert!(ndarray.is_contiguous());
         assert_eq!(ndarray.byte_offset(), 0);
         let shape = vec![4];
-        let e = NDArray::empty(
-            &shape,
-            Context::cpu(0),
-            DataType::from_str("int32").unwrap(),
-        );
+        let e = NDArray::empty(&shape, Device::cpu(0), DataType::from_str("int32").unwrap());
         let nd = ndarray.copy_to_ndarray(e);
         assert!(nd.is_ok());
         assert_eq!(nd.unwrap().to_vec::<i32>().unwrap(), data);
@@ -482,10 +478,10 @@ mod tests {
     fn copy_wrong_dtype() {
         let shape = vec![4];
         let mut data = vec![1f32, 2., 3., 4.];
-        let ctx = Context::cpu(0);
-        let mut nd_float = NDArray::empty(&shape, ctx, DataType::from_str("float32").unwrap());
+        let dev = Device::cpu(0);
+        let mut nd_float = NDArray::empty(&shape, dev, DataType::from_str("float32").unwrap());
         nd_float.copy_from_buffer(&mut data);
-        let empty_int = NDArray::empty(&shape, ctx, DataType::from_str("int32").unwrap());
+        let empty_int = NDArray::empty(&shape, dev, DataType::from_str("int32").unwrap());
         nd_float.copy_to_ndarray(empty_int).unwrap();
     }

@@ -495,7 +491,7 @@ mod tests {
             .unwrap()
             .into_dyn();
         let nd =
-            NDArray::from_rust_ndarray(&a, Context::cpu(0), DataType::from_str("float32").unwrap())
+            NDArray::from_rust_ndarray(&a, Device::cpu(0), DataType::from_str("float32").unwrap())
                 .unwrap();
         assert_eq!(nd.shape(), &[2, 2]);
         let rnd: ArrayD<f32> = ArrayD::try_from(&nd).unwrap();
diff --git a/rust/tvm-sys/src/array.rs b/rust/tvm-sys/src/array.rs
index 5d09d8670eda3..92208303e89ce 100644
--- a/rust/tvm-sys/src/array.rs
+++ b/rust/tvm-sys/src/array.rs
@@ -23,7 +23,7 @@ use std::{
 };

 use crate::ffi::{
-    DLContext, DLDataType, DLDataTypeCode_kDLFloat, DLDataTypeCode_kDLInt, DLDataTypeCode_kDLUInt,
+    DLDataType, DLDataTypeCode_kDLFloat, DLDataTypeCode_kDLInt, DLDataTypeCode_kDLUInt, DLDevice,
     DLDeviceType_kDLCPU, DLTensor,
 };

@@ -35,7 +35,7 @@ macro_rules!
impl_dltensor_from_ndarray { fn from(arr: &'a mut ndarray::Array<$type, D>) -> Self { DLTensor { data: arr.as_mut_ptr() as *mut c_void, - ctx: DLContext { + device: DLDevice { device_type: DLDeviceType_kDLCPU, device_id: 0, }, diff --git a/rust/tvm-sys/src/context.rs b/rust/tvm-sys/src/device.rs similarity index 71% rename from rust/tvm-sys/src/context.rs rename to rust/tvm-sys/src/device.rs index a5165fccf0aab..910cc59734088 100644 --- a/rust/tvm-sys/src/context.rs +++ b/rust/tvm-sys/src/device.rs @@ -17,25 +17,25 @@ * under the License. */ -//! Provides [`Context`] and related device queries. +//! Provides [`Device`] and related device queries. //! -//! Create a new context for device type and device id. +//! Create a new device for device type and device id. //! //! # Example //! //! ``` -//! # use tvm_sys::{DeviceType, Context}; +//! # use tvm_sys::{DeviceType, Device}; //! let cpu = DeviceType::from("cpu"); -//! let ctx = Context::new(cpu , 0); -//! let cpu0 = Context::cpu(0); -//! assert_eq!(ctx, cpu0); +//! let dev = Device::new(cpu , 0); +//! let cpu0 = Device::cpu(0); +//! assert_eq!(dev, cpu0); //! ``` //! //! Or from a supported device name. //! //! ``` -//! use tvm_sys::Context; -//! let cpu0 = Context::from("cpu"); +//! use tvm_sys::Device; +//! let cpu0 = Device::from("cpu"); //! println!("{}", cpu0); //! ``` @@ -141,30 +141,30 @@ impl<'a> From<&DeviceType> for ArgValue<'a> { } #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] -pub struct Context { +pub struct Device { pub device_type: DeviceType, pub device_id: usize, } -impl Context { - pub fn new(device_type: DeviceType, device_id: usize) -> Context { - Context { +impl Device { + pub fn new(device_type: DeviceType, device_id: usize) -> Device { + Device { device_type, device_id, } } } -impl<'a> From<&'a Context> for DLContext { - fn from(ctx: &'a Context) -> Self { +impl<'a> From<&'a Device> for DLDevice { + fn from(dev: &'a Device) -> Self { Self { - device_type: ctx.device_type.into(), - device_id: ctx.device_id as i32, + device_type: dev.device_type.into(), + device_id: dev.device_id as i32, } } } -impl Default for Context { +impl Default for Device { fn default() -> Self { Self { device_type: DLDeviceType_kDLCPU.into(), @@ -177,10 +177,10 @@ impl Default for Context { #[error("unsupported device: {0}")] pub struct UnsupportedDeviceError(String); -macro_rules! impl_tvm_context { +macro_rules! impl_tvm_device { ( $( $dev_type:ident : [ $( $dev_name:ident ),+ ] ),+ ) => { - /// Creates a Context from a string (e.g., "cpu", "gpu", "ext_dev") - impl FromStr for Context { + /// Creates a Device from a string (e.g., "cpu", "gpu", "ext_dev") + impl FromStr for Device { type Err = UnsupportedDeviceError; fn from_str(type_str: &str) -> Result { Ok(Self { @@ -193,7 +193,7 @@ macro_rules! impl_tvm_context { } } - impl Context { + impl Device { $( $( pub fn $dev_name(device_id: usize) -> Self { @@ -208,7 +208,7 @@ macro_rules! 
impl_tvm_context {
     };
 }

-impl_tvm_context!(
+impl_tvm_device!(
     DLDeviceType_kDLCPU: [cpu, llvm, stackvm],
     DLDeviceType_kDLGPU: [gpu, cuda, nvptx],
     DLDeviceType_kDLOpenCL: [cl],
@@ -218,59 +218,59 @@ impl_tvm_context!(
-impl<'a> From<&'a str> for Context {
+impl<'a> From<&'a str> for Device {
     fn from(target: &str) -> Self {
-        Context::new(DeviceType::from(target), 0)
+        Device::new(DeviceType::from(target), 0)
     }
 }

-impl From<ffi::DLContext> for Context {
-    fn from(ctx: ffi::DLContext) -> Self {
-        Context {
-            device_type: DeviceType::from(ctx.device_type),
-            device_id: ctx.device_id as usize,
+impl From<ffi::DLDevice> for Device {
+    fn from(dev: ffi::DLDevice) -> Self {
+        Device {
+            device_type: DeviceType::from(dev.device_type),
+            device_id: dev.device_id as usize,
         }
     }
 }

-impl From<Context> for ffi::DLContext {
-    fn from(ctx: Context) -> Self {
-        ffi::DLContext {
-            device_type: ctx.device_type.into(),
-            device_id: ctx.device_id as i32,
+impl From<Device> for ffi::DLDevice {
+    fn from(dev: Device) -> Self {
+        ffi::DLDevice {
+            device_type: dev.device_type.into(),
+            device_id: dev.device_id as i32,
         }
     }
 }

-impl Display for Context {
+impl Display for Device {
     fn fmt(&self, f: &mut Formatter) -> fmt::Result {
         write!(f, "{}({})", self.device_type, self.device_id)
     }
 }

-impl<'a> From<&'a Context> for ArgValue<'a> {
-    fn from(ctx: &'a Context) -> Self {
-        DLContext::from(ctx).into()
+impl<'a> From<&'a Device> for ArgValue<'a> {
+    fn from(dev: &'a Device) -> Self {
+        DLDevice::from(dev).into()
     }
 }

-impl<'a> From<Context> for ArgValue<'a> {
-    fn from(ctx: Context) -> Self {
-        DLContext::from(ctx).into()
+impl<'a> From<Device> for ArgValue<'a> {
+    fn from(dev: Device) -> Self {
+        DLDevice::from(dev).into()
     }
 }

-impl From<Context> for RetValue {
-    fn from(ret_value: Context) -> RetValue {
-        RetValue::Context(ret_value.into())
+impl From<Device> for RetValue {
+    fn from(ret_value: Device) -> RetValue {
+        RetValue::Device(ret_value.into())
     }
 }

-impl TryFrom<RetValue> for Context {
+impl TryFrom<RetValue> for Device {
     type Error = anyhow::Error;
-    fn try_from(ret_value: RetValue) -> anyhow::Result<Context> {
+    fn try_from(ret_value: RetValue) -> anyhow::Result<Device> {
         match ret_value {
-            RetValue::Context(dt) => Ok(dt.into()),
+            RetValue::Device(dt) => Ok(dt.into()),
             // TODO(@jroesch): improve
             _ => Err(anyhow::anyhow!("unable to convert datatype from ...")),
         }
@@ -282,15 +282,15 @@ mod tests {
     use super::*;

     #[test]
-    fn context() {
-        let ctx = Context::cpu(0);
-        println!("ctx: {}", ctx);
-        let default_ctx = Context::new(DeviceType::CPU, 0);
-        assert_eq!(ctx.clone(), default_ctx);
-        assert_ne!(ctx, Context::gpu(0));
+    fn device() {
+        let dev = Device::cpu(0);
+        println!("device: {}", dev);
+        let default_dev = Device::new(DeviceType::CPU, 0);
+        assert_eq!(dev.clone(), default_dev);
+        assert_ne!(dev, Device::gpu(0));

-        let str_ctx = Context::new(DeviceType::GPU, 0);
-        assert_eq!(str_ctx.clone(), str_ctx);
-        assert_ne!(str_ctx, Context::new(DeviceType::CPU, 0));
+        let str_dev = Device::new(DeviceType::GPU, 0);
+        assert_eq!(str_dev.clone(), str_dev);
+        assert_ne!(str_dev, Device::new(DeviceType::CPU, 0));
     }
 }
diff --git a/rust/tvm-sys/src/lib.rs b/rust/tvm-sys/src/lib.rs
index 231569ba682e7..8ed6f37f5f48f 100644
--- a/rust/tvm-sys/src/lib.rs
+++ b/rust/tvm-sys/src/lib.rs
@@ -45,16 +45,16 @@ pub mod ffi {

 pub mod array;
 pub mod byte_array;
-pub mod context;
 pub mod datatype;
+pub mod device;
 pub mod errors;
 #[macro_use]
 pub mod packed_func;
 pub mod value;

 pub use byte_array::ByteArray;
-pub use context::{Context, DeviceType};
 pub use datatype::DataType;
+pub use device::{Device,
DeviceType}; pub use errors::*; pub use packed_func::{ArgValue, RetValue}; diff --git a/rust/tvm-sys/src/packed_func.rs b/rust/tvm-sys/src/packed_func.rs index 7b8d5296d6414..6f43b786780a1 100644 --- a/rust/tvm-sys/src/packed_func.rs +++ b/rust/tvm-sys/src/packed_func.rs @@ -76,7 +76,7 @@ macro_rules! TVMPODValue { Null, DataType(DLDataType), String(*mut c_char), - Context(TVMContext), + Device(DLDevice), Handle(*mut c_void), ArrayHandle(TVMArrayHandle), ObjectHandle(*mut c_void), @@ -97,7 +97,7 @@ macro_rules! TVMPODValue { DLDataTypeCode_kDLFloat => Float($value.v_float64), TVMArgTypeCode_kTVMNullptr => Null, TVMArgTypeCode_kTVMDataType => DataType($value.v_type), - TVMArgTypeCode_kTVMContext => Context($value.v_ctx), + TVMArgTypeCode_kDLDevice => Device($value.v_device), TVMArgTypeCode_kTVMOpaqueHandle => Handle($value.v_handle), TVMArgTypeCode_kTVMDLTensorHandle => ArrayHandle($value.v_handle as TVMArrayHandle), TVMArgTypeCode_kTVMObjectHandle => ObjectHandle($value.v_handle), @@ -119,7 +119,7 @@ macro_rules! TVMPODValue { Float(val) => (TVMValue { v_float64: *val }, DLDataTypeCode_kDLFloat), Null => (TVMValue{ v_int64: 0 },TVMArgTypeCode_kTVMNullptr), DataType(val) => (TVMValue { v_type: *val }, TVMArgTypeCode_kTVMDataType), - Context(val) => (TVMValue { v_ctx: val.clone() }, TVMArgTypeCode_kTVMContext), + Device(val) => (TVMValue { v_device: val.clone() }, TVMArgTypeCode_kDLDevice), String(val) => { ( TVMValue { v_handle: *val as *mut c_void }, @@ -264,7 +264,7 @@ impl_pod_value!(Int, i64, [i8, i16, i32, i64, isize]); impl_pod_value!(UInt, i64, [u8, u16, u32, u64, usize]); impl_pod_value!(Float, f64, [f32, f64]); impl_pod_value!(DataType, DLDataType, [DLDataType]); -impl_pod_value!(Context, TVMContext, [TVMContext]); +impl_pod_value!(Device, DLDevice, [DLDevice]); impl<'a> From<&'a str> for ArgValue<'a> { fn from(s: &'a str) -> Self { diff --git a/rust/tvm-sys/src/value.rs b/rust/tvm-sys/src/value.rs index a9ad5f523fdef..f939d5177806d 100644 --- a/rust/tvm-sys/src/value.rs +++ b/rust/tvm-sys/src/value.rs @@ -47,16 +47,16 @@ macro_rules! impl_pod_tvm_value { impl_pod_tvm_value!(v_int64, i64, i8, u8, i16, u16, i32, u32, i64, u64, isize, usize); impl_pod_tvm_value!(v_float64, f64, f32, f64); impl_pod_tvm_value!(v_type, DLDataType); -impl_pod_tvm_value!(v_ctx, TVMContext); +impl_pod_tvm_value!(v_device, DLDevice); #[derive(Debug, Error)] #[error("unsupported device: {0}")] pub struct UnsupportedDeviceError(String); -macro_rules! impl_tvm_context { +macro_rules! impl_tvm_device { ( $( $dev_type:ident : [ $( $dev_name:ident ),+ ] ),+ ) => { - /// Creates a TVMContext from a string (e.g., "cpu", "gpu", "ext_dev") - impl FromStr for TVMContext { + /// Creates a DLDevice from a string (e.g., "cpu", "gpu", "ext_dev") + impl FromStr for DLDevice { type Err = UnsupportedDeviceError; fn from_str(type_str: &str) -> Result { Ok(Self { @@ -69,7 +69,7 @@ macro_rules! impl_tvm_context { } } - impl TVMContext { + impl DLDevice { $( $( pub fn $dev_name(device_id: usize) -> Self { @@ -84,7 +84,7 @@ macro_rules! impl_tvm_context { }; } -impl_tvm_context!( +impl_tvm_device!( DLDeviceType_kDLCPU: [cpu, llvm, stackvm], DLDeviceType_kDLGPU: [gpu, cuda, nvptx], DLDeviceType_kDLOpenCL: [cl], diff --git a/rust/tvm/examples/resnet/src/build_resnet.py b/rust/tvm/examples/resnet/src/build_resnet.py index fdacb5bb1fca6..6c99dee22bf54 100644 --- a/rust/tvm/examples/resnet/src/build_resnet.py +++ b/rust/tvm/examples/resnet/src/build_resnet.py @@ -49,7 +49,7 @@ default=3, help="level of optimization. 
0 is unoptimized and 3 is the highest level",
)
-aa("--target", type=str, default="llvm", help="target context for compilation")
+aa("--target", type=str, default="llvm", help="target for compilation")
aa("--image-shape", type=str, default="3,224,224", help="input image dimensions")
aa("--image-name", type=str, default="cat.png", help="name of input image to download")
args = parser.parse_args()
@@ -140,8 +140,8 @@ def test_build(build_dir):
     lib = tvm.runtime.load_module(osp.join(build_dir, "deploy_lib.so"))
     params = bytearray(open(osp.join(build_dir, "deploy_param.params"), "rb").read())
     input_data = get_cat_image()
-    ctx = tvm.cpu()
-    module = graph_runtime.create(graph, lib, ctx)
+    dev = tvm.cpu()
+    module = graph_runtime.create(graph, lib, dev)
     module.load_params(params)
     module.run(data=input_data)
     out = module.get_output(0).asnumpy()
diff --git a/rust/tvm/examples/resnet/src/main.rs b/rust/tvm/examples/resnet/src/main.rs
index f24c358ab52a8..7f5fcd458c26c 100644
--- a/rust/tvm/examples/resnet/src/main.rs
+++ b/rust/tvm/examples/resnet/src/main.rs
@@ -31,7 +31,7 @@ use tvm::runtime::graph_rt::GraphRt;
 use tvm::*;

 fn main() -> anyhow::Result<()> {
-    let ctx = Context::cpu(0);
+    let dev = Device::cpu(0);
     println!("{}", concat!(env!("CARGO_MANIFEST_DIR"), "/cat.png"));

     let img = image::open(concat!(env!("CARGO_MANIFEST_DIR"), "/cat.png"))
@@ -61,7 +61,7 @@ fn main() -> anyhow::Result<()> {
     // make arr shape as [1, 3, 224, 224] acceptable to resnet
     let arr = arr.insert_axis(Axis(0));
     // create input tensor from rust's ndarray
-    let input = NDArray::from_rust_ndarray(&arr, Context::cpu(0), DataType::float(32, 1))?;
+    let input = NDArray::from_rust_ndarray(&arr, Device::cpu(0), DataType::float(32, 1))?;
     println!(
         "input shape is {:?}, len: {}, size: {}",
         input.shape(),
@@ -78,7 +78,7 @@ fn main() -> anyhow::Result<()> {
         "/deploy_lib.so"
     )))?;

-    let mut graph_rt = GraphRt::create_from_parts(&graph, lib, ctx)?;
+    let mut graph_rt = GraphRt::create_from_parts(&graph, lib, dev)?;

     // parse parameters and convert to TVMByteArray
     let params: Vec<u8> = fs::read(concat!(env!("CARGO_MANIFEST_DIR"), "/deploy_param.params"))?;
@@ -91,7 +91,7 @@ fn main() -> anyhow::Result<()> {

     // prepare to get the output
     let output_shape = &[1, 1000];
-    let output = NDArray::empty(output_shape, Context::cpu(0), DataType::float(32, 1));
+    let output = NDArray::empty(output_shape, Device::cpu(0), DataType::float(32, 1));
     graph_rt.get_output_into(0, output.clone())?;

     // flatten the output as Vec<f32>
diff --git a/rust/tvm/src/ir/diagnostics/mod.rs b/rust/tvm/src/ir/diagnostics/mod.rs
index 182ffd4d9081b..91e2211312167 100644
--- a/rust/tvm/src/ir/diagnostics/mod.rs
+++ b/rust/tvm/src/ir/diagnostics/mod.rs
@@ -51,7 +51,7 @@ external!
{ fn diagnostic_context_render(ctx: DiagnosticContext) -> (); #[name("diagnostics.DiagnosticRendererRender")] - fn diagnositc_renderer_render(renderer: DiagnosticRenderer,ctx: DiagnosticContext) -> (); + fn diagnositc_renderer_render(renderer: DiagnosticRenderer, ctx: DiagnosticContext) -> (); #[name("diagnostics.ClearRenderer")] fn clear_renderer() -> (); diff --git a/rust/tvm/src/lib.rs b/rust/tvm/src/lib.rs index caae07775d21e..047b2cf220e08 100644 --- a/rust/tvm/src/lib.rs +++ b/rust/tvm/src/lib.rs @@ -32,9 +32,9 @@ pub use crate::{errors::*, function::Function, module::Module, ndarray::NDArray}; -pub use tvm_rt::{Context, DataType, DeviceType}; +pub use tvm_rt::{DataType, Device, DeviceType}; -pub use tvm_rt::context; +pub use tvm_rt::device; pub use tvm_rt::errors; pub use tvm_rt::function; pub use tvm_rt::module; diff --git a/rust/tvm/src/runtime/graph_rt.rs b/rust/tvm/src/runtime/graph_rt.rs index fcc41aca560fc..cba2875c99522 100644 --- a/rust/tvm/src/runtime/graph_rt.rs +++ b/rust/tvm/src/runtime/graph_rt.rs @@ -20,7 +20,7 @@ use std::convert::TryInto; use crate::runtime::Function; -use crate::{runtime::function::Result, runtime::ByteArray, Context, Module, NDArray}; +use crate::{runtime::function::Result, runtime::ByteArray, Device, Module, NDArray}; /// An instance of the C++ graph runtime. /// @@ -35,25 +35,25 @@ pub struct GraphRt { impl GraphRt { /// Create a graph runtime directly from a runtime module. - pub fn from_module(module: Module, ctx: Context) -> Result { - let default: Box Result> = + pub fn from_module(module: Module, dev: Device) -> Result { + let default: Box Result> = module.get_function("default", false)?.into(); Ok(Self { - module: default(ctx)?, + module: default(dev)?, }) } - /// Create a graph runtime from the deprecated graph, lib, ctx triple. - pub fn create_from_parts(graph: &str, lib: Module, ctx: Context) -> Result { + /// Create a graph runtime from the deprecated graph, lib, dev triple. 
+    pub fn create_from_parts(graph: &str, lib: Module, dev: Device) -> Result {
         let runtime_create_fn = Function::get("tvm.graph_runtime.create").unwrap();
         let runtime_create_fn_ret = runtime_create_fn.invoke(vec![
             graph.into(),
             lib.into(),
-            (&ctx.device_type).into(),
+            (&dev.device_type).into(),
             // NOTE you must pass the device id in as i32 because that's what TVM expects
-            (ctx.device_id as i32).into(),
+            (dev.device_id as i32).into(),
         ]);
         let graph_runtime_module: Module = runtime_create_fn_ret?.try_into()?;
         Ok(Self {
diff --git a/rust/tvm/tests/basics/src/main.rs b/rust/tvm/tests/basics/src/main.rs
index 450ab48dc1b23..d1cfbcf78dc78 100644
--- a/rust/tvm/tests/basics/src/main.rs
+++ b/rust/tvm/tests/basics/src/main.rs
@@ -25,18 +25,18 @@ fn main() {
     let shape = &mut [2];
     let mut data = vec![3f32, 4.0];
-    let (ctx, ctx_name) = if cfg!(feature = "cpu") {
-        (Context::cpu(0), "cpu")
+    let (dev, dev_name) = if cfg!(feature = "cpu") {
+        (Device::cpu(0), "cpu")
     } else {
-        (Context::gpu(0), "gpu")
+        (Device::gpu(0), "gpu")
     };
     let dtype = DataType::from_str("float32").unwrap();
-    let mut arr = NDArray::empty(shape, ctx, dtype);
+    let mut arr = NDArray::empty(shape, dev, dtype);
     arr.copy_from_buffer(data.as_mut_slice());
-    let ret = NDArray::empty(shape, ctx, dtype);
+    let ret = NDArray::empty(shape, dev, dtype);
     let mut fadd = Module::load(&concat!(env!("OUT_DIR"), "/test_add.so")).unwrap();
-    if !fadd.enabled(ctx_name) {
+    if !fadd.enabled(dev_name) {
         return;
     }
diff --git a/rust/tvm/tests/callback/src/bin/array.rs b/rust/tvm/tests/callback/src/bin/array.rs
index 2f1848ec6471a..81ee426d39671 100644
@@ -47,7 +47,7 @@ fn main() {
     let shape = &[2];
     let data = vec![3.0, 4.0];
-    let mut arr = NDArray::empty(shape, Context::cpu(0), DataType::float(32, 1));
+    let mut arr = NDArray::empty(shape, Device::cpu(0), DataType::float(32, 1));
     arr.copy_from_buffer(data.as_slice());
     register_untyped(sum, "sum", true).unwrap();
diff --git a/src/auto_scheduler/search_task.cc b/src/auto_scheduler/search_task.cc
index f25e581dbf24a..58bdb6ca8359f 100755
--- a/src/auto_scheduler/search_task.cc
+++ b/src/auto_scheduler/search_task.cc
@@ -57,24 +57,24 @@ HardwareParams HardwareParamsNode::GetDefaultHardwareParams(const Target& target
   if (device_type == kDLCPU) {
     return HardwareParams(tvm::runtime::threading::MaxConcurrency(), 64, 64, 0, 0, 0, 0, 0);
   } else if (device_type == kDLGPU || device_type == kDLROCM) {
-    auto ctx = TVMContext{static_cast(device_type), 0};
+    auto dev = Device{static_cast(device_type), 0};
     auto device_name = device_type == kDLGPU ? "device_api.gpu" : "device_api.rocm";
     auto func = tvm::runtime::Registry::Get(device_name);
     ICHECK(func != nullptr) << "Cannot find GPU device_api in registry";
     auto device_api = static_cast(((*func)()).operator void*());
     tvm::runtime::TVMRetValue ret;
-    device_api->GetAttr(ctx, tvm::runtime::DeviceAttrKind::kMaxSharedMemoryPerBlock, &ret);
+    device_api->GetAttr(dev, tvm::runtime::DeviceAttrKind::kMaxSharedMemoryPerBlock, &ret);
     int max_shared_memory_per_block = ret;
     // There is no explicit local memory limitation in CUDA runtime,
     // so we can use INT32_MAX to disable the check on local_memory.
int max_local_memory_per_block = INT32_MAX; - device_api->GetAttr(ctx, tvm::runtime::DeviceAttrKind::kMaxThreadsPerBlock, &ret); + device_api->GetAttr(dev, tvm::runtime::DeviceAttrKind::kMaxThreadsPerBlock, &ret); int max_threads_per_block = ret; - device_api->GetAttr(ctx, tvm::runtime::DeviceAttrKind::kWarpSize, &ret); + device_api->GetAttr(dev, tvm::runtime::DeviceAttrKind::kWarpSize, &ret); int warp_size = ret; int max_vthread_extent = warp_size / 4; @@ -107,22 +107,22 @@ HardwareParams HardwareParamsNode::GetDefaultHardwareParams(const Target& target LOG(FATAL) << "No default hardware parameters for opencl target device: " << target_device; } } else if (device_type == kDLVulkan) { - auto ctx = TVMContext{static_cast(device_type), 0}; + auto dev = Device{static_cast(device_type), 0}; auto device_name = "device_api.vulkan"; auto func = tvm::runtime::Registry::Get(device_name); ICHECK(func != nullptr) << "Cannot find Vulkan device_api in registry"; auto device_api = static_cast(((*func)()).operator void*()); tvm::runtime::TVMRetValue ret; - device_api->GetAttr(ctx, tvm::runtime::DeviceAttrKind::kMaxSharedMemoryPerBlock, &ret); + device_api->GetAttr(dev, tvm::runtime::DeviceAttrKind::kMaxSharedMemoryPerBlock, &ret); int max_shared_memory_per_block = ret; int max_local_memory_per_block = INT32_MAX; - device_api->GetAttr(ctx, tvm::runtime::DeviceAttrKind::kMaxThreadsPerBlock, &ret); + device_api->GetAttr(dev, tvm::runtime::DeviceAttrKind::kMaxThreadsPerBlock, &ret); int max_threads_per_block = ret; - device_api->GetAttr(ctx, tvm::runtime::DeviceAttrKind::kWarpSize, &ret); + device_api->GetAttr(dev, tvm::runtime::DeviceAttrKind::kWarpSize, &ret); int warp_size = ret; int max_vthread_extent = std::max(1, warp_size / 4); diff --git a/src/contrib/tf_op/tvm_dso_op_kernels.cc b/src/contrib/tf_op/tvm_dso_op_kernels.cc index 5c119b64b93de..c816119d0fad0 100644 --- a/src/contrib/tf_op/tvm_dso_op_kernels.cc +++ b/src/contrib/tf_op/tvm_dso_op_kernels.cc @@ -154,7 +154,7 @@ void EnsureAlignment(OpKernelContext* ctx, const tensorflow::Tensor& tensor, Ten } // Create DLPack tensor from TensorFlow tensor -tensorflow::Status MakeDLTensor(const TensorAsBuf& src, const DLContext& ctx, int64_t* tf_shape, +tensorflow::Status MakeDLTensor(const TensorAsBuf& src, const DLDevice& dev, int64_t* tf_shape, DLTensor* out) { DLDataType dlpack_type; const tensorflow::Tensor& tensor = *src.tensor; @@ -163,7 +163,7 @@ tensorflow::Status MakeDLTensor(const TensorAsBuf& src, const DLContext& ctx, in if (!status.ok()) { return status; } - out->ctx = ctx; + out->device = dev; out->ndim = tensor.shape().dims(); out->shape = tf_shape; out->strides = nullptr; @@ -256,7 +256,7 @@ class TVMDSOOp : public OpKernel { int device_id = TVMDSOOpTrait::device_id(context); int device_type = TVMDSOOpTrait::device_type; - DLContext dl_ctx = {DLDeviceType(device_type), device_id}; + DLDevice dl_dev = {DLDeviceType(device_type), device_id}; // Get output shape tensorflow::TensorShape output_shape; @@ -287,7 +287,7 @@ class TVMDSOOp : public OpKernel { EnsureAlignment(context, input_tensor, &input); input.CopyFromOrigin(); - status = MakeDLTensor(input, dl_ctx, shape_ptr, &args[i]); + status = MakeDLTensor(input, dl_dev, shape_ptr, &args[i]); OP_REQUIRES_OK(context, status); } @@ -302,7 +302,7 @@ class TVMDSOOp : public OpKernel { output.device_type = device_type; EnsureAlignment(context, *output_tensor, &output); - status = MakeDLTensor(output, dl_ctx, output_shape_ptr, &args[num_inputs]); + status = MakeDLTensor(output, dl_dev, 
output_shape_ptr, &args[num_inputs]); OP_REQUIRES_OK(context, status); // Prepare PackedFunc arguments diff --git a/src/node/structural_hash.cc b/src/node/structural_hash.cc index efedd1b99d6d0..23911efe39c5f 100644 --- a/src/node/structural_hash.cc +++ b/src/node/structural_hash.cc @@ -328,7 +328,7 @@ struct NDArrayContainerTrait { static constexpr const std::nullptr_t VisitAttrs = nullptr; static void SHashReduce(const runtime::NDArray::Container* key, SHashReducer hash_reduce) { - ICHECK_EQ(key->dl_tensor.ctx.device_type, kDLCPU) << "can only compare CPU tensor"; + ICHECK_EQ(key->dl_tensor.device.device_type, kDLCPU) << "can only compare CPU tensor"; ICHECK(runtime::IsContiguous(key->dl_tensor)) << "Can only hash contiguous tensor"; hash_reduce(runtime::DataType(key->dl_tensor.dtype)); hash_reduce(key->dl_tensor.ndim); @@ -345,8 +345,8 @@ struct NDArrayContainerTrait { auto ldt = lhs->dl_tensor.dtype; auto rdt = rhs->dl_tensor.dtype; - ICHECK_EQ(lhs->dl_tensor.ctx.device_type, kDLCPU) << "can only compare CPU tensor"; - ICHECK_EQ(rhs->dl_tensor.ctx.device_type, kDLCPU) << "can only compare CPU tensor"; + ICHECK_EQ(lhs->dl_tensor.device.device_type, kDLCPU) << "can only compare CPU tensor"; + ICHECK_EQ(rhs->dl_tensor.device.device_type, kDLCPU) << "can only compare CPU tensor"; ICHECK(runtime::IsContiguous(lhs->dl_tensor)) << "Can only compare contiguous tensor"; ICHECK(runtime::IsContiguous(rhs->dl_tensor)) << "Can only compare contiguous tensor"; diff --git a/src/parser/parser.cc b/src/parser/parser.cc index c7d8e025848a3..b72a632635d93 100644 --- a/src/parser/parser.cc +++ b/src/parser/parser.cc @@ -523,18 +523,18 @@ class Parser { /*! \brief Convert a numeric token to an NDArray for embedding into the Relay program. */ NDArray NumberToNDArray(const Token& token) { if (token->token_type == TokenType::kInteger) { - DLContext ctx = {DLDeviceType::kDLCPU, 0}; + DLDevice dev = {DLDeviceType::kDLCPU, 0}; auto dtype = String2DLDataType("int32"); - auto data = NDArray::Empty({}, dtype, ctx); + auto data = NDArray::Empty({}, dtype, dev); auto array = reinterpret_cast(data->data); // revisit this, literal node issue. int64_t value = Downcast(token->data); array[0] = (int32_t)value; return data; } else if (token->token_type == TokenType::kFloat) { - DLContext ctx = {DLDeviceType::kDLCPU, 0}; + DLDevice dev = {DLDeviceType::kDLCPU, 0}; auto float_imm = Downcast(token->data); - auto data = NDArray::Empty({}, float_imm->dtype, ctx); + auto data = NDArray::Empty({}, float_imm->dtype, dev); auto array = reinterpret_cast(data->data); // revisit this, literal node issue. // TODO(@jroesch): bounds checking @@ -549,9 +549,9 @@ class Parser { /*! \brief Convert a boolean value to an NDArray for embedding into the Relay program. 
*/ NDArray BooleanToNDarray(bool value) { - DLContext ctx = {DLDeviceType::kDLCPU, 0}; + DLDevice dev = {DLDeviceType::kDLCPU, 0}; auto dtype = String2DLDataType("bool"); - auto data = NDArray::Empty({}, dtype, ctx); + auto data = NDArray::Empty({}, dtype, dev); auto array = reinterpret_cast(data->data); array[0] = value; return data; diff --git a/src/printer/relay_text_printer.cc b/src/printer/relay_text_printer.cc index cbee04f960967..31f98ce4d2705 100644 --- a/src/printer/relay_text_printer.cc +++ b/src/printer/relay_text_printer.cc @@ -322,7 +322,7 @@ Doc RelayTextPrinter::VisitExpr_(const ConstantNode* op) { if (op->is_scalar()) { std::ostringstream os; DataType dtype = DataType(op->data->dtype); - ICHECK_EQ(op->data->ctx.device_type, kDLCPU); + ICHECK_EQ(op->data->device.device_type, kDLCPU); if (dtype == DataType::Int(32)) { return ScalarLiteral(dtype, static_cast(op->data->data)[0]); } else if (dtype == DataType::Int(64)) { diff --git a/src/relay/analysis/context_analysis.cc b/src/relay/analysis/context_analysis.cc index a648b7af8fd35..970ceda070dff 100644 --- a/src/relay/analysis/context_analysis.cc +++ b/src/relay/analysis/context_analysis.cc @@ -67,7 +67,7 @@ namespace relay { using PackedAnalysisResultMap = Map>; using AnalysisResultMap = - std::unordered_map; + std::unordered_map; namespace analysis { @@ -90,21 +90,22 @@ class DeviceDomain { public: // Construct an empty domain. DeviceDomain() { - ctx_.device_type = static_cast(-1); - ctx_.device_id = -1; + device_.device_type = static_cast(-1); + device_.device_id = -1; } // Construct a domain based on a given context. - explicit DeviceDomain(const TVMContext& ctx) : ctx_(ctx) {} + explicit DeviceDomain(const Device& dev) : device_(dev) {} // Check if the current domain is empty. bool IsEmptyDomain() const { - return static_cast(ctx_.device_type) == -1 && ctx_.device_id == -1; + return static_cast(device_.device_type) == -1 && device_.device_id == -1; } // Check if the current domain equals the other one. bool operator==(const DeviceDomain& other) const { - return ctx_.device_type == other.ctx_.device_type && ctx_.device_id == other.ctx_.device_id; + return device_.device_type == other.device_.device_type && + device_.device_id == other.device_.device_id; } bool operator!=(const DeviceDomain& other) const { return !(*this == other); } @@ -116,8 +117,8 @@ class DeviceDomain { if (domain->IsEmptyDomain()) { return (size_t)(domain.get()); } else { - size_t const h1(std::hash()(static_cast(domain->ctx_.device_type))); - size_t const h2(std::hash()(domain->ctx_.device_id)); + size_t const h1(std::hash()(static_cast(domain->device_.device_type))); + size_t const h2(std::hash()(domain->device_.device_id)); return h1 ^ (h2 << 1); } } @@ -136,7 +137,7 @@ class DeviceDomain { }; /* \brief The device to be assigned to the current domain. 
*/ - TVMContext ctx_; + Device device_; friend DeviceDomainPtr Join(const DeviceDomainPtr& lhs, const DeviceDomainPtr& rhs); friend class ContextAnalyzer; @@ -163,13 +164,13 @@ DeviceDomainPtr Join(const DeviceDomainPtr& lhs, const DeviceDomainPtr& rhs) { class ContextAnalyzer : public MixedModeVisitor { public: ContextAnalyzer(const IRModule& mod, const GlobalVar& current_func, - const TVMContext& default_context) + const Device& default_device) : MixedModeVisitor(9), // the number of repeated visits a node can perform mod_(mod), current_func_(current_func), - default_context_(default_context) { - cpu_ctx_.device_type = kDLCPU; - cpu_ctx_.device_id = 0; + default_device_(default_device) { + cpu_dev_.device_type = kDLCPU; + cpu_dev_.device_id = 0; } // Create an empty domain. @@ -177,8 +178,8 @@ class ContextAnalyzer : public MixedModeVisitor { DeviceDomainPtr Bottom() { return std::make_shared(DeviceDomain()); } // Create a domain with the given device context. - DeviceDomainPtr DeviceType(const TVMContext& ctx) { - return std::make_shared(DeviceDomain(ctx)); + DeviceDomainPtr DeviceType(const Device& dev) { + return std::make_shared(DeviceDomain(dev)); } // Find the root of a device. @@ -233,19 +234,19 @@ class ContextAnalyzer : public MixedModeVisitor { // attribute of other nodes can be propagated from it. void UnifyDeviceCopy(const std::vector& inps, const std::vector& outputs, DLDeviceType src_dev_type, DLDeviceType dst_dev_type) { - TVMContext src_ctx; - src_ctx.device_type = src_dev_type; - src_ctx.device_id = 0; - auto src_domain = DeviceType(src_ctx); + Device src_dev; + src_dev.device_type = src_dev_type; + src_dev.device_id = 0; + auto src_domain = DeviceType(src_dev); for (const auto& it : inps) { auto lhs = DeviceFor(it); Unify(lhs, src_domain); } - TVMContext dst_ctx; - dst_ctx.device_type = dst_dev_type; - dst_ctx.device_id = 0; - auto dst_domain = DeviceType(dst_ctx); + Device dst_dev; + dst_dev.device_type = dst_dev_type; + dst_dev.device_id = 0; + auto dst_domain = DeviceType(dst_dev); for (const auto& it : outputs) { auto lhs = DeviceFor(it); Unify(lhs, dst_domain); @@ -387,9 +388,9 @@ class ContextAnalyzer : public MixedModeVisitor { for (const auto& it : expr_to_device_) { auto device = Lookup(it.second); if (device->IsEmptyDomain()) { - ret[it.first] = default_context_; + ret[it.first] = default_device_; } else { - ret[it.first] = device->ctx_; + ret[it.first] = device->device_; } } @@ -478,14 +479,14 @@ class ContextAnalyzer : public MixedModeVisitor { // The arguments of alloc storage should be on CPU. for (int i = 0; i < 2; i++) { - Unify(DeviceFor(call->args[i]), DeviceType(cpu_ctx_)); + Unify(DeviceFor(call->args[i]), DeviceType(cpu_dev_)); MixedModeVisitor::VisitExpr(call->args[i]); } - TVMContext ctx; + Device dev; const auto* attrs = call->attrs.as(); - ctx.device_type = static_cast(attrs->device_type); - ctx.device_id = attrs->device_id; - Unify(DeviceFor(GetRef(call)), DeviceType(ctx)); + dev.device_type = static_cast(attrs->device_type); + dev.device_id = attrs->device_id; + Unify(DeviceFor(GetRef(call)), DeviceType(dev)); } void UnifyAllocTensorCall(const CallNode* call) { @@ -497,14 +498,14 @@ class ContextAnalyzer : public MixedModeVisitor { Unify(DeviceFor(storage), DeviceFor(GetRef(call))); // The shape for alloc_tensor should be on CPU. 
-    Unify(DeviceFor(shape), DeviceType(cpu_ctx_));
+    Unify(DeviceFor(shape), DeviceType(cpu_dev_));
     MixedModeVisitor::VisitExpr(shape);
   }
   void UnifyShapeFuncCall(const CallNode* call) {
     // [func, inputs, outputs]
     ICHECK_EQ(call->args.size(), 3U);
-    auto shape_func_domain = DeviceType(cpu_ctx_);
+    auto shape_func_domain = DeviceType(cpu_dev_);
     // No need to unify the op of a shape_func as shape_func doesn't
     // invoke the op itself. It should be handled by invoke_tvm_op.
@@ -539,7 +540,7 @@ class ContextAnalyzer : public MixedModeVisitor {
     // a tensor regardless of its device type.
     // Instead, the device type of the input is left for its other consumers to
     // unify or it will fall back to the default context.
-    Unify(DeviceFor(GetRef(call)), DeviceType(cpu_ctx_));
+    Unify(DeviceFor(GetRef(call)), DeviceType(cpu_dev_));
   }
   void UnifyReshapeTensorCall(const CallNode* call) {
@@ -550,7 +551,7 @@
     Unify(DeviceFor(GetRef(call)), DeviceFor(data));
     // The shape field of reshape_tensor is always on the CPU.
-    Unify(DeviceFor(shape), DeviceType(cpu_ctx_));
+    Unify(DeviceFor(shape), DeviceType(cpu_dev_));
     MixedModeVisitor::VisitExpr(data);
     MixedModeVisitor::VisitExpr(shape);
   }
@@ -668,13 +669,13 @@ class ContextAnalyzer : public MixedModeVisitor {
  private:
   /* \brief The cpu context. */
-  TVMContext cpu_ctx_;
+  Device cpu_dev_;
   /* \brief The module that helps context analysis. */
   const IRModule& mod_;
   /* \brief The current function that is being analyzed. */
   GlobalVar current_func_;
   /* \brief The default device that could be attached to an expression. */
-  const TVMContext& default_context_;
+  const Device& default_device_;
   /* \brief The IR node to device domain mapping. */
   std::unordered_map expr_to_device_;
@@ -690,21 +691,20 @@ class ContextAnalyzer : public MixedModeVisitor {
 }  // namespace analysis
-AnalysisResultMap ContextAnalysis(const IRModule& mod, const TVMContext& default_context) {
+AnalysisResultMap ContextAnalysis(const IRModule& mod, const Device& default_device) {
   // TODO(@zhiics) Apply the pass to all functions/entries
   auto entry = mod->GetGlobalVar("main");
-  auto ca = analysis::ContextAnalyzer(mod, entry, default_context);
+  auto ca = analysis::ContextAnalyzer(mod, entry, default_device);
   auto expr = mod->Lookup(entry);
   ca.VisitExpr(expr);
   return ca.Results();
 }
-// Unpack the device type and deivce id fields in TVMContext for PackedFunc calls
-// as TVMContext is not in the object system.
-PackedAnalysisResultMap ContextAnalysisPacked(const IRModule& mod,
-                                              const TVMContext& default_context) {
+// Unpack the device type and device id fields in Device for PackedFunc calls
+// as Device is not in the object system.
+PackedAnalysisResultMap ContextAnalysisPacked(const IRModule& mod, const Device& default_device) { PackedAnalysisResultMap ret; - auto res = ContextAnalysis(mod, default_context); + auto res = ContextAnalysis(mod, default_device); for (const auto& it : res) { Integer dev_ty = static_cast(it.second.device_type); Integer dev_id = it.second.device_id; diff --git a/src/relay/analysis/util.cc b/src/relay/analysis/util.cc index 90750575b9d4b..a4120d20288fc 100644 --- a/src/relay/analysis/util.cc +++ b/src/relay/analysis/util.cc @@ -370,7 +370,7 @@ std::unordered_map GetExprRefCount(const Expr& body) { template bool IsNDArrayAllGreaterEqual(const runtime::NDArray& tensor, T value) { - ICHECK_EQ(tensor->ctx.device_type, kDLCPU); + ICHECK_EQ(tensor->device.device_type, kDLCPU); ICHECK(tensor->strides == nullptr); ICHECK_EQ(tensor->byte_offset, 0); const T* data = static_cast(tensor->data); diff --git a/src/relay/backend/graph_plan_memory.cc b/src/relay/backend/graph_plan_memory.cc index 15173c2c79db8..26bc77aa6ec53 100644 --- a/src/relay/backend/graph_plan_memory.cc +++ b/src/relay/backend/graph_plan_memory.cc @@ -42,7 +42,7 @@ struct StorageToken { /*! \brief The corresponding tensor type node. */ const TensorTypeNode* ttype{nullptr}; /*! \brief virtual device index that corresponds to the device_type in - * DLContext. */ + * DLDevice. */ int device_type{0}; /*! \brief The storage id */ int64_t storage_id{-1}; diff --git a/src/relay/backend/interpreter.cc b/src/relay/backend/interpreter.cc index 993fb1a62787a..eeba010dc1644 100644 --- a/src/relay/backend/interpreter.cc +++ b/src/relay/backend/interpreter.cc @@ -212,8 +212,8 @@ InterpreterState::InterpreterState(Expr current_expr, InterpreterState::Stack st class Interpreter : public ExprFunctor, PatternFunctor { public: - Interpreter(IRModule mod, DLContext context, Target target) - : mod_(mod), context_(context), target_(target), debug_op_(Op::Get("debug")) { + Interpreter(IRModule mod, Device device, Target target) + : mod_(mod), device_(device), target_(target), debug_op_(Op::Get("debug")) { engine_ = CompileEngine::Global(); } @@ -243,7 +243,7 @@ class Interpreter : public ExprFunctor, return ObjectRef(); } - ObjectRef VisitExpr_(const ConstantNode* op) final { return op->data.CopyTo(context_); } + ObjectRef VisitExpr_(const ConstantNode* op) final { return op->data.CopyTo(device_); } ObjectRef VisitExpr_(const TupleNode* op) final { std::vector values; @@ -294,9 +294,9 @@ class Interpreter : public ExprFunctor, std::vector inputs(cfunc->inputs.size()); std::vector outputs(cfunc->outputs.size()); - DLContext cpu_ctx; - cpu_ctx.device_type = kDLCPU; - cpu_ctx.device_id = 0; + Device cpu_dev; + cpu_dev.device_type = kDLCPU; + cpu_dev.device_id = 0; auto fset_input = [&](size_t i, ObjectRef val, bool need_shape) { auto nd_array = Downcast(val); @@ -304,9 +304,9 @@ class Interpreter : public ExprFunctor, int64_t ndim = nd_array.Shape().size(); NDArray shape_arr; if (ndim == 0) { - shape_arr = NDArray::Empty({}, DataType::Int(64), cpu_ctx); + shape_arr = NDArray::Empty({}, DataType::Int(64), cpu_dev); } else { - shape_arr = NDArray::Empty({ndim}, DataType::Int(64), cpu_ctx); + shape_arr = NDArray::Empty({ndim}, DataType::Int(64), cpu_dev); int64_t* data = reinterpret_cast(shape_arr->data); for (auto j = 0; j < ndim; ++j) { data[j] = nd_array.Shape()[j]; @@ -315,7 +315,7 @@ class Interpreter : public ExprFunctor, inputs[i] = shape_arr; setter(i, shape_arr); } else { - auto arr = nd_array.CopyTo(cpu_ctx); + auto arr = nd_array.CopyTo(cpu_dev); 
        inputs[i] = arr;
        setter(i, arr);
      }
    };
@@ -354,7 +354,7 @@ class Interpreter : public ExprFunctor,
      const TensorTypeNode* rtype = val_type.as();
      ICHECK(rtype != nullptr);
      int64_t ndim = rtype->shape.size();
-      auto arr = NDArray::Empty({ndim}, DataType::Int(64), cpu_ctx);
+      auto arr = NDArray::Empty({ndim}, DataType::Int(64), cpu_dev);
      outputs[i] = arr;
      setter(arg_counter + i, arr);
    };
@@ -438,9 +438,9 @@ class Interpreter : public ExprFunctor,
    auto fset_input = [&](size_t i, ObjectRef val) {
      const auto nd_array = Downcast(val);
      setter(i, nd_array);
-      DLContext arg_ctx = nd_array->ctx;
-      ICHECK(arg_ctx.device_type == context_.device_type && arg_ctx.device_id == context_.device_id)
-          << "Interpreter expect context to be " << context_ << ", but get " << arg_ctx;
+      Device arg_dev = nd_array->device;
+      ICHECK(arg_dev.device_type == device_.device_type && arg_dev.device_id == device_.device_id)
+          << "Interpreter expects device to be " << device_ << ", but got " << arg_dev;
    };
    int arg_counter = 0;
@@ -470,7 +470,7 @@ class Interpreter : public ExprFunctor,
        shape.push_back(ivalue[0]);
      }
      DLDataType dtype = rtype->dtype;
-      NDArray nd_array = NDArray::Empty(shape, dtype, context_);
+      NDArray nd_array = NDArray::Empty(shape, dtype, device_);
      setter(num_inputs + i, nd_array);
      return nd_array;
    };
@@ -603,10 +603,10 @@ class Interpreter : public ExprFunctor,
    ObjectRef v = Eval(op->cond);
    if (v->IsInstance()) {
      auto nd_array = Downcast(v);
-      DLContext cpu_ctx;
-      cpu_ctx.device_type = kDLCPU;
-      cpu_ctx.device_id = 0;
-      NDArray cpu_array = nd_array.CopyTo(cpu_ctx);
+      Device cpu_dev;
+      cpu_dev.device_type = kDLCPU;
+      cpu_dev.device_id = 0;
+      NDArray cpu_array = nd_array.CopyTo(cpu_dev);
      ICHECK_EQ(DataType(cpu_array->dtype), DataType::Bool());
      // TODO(@jroesch, @MK): Refactor code into helper from DCE.
      if (reinterpret_cast(cpu_array->data)[0]) {
@@ -704,7 +704,7 @@ class Interpreter : public ExprFunctor,
  IRModule mod_;
  // For simplicity we only run the interpreter on a single context.
  // Context to run the interpreter on.
-  DLContext context_;
+  Device device_;
  // Target parameter being used by the interpreter.
  Target target_;
  // Object stack.
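
The interpreter hunks above are the whole rename in miniature: DLContext/TVMContext becomes Device (an alias of DLDevice), ctx-style names become dev-style names, and the NDArray calls keep their existing signatures. A minimal sketch of the resulting spelling, assuming the usual tvm/runtime headers and that tvm::Device aliases DLDevice; the function and variable names below are illustrative, not taken from the patch:

    #include <tvm/runtime/data_type.h>
    #include <tvm/runtime/ndarray.h>

    using tvm::Device;
    using tvm::runtime::DataType;
    using tvm::runtime::NDArray;

    void device_rename_sketch() {
      // A Device is built exactly the way a TVMContext used to be.
      Device cpu_dev{kDLCPU, 0};
      // Empty() and CopyTo() keep their old shapes; only the argument
      // type is the renamed Device.
      NDArray host = NDArray::Empty({2}, DataType::Int(64), cpu_dev);
      NDArray copy = host.CopyTo(cpu_dev);
    }
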
@@ -715,7 +715,7 @@ class Interpreter : public ExprFunctor, const Op& debug_op_; }; -TypedPackedFunc CreateInterpreter(IRModule mod, DLContext context, Target target) { +TypedPackedFunc CreateInterpreter(IRModule mod, Device device, Target target) { if (mod.defined()) { // eta expand to support constructors in argument position transform::Sequential seq({transform::EtaExpand( @@ -727,7 +727,7 @@ TypedPackedFunc CreateInterpreter(IRModule mod, DLContext conte mod = seq(mod); } - auto intrp = std::make_shared(mod, context, target); + auto intrp = std::make_shared(mod, device, target); auto packed = [intrp](Expr expr) { auto f = DetectFeature(expr); ICHECK(f.is_subset_of(FeatureSet::All() - fGraph)); diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc index 9d3ffc558aaeb..dafaed111c03f 100644 --- a/src/relay/backend/vm/compiler.cc +++ b/src/relay/backend/vm/compiler.cc @@ -1170,7 +1170,7 @@ void VMCompiler::Codegen() { } ExprDeviceMap VMCompiler::AnalyzeContext() const { - TVMContext default_device; + Device default_device; ExprDeviceMap expr_device_map; if (targets_.size() > 1) { int fallback_dev = GetFallbackDevice(); diff --git a/src/relay/backend/vm/compiler.h b/src/relay/backend/vm/compiler.h index 9c813a4f561c8..3a3796373a614 100644 --- a/src/relay/backend/vm/compiler.h +++ b/src/relay/backend/vm/compiler.h @@ -62,7 +62,7 @@ using GlobalMap = NodeMap; using ConstMap = NodeMap; using ConstTensorShapeMap = NodeMap>; using TargetsMap = Map; -using ExprDeviceMap = std::unordered_map; +using ExprDeviceMap = std::unordered_map; struct VMCompilerContext { // The module context for the compilation diff --git a/src/relay/op/memory/memory.cc b/src/relay/op/memory/memory.cc index 287564ba4f21a..c2997fb6cf958 100644 --- a/src/relay/op/memory/memory.cc +++ b/src/relay/op/memory/memory.cc @@ -48,11 +48,11 @@ TVM_REGISTER_NODE_TYPE(AllocTensorAttrs); // The passing value in attrs and args doesn't seem super great. // We should consider a better solution, i.e the type relation // being able to see the arguments as well? -Expr AllocStorage(Expr size, Expr alignment, TVMContext ctx, DataType dtype_hint) { +Expr AllocStorage(Expr size, Expr alignment, Device dev, DataType dtype_hint) { auto attrs = make_object(); attrs->dtype = dtype_hint; - attrs->device_id = ctx.device_id; - attrs->device_type = ctx.device_type; + attrs->device_id = dev.device_id; + attrs->device_type = dev.device_type; static const Op& op = Op::Get("memory.alloc_storage"); return Call(op, {size, alignment}, Attrs(attrs), {}); } diff --git a/src/relay/op/memory/memory.h b/src/relay/op/memory/memory.h index 6e184507bad54..bbbd11867549d 100644 --- a/src/relay/op/memory/memory.h +++ b/src/relay/op/memory/memory.h @@ -32,7 +32,7 @@ namespace tvm { namespace relay { -Expr AllocStorage(Expr size, Expr alignment, TVMContext ctx, DataType dtype_hint); +Expr AllocStorage(Expr size, Expr alignment, Device dev, DataType dtype_hint); Expr DeviceCopy(Expr data, int src_dev_type, int dst_dev_type); Expr AllocTensor(Expr storage, Expr offset, tvm::relay::Expr shape, DataType dtype, Array assert_shape); diff --git a/src/relay/transforms/fold_constant.cc b/src/relay/transforms/fold_constant.cc index 9416b0ec4580b..fe5f547449ad5 100644 --- a/src/relay/transforms/fold_constant.cc +++ b/src/relay/transforms/fold_constant.cc @@ -248,16 +248,16 @@ class ConstantFolder : public MixedModeMutator { expr = expr.as() == nullptr ? 
entry_func->body : entry_func; using tvm::transform::PassContext; - DLContext ctx; - ctx.device_type = kDLCPU; - ctx.device_id = 0; + Device dev; + dev.device_type = kDLCPU; + dev.device_id = 0; Target target = Target("llvm"); // use a fresh build context // in case we are already in a build context. // needed for both execution and creation(due to JIT) With fresh_build_ctx(PassContext::Create()); - FInterpreter executor = CreateInterpreter(mod, ctx, target); + FInterpreter executor = CreateInterpreter(mod, dev, target); return ObjectToExpr(executor(expr)); } @@ -276,17 +276,17 @@ class ConstantFolder : public MixedModeMutator { } // Get the constant shape - DLContext ctx; - ctx.device_type = kDLCPU; - ctx.device_id = 0; + Device dev; + dev.device_type = kDLCPU; + dev.device_id = 0; runtime::NDArray value; DLDataType cdtype = DataType::Int(32); if (ishape.size() == 0) { - value = runtime::NDArray::Empty({}, cdtype, ctx); + value = runtime::NDArray::Empty({}, cdtype, dev); } else { ICHECK_NE(ishape.size(), 0); std::vector cshape = {static_cast(ishape.size())}; - value = runtime::NDArray::Empty(cshape, cdtype, ctx); + value = runtime::NDArray::Empty(cshape, cdtype, dev); int32_t* dims = static_cast(value->data); using ::tvm::tir::IntImmNode; for (size_t i = 0; i < ishape.size(); ++i) { @@ -301,7 +301,7 @@ class ConstantFolder : public MixedModeMutator { Constant shape = Downcast(ObjectToExpr(value)); if (shape->data.Shape().size() == 0 && GetScalarFromConstant(shape) == 0) { - auto ndarray = runtime::NDArray::Empty({}, cdtype, ctx); + auto ndarray = runtime::NDArray::Empty({}, cdtype, dev); shape = Constant(ndarray); } @@ -323,12 +323,12 @@ class ConstantFolder : public MixedModeMutator { } // Get the constant size - DLContext ctx; - ctx.device_type = kDLCPU; - ctx.device_id = 0; + Device dev; + dev.device_type = kDLCPU; + dev.device_id = 0; runtime::NDArray value; DLDataType cdtype = DataType::Int(32); - value = runtime::NDArray::Empty({}, cdtype, ctx); + value = runtime::NDArray::Empty({}, cdtype, dev); int32_t* data = static_cast(value->data); if (ishape.size() == 0) { *data = 0; diff --git a/src/relay/transforms/memory_alloc.cc b/src/relay/transforms/memory_alloc.cc index f75b7ba1fc750..dd0cfc85a5108 100644 --- a/src/relay/transforms/memory_alloc.cc +++ b/src/relay/transforms/memory_alloc.cc @@ -52,7 +52,7 @@ namespace tvm { namespace relay { using AnalysisResultMap = - std::unordered_map; + std::unordered_map; inline Constant MakeConstant(const std::vector& value) { return MakeConstantTensor(DataType::Int(64), {static_cast(value.size())}, value); @@ -104,8 +104,8 @@ class DialectRewriter : public ExprMutator { DialectRewriter(const Target& target_host, const AnalysisResultMap& context_analysis_map) : target_host_(target_host), context_analysis_map_(context_analysis_map) {} - // Get the context of an expression. - TVMContext GetContext(const Expr& expr) const { + // Get the device of an expression. 
+ Device GetDevice(const Expr& expr) const { auto it = context_analysis_map_.find(expr); CHECK(it != context_analysis_map_.end()) << "Cannot find expr in the context analysis map:\n" << AsText(expr, false); @@ -189,8 +189,8 @@ class DialectRewriter : public ExprMutator { // Handle the static case Array outs; for (size_t i = 0; i < out_types.size(); ++i) { - TVMContext ctx = GetContext(GetRef(cn)); - auto out = MakeStaticAllocation(&scope, out_types[i], ctx, std::to_string(i)); + Device dev = GetDevice(GetRef(cn)); + auto out = MakeStaticAllocation(&scope, out_types[i], dev, std::to_string(i)); outs.push_back(out); } Tuple output(outs); @@ -206,8 +206,8 @@ class DialectRewriter : public ExprMutator { private: // Insert a device copy node. - Expr DeviceCopy(const Expr& inp, int src_ctx, int dst_ctx) { - return ExprMutator::Mutate(relay::DeviceCopy(inp, src_ctx, dst_ctx)); + Expr DeviceCopy(const Expr& inp, int src_dev, int dst_dev) { + return ExprMutator::Mutate(relay::DeviceCopy(inp, src_dev, dst_dev)); } // Check if a call invokes a primitive function. @@ -262,8 +262,7 @@ class DialectRewriter : public ExprMutator { } // Allocate a tensor with a statically known shape. - Var MakeStaticAllocation(LetList* scope, const TensorType& type, TVMContext ctx, - String name_hint) { + Var MakeStaticAllocation(LetList* scope, const TensorType& type, Device dev, String name_hint) { std::vector int_shape; for (auto it : type->shape) { const auto* imm = it.as(); @@ -275,7 +274,7 @@ class DialectRewriter : public ExprMutator { Expr alignment = ComputeAlignment(type->dtype); // Run type inference later to get the correct type. Var var("storage_" + name_hint, Type(nullptr)); - Expr value = AllocStorage(size, alignment, ctx, type->dtype); + Expr value = AllocStorage(size, alignment, dev, type->dtype); auto sto = scope->Push(var, value); // TODO(@jroesch): There is a bug with typing based on the constant shape. @@ -295,7 +294,7 @@ class DialectRewriter : public ExprMutator { Array is_inputs; int input_pos = 0; - TVMContext cpu_ctx = default_context_; + Device cpu_dev = default_device_; CHECK_EQ(new_args.size(), input_states.size()); for (size_t i = 0; i < new_args.size(); ++i) { Expr arg = new_args[i]; @@ -318,9 +317,9 @@ class DialectRewriter : public ExprMutator { is_inputs.push_back(0); } else if (state == 1) { auto new_arg = ExprMutator::Mutate(arg); - auto ctx = GetContext(arg); - if (ctx.device_type != cpu_ctx.device_type) { - new_arg = DeviceCopy(new_arg, ctx.device_type, cpu_ctx.device_type); + auto dev = GetDevice(arg); + if (dev.device_type != cpu_dev.device_type) { + new_arg = DeviceCopy(new_arg, dev.device_type, cpu_dev.device_type); } Var in_shape_var("in_shape_" + std::to_string(input_pos), Type(nullptr)); shape_func_ins.push_back(scope->Push(in_shape_var, new_arg)); @@ -338,7 +337,7 @@ class DialectRewriter : public ExprMutator { auto tt = TensorType(out->shape, out->dtype); // Put shape func on CPU. This also ensures that everything between // shape_of and shape_func are on CPU. 
- auto alloc = MakeStaticAllocation(scope, tt, cpu_ctx, std::to_string(i)); + auto alloc = MakeStaticAllocation(scope, tt, cpu_dev, std::to_string(i)); Var shape_func_out_var("shape_func_out_" + std::to_string(i), Type(nullptr)); alloc = scope->Push(shape_func_out_var, alloc); out_shapes.push_back(alloc); @@ -355,7 +354,7 @@ class DialectRewriter : public ExprMutator { const Type& ret_type) { auto out_shapes = EmitShapeFunc(scope, func, new_args); std::vector storages; - auto func_ctx = GetContext(func); + auto func_dev = GetDevice(func); CHECK_EQ(out_shapes.size(), out_types.size()); for (size_t i = 0; i < out_shapes.size(); ++i) { auto out_shape = out_shapes[i]; @@ -363,7 +362,7 @@ class DialectRewriter : public ExprMutator { auto size = ComputeStorageInRelay(out_shape, out_type); auto alignment = ComputeAlignment(out_type->dtype); Var sto_var("storage_" + std::to_string(i), Type(nullptr)); - auto val = AllocStorage(size, alignment, func_ctx, out_type->dtype); + auto val = AllocStorage(size, alignment, func_dev, out_type->dtype); storages.push_back(scope->Push(sto_var, val)); } @@ -409,7 +408,7 @@ class DialectRewriter : public ExprMutator { std::vector scopes_; runtime::DataType compute_dtype_ = runtime::DataType::Int(64); - TVMContext default_context_{kDLCPU, 0}; + Device default_device_{kDLCPU, 0}; }; namespace transform { @@ -423,21 +422,21 @@ Pass ManifestAlloc(Target target_host, Map targets) { mod->ImportFromStd("core.rly"); mod = relay::transform::InferType()(mod); - TVMContext fallback_ctx; + Device fallback_dev; if (targets.size() > 1) { auto pass_ctx = PassContext::Current(); - Optional opt_fallback_dev = + Optional opt_fallback_dev_type = pass_ctx->GetConfig("relay.fallback_device_type", Integer(static_cast(kDLCPU))); - auto fallback_dev = opt_fallback_dev.value(); - CHECK_GT(fallback_dev->value, 0U); - fallback_ctx.device_type = static_cast(fallback_dev->value); - fallback_ctx.device_id = 0; + auto fallback_dev_type = opt_fallback_dev_type.value(); + CHECK_GT(fallback_dev_type->value, 0U); + fallback_dev.device_type = static_cast(fallback_dev_type->value); + fallback_dev.device_id = 0; } else { const auto& it = targets.begin(); - fallback_ctx.device_type = static_cast((*it).first->value); - fallback_ctx.device_id = 0; + fallback_dev.device_type = static_cast((*it).first->value); + fallback_dev.device_id = 0; } - auto ca = ContextAnalysis(mod, fallback_ctx); + auto ca = ContextAnalysis(mod, fallback_dev); auto glob_funcs = mod->functions; for (const auto& it : glob_funcs) { diff --git a/src/relay/transforms/partial_eval.cc b/src/relay/transforms/partial_eval.cc index 3a87aa8ed4988..9572faf087144 100644 --- a/src/relay/transforms/partial_eval.cc +++ b/src/relay/transforms/partial_eval.cc @@ -526,11 +526,11 @@ bool StatefulOp(const Expr& e) { using FInterpreter = runtime::TypedPackedFunc; -DLContext CPUContext() { - DLContext ctx; - ctx.device_type = kDLCPU; - ctx.device_id = 0; - return ctx; +Device CPUDevice() { + Device dev; + dev.device_type = kDLCPU; + dev.device_id = 0; + return dev; } FInterpreter CPUInterpreter() { @@ -541,7 +541,7 @@ FInterpreter CPUInterpreter() { // in case we are already in a build context. 
With fresh_build_ctx(PassContext::Create()); - return CreateInterpreter(IRModule(nullptr), CPUContext(), target); + return CreateInterpreter(IRModule(nullptr), CPUDevice(), target); } using FuncId = int; @@ -613,7 +613,7 @@ class PartialEvaluator : public ExprFunctor } PStatic VisitExpr_(const ConstantNode* op, LetList* ll) final { - return HasStatic(MkSTensor(op->data.CopyTo(context_)), ll->Push(GetRef(op))); + return HasStatic(MkSTensor(op->data.CopyTo(device_)), ll->Push(GetRef(op))); } PStatic VisitExpr_(const TupleNode* op, LetList* ll) final { @@ -669,7 +669,7 @@ class PartialEvaluator : public ExprFunctor PStatic VisitExpr_(const IfNode* op, LetList* ll) final { PStatic c = VisitExpr(op->cond, ll); if (c->pstatic.defined()) { - NDArray cpu_array = Downcast(c->pstatic)->data.CopyTo(CPUContext()); + NDArray cpu_array = Downcast(c->pstatic)->data.CopyTo(CPUDevice()); ICHECK_EQ(DataType(cpu_array->dtype), DataType::Bool()); if (reinterpret_cast(cpu_array->data)[0]) { return VisitExpr(op->true_branch, ll); @@ -754,7 +754,7 @@ class PartialEvaluator : public ExprFunctor if (ps->pstatic.defined()) { if (auto* st = ps->pstatic.as()) { if (st->data.Shape().empty()) { - NDArray cpu_array = st->data.CopyTo(CPUContext()); + NDArray cpu_array = st->data.CopyTo(CPUDevice()); DataType dtype = DataType(cpu_array->dtype); if (dtype == DataType::Int(32)) { return std::max(0, *static_cast(cpu_array->data)); @@ -1136,7 +1136,7 @@ class PartialEvaluator : public ExprFunctor std::unordered_map func_map_; std::unordered_map fuel_map_; Store store_; - DLContext context_ = CPUContext(); + Device device_ = CPUDevice(); FInterpreter executor_ = CPUInterpreter(); }; diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc index 150d7f215da5a..b9e8c2549fd57 100644 --- a/src/runtime/c_runtime_api.cc +++ b/src/runtime/c_runtime_api.cc @@ -92,7 +92,7 @@ class DeviceAPIManager { public: static const int kMaxDeviceAPI = 32; // Get API - static DeviceAPI* Get(const TVMContext& ctx) { return Get(ctx.device_type); } + static DeviceAPI* Get(const Device& dev) { return Get(dev.device_type); } static DeviceAPI* Get(int dev_type, bool allow_missing = false) { return Global()->GetAPI(dev_type, allow_missing); } @@ -136,12 +136,12 @@ class DeviceAPIManager { } }; -DeviceAPI* DeviceAPI::Get(TVMContext ctx, bool allow_missing) { - return DeviceAPIManager::Get(static_cast(ctx.device_type), allow_missing); +DeviceAPI* DeviceAPI::Get(Device dev, bool allow_missing) { + return DeviceAPIManager::Get(static_cast(dev.device_type), allow_missing); } -void* DeviceAPI::AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) { - return AllocDataSpace(ctx, size, kTempAllocaAlignment, type_hint); +void* DeviceAPI::AllocWorkspace(Device dev, size_t size, DLDataType type_hint) { + return AllocDataSpace(dev, size, kTempAllocaAlignment, type_hint); } static size_t GetDataAlignment(const DLDataType dtype) { @@ -150,13 +150,13 @@ static size_t GetDataAlignment(const DLDataType dtype) { return align; } -void* DeviceAPI::AllocDataSpace(TVMContext ctx, int ndim, const int64_t* shape, DLDataType dtype, +void* DeviceAPI::AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype, Optional mem_scope) { if (!mem_scope.defined() || mem_scope.value() == "global") { // by default, we can always redirect to the flat memory allocations DLTensor temp; temp.data = nullptr; - temp.ctx = ctx; + temp.device = dev; temp.ndim = ndim; temp.dtype = dtype; temp.shape = const_cast(shape); @@ -164,7 +164,7 @@ void* 
DeviceAPI::AllocDataSpace(TVMContext ctx, int ndim, const int64_t* shape, temp.byte_offset = 0; size_t size = GetDataSize(temp); size_t alignment = GetDataAlignment(temp.dtype); - return AllocDataSpace(ctx, size, alignment, dtype); + return AllocDataSpace(dev, size, alignment, dtype); } LOG(FATAL) << "Device does not support allocate data space with " << "specified memory scope: " << mem_scope.value(); @@ -178,29 +178,28 @@ void DeviceAPI::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle str ICHECK(IsContiguous(*from) && IsContiguous(*to)) << "CopyDataFromTo only support contiguous array for now"; - CopyDataFromTo(from->data, from->byte_offset, to->data, to->byte_offset, nbytes, from->ctx, - to->ctx, from->dtype, stream); + CopyDataFromTo(from->data, from->byte_offset, to->data, to->byte_offset, nbytes, from->device, + to->device, from->dtype, stream); } void DeviceAPI::CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, - size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to, + size_t num_bytes, Device dev_from, Device dev_to, DLDataType type_hint, TVMStreamHandle stream) { LOG(FATAL) << "Device does not support CopyDataFromTo."; } -void DeviceAPI::FreeWorkspace(TVMContext ctx, void* ptr) { FreeDataSpace(ctx, ptr); } +void DeviceAPI::FreeWorkspace(Device dev, void* ptr) { FreeDataSpace(dev, ptr); } -TVMStreamHandle DeviceAPI::CreateStream(TVMContext ctx) { +TVMStreamHandle DeviceAPI::CreateStream(Device dev) { LOG(FATAL) << "Device does not support stream api."; return nullptr; } -void DeviceAPI::FreeStream(TVMContext ctx, TVMStreamHandle stream) { +void DeviceAPI::FreeStream(Device dev, TVMStreamHandle stream) { LOG(FATAL) << "Device does not support stream api."; } -void DeviceAPI::SyncStreamFromTo(TVMContext ctx, TVMStreamHandle event_src, - TVMStreamHandle event_dst) { +void DeviceAPI::SyncStreamFromTo(Device dev, TVMStreamHandle event_src, TVMStreamHandle event_dst) { LOG(FATAL) << "Device does not support stream api."; } @@ -430,23 +429,23 @@ int TVMBackendGetFuncFromEnv(void* mod_node, const char* func_name, TVMFunctionH void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t size, int dtype_code_hint, int dtype_bits_hint) { - TVMContext ctx; - ctx.device_type = static_cast(device_type); - ctx.device_id = device_id; + DLDevice dev; + dev.device_type = static_cast(device_type); + dev.device_id = device_id; DLDataType type_hint; type_hint.code = static_cast(dtype_code_hint); type_hint.bits = static_cast(dtype_bits_hint); type_hint.lanes = 1; - return DeviceAPIManager::Get(ctx)->AllocWorkspace(ctx, static_cast(size), type_hint); + return DeviceAPIManager::Get(dev)->AllocWorkspace(dev, static_cast(size), type_hint); } int TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr) { - TVMContext ctx; - ctx.device_type = static_cast(device_type); - ctx.device_id = device_id; - DeviceAPIManager::Get(ctx)->FreeWorkspace(ctx, ptr); + DLDevice dev; + dev.device_type = static_cast(device_type); + dev.device_id = device_id; + DeviceAPIManager::Get(dev)->FreeWorkspace(dev, ptr); return 0; } @@ -538,47 +537,47 @@ int TVMFuncCreateFromCFunc(TVMPackedCFunc func, void* resource_handle, TVMPacked int TVMStreamCreate(int device_type, int device_id, TVMStreamHandle* out) { API_BEGIN(); - TVMContext ctx; - ctx.device_type = static_cast(device_type); - ctx.device_id = device_id; - *out = DeviceAPIManager::Get(ctx)->CreateStream(ctx); + DLDevice dev; + dev.device_type = static_cast(device_type); + dev.device_id = device_id; + *out = 
DeviceAPIManager::Get(dev)->CreateStream(dev); API_END(); } int TVMStreamFree(int device_type, int device_id, TVMStreamHandle stream) { API_BEGIN(); - TVMContext ctx; - ctx.device_type = static_cast(device_type); - ctx.device_id = device_id; - DeviceAPIManager::Get(ctx)->FreeStream(ctx, stream); + DLDevice dev; + dev.device_type = static_cast(device_type); + dev.device_id = device_id; + DeviceAPIManager::Get(dev)->FreeStream(dev, stream); API_END(); } int TVMSetStream(int device_type, int device_id, TVMStreamHandle stream) { API_BEGIN(); - TVMContext ctx; - ctx.device_type = static_cast(device_type); - ctx.device_id = device_id; - DeviceAPIManager::Get(ctx)->SetStream(ctx, stream); + DLDevice dev; + dev.device_type = static_cast(device_type); + dev.device_id = device_id; + DeviceAPIManager::Get(dev)->SetStream(dev, stream); API_END(); } int TVMSynchronize(int device_type, int device_id, TVMStreamHandle stream) { API_BEGIN(); - TVMContext ctx; - ctx.device_type = static_cast(device_type); - ctx.device_id = device_id; - DeviceAPIManager::Get(ctx)->StreamSync(ctx, stream); + DLDevice dev; + dev.device_type = static_cast(device_type); + dev.device_id = device_id; + DeviceAPIManager::Get(dev)->StreamSync(dev, stream); API_END(); } int TVMStreamStreamSynchronize(int device_type, int device_id, TVMStreamHandle src, TVMStreamHandle dst) { API_BEGIN(); - TVMContext ctx; - ctx.device_type = static_cast(device_type); - ctx.device_id = device_id; - DeviceAPIManager::Get(ctx)->SyncStreamFromTo(ctx, src, dst); + DLDevice dev; + dev.device_type = static_cast(device_type); + dev.device_id = device_id; + DeviceAPIManager::Get(dev)->SyncStreamFromTo(dev, src, dst); API_END(); } @@ -590,64 +589,64 @@ int TVMCbArgToReturn(TVMValue* value, int* code) { API_END(); } -int TVMDeviceAllocDataSpace(DLContext ctx, size_t nbytes, size_t alignment, DLDataType type_hint, +int TVMDeviceAllocDataSpace(DLDevice dev, size_t nbytes, size_t alignment, DLDataType type_hint, void** out_data) { API_BEGIN(); - out_data[0] = DeviceAPIManager::Get(ctx)->AllocDataSpace(ctx, nbytes, alignment, type_hint); + out_data[0] = DeviceAPIManager::Get(dev)->AllocDataSpace(dev, nbytes, alignment, type_hint); API_END(); } -int TVMDeviceAllocDataSpaceWithScope(DLContext ctx, int ndim, const int64_t* shape, - DLDataType dtype, const char* mem_scope, void** out_data) { +int TVMDeviceAllocDataSpaceWithScope(DLDevice dev, int ndim, const int64_t* shape, DLDataType dtype, + const char* mem_scope, void** out_data) { API_BEGIN(); Optional scope; if (mem_scope != nullptr) { scope = String(std::string(mem_scope)); } - out_data[0] = DeviceAPIManager::Get(ctx)->AllocDataSpace(ctx, ndim, shape, dtype, scope); + out_data[0] = DeviceAPIManager::Get(dev)->AllocDataSpace(dev, ndim, shape, dtype, scope); API_END(); } -int TVMDeviceFreeDataSpace(DLContext ctx, void* ptr) { +int TVMDeviceFreeDataSpace(DLDevice dev, void* ptr) { API_BEGIN(); - DeviceAPIManager::Get(ctx)->FreeDataSpace(ctx, ptr); + DeviceAPIManager::Get(dev)->FreeDataSpace(dev, ptr); API_END(); } int TVMDeviceCopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) { API_BEGIN(); - TVMContext ctx_from = from->ctx; - TVMContext ctx_to = to->ctx; - TVMContext ctx = ctx_from.device_type != kDLCPU ? ctx_from : ctx_to; - DeviceAPIManager::Get(ctx)->CopyDataFromTo(from, to, stream); + DLDevice dev_from = from->device; + DLDevice dev_to = to->device; + DLDevice dev = dev_from.device_type != kDLCPU ? 
dev_from : dev_to; + DeviceAPIManager::Get(dev)->CopyDataFromTo(from, to, stream); API_END(); } // set device api TVM_REGISTER_GLOBAL(tvm::runtime::symbol::tvm_set_device) .set_body([](TVMArgs args, TVMRetValue* ret) { - TVMContext ctx; - ctx.device_type = static_cast(args[0].operator int()); - ctx.device_id = args[1]; - DeviceAPIManager::Get(ctx)->SetDevice(ctx); + DLDevice dev; + dev.device_type = static_cast(args[0].operator int()); + dev.device_id = args[1]; + DeviceAPIManager::Get(dev)->SetDevice(dev); }); // set device api TVM_REGISTER_GLOBAL("runtime.GetDeviceAttr").set_body([](TVMArgs args, TVMRetValue* ret) { - TVMContext ctx; - ctx.device_type = static_cast(args[0].operator int()); - ctx.device_id = args[1]; + DLDevice dev; + dev.device_type = static_cast(args[0].operator int()); + dev.device_id = args[1]; DeviceAttrKind kind = static_cast(args[2].operator int()); if (kind == kExist) { - DeviceAPI* api = DeviceAPIManager::Get(ctx.device_type, true); + DeviceAPI* api = DeviceAPIManager::Get(dev.device_type, true); if (api != nullptr) { - api->GetAttr(ctx, kind, ret); + api->GetAttr(dev, kind, ret); } else { *ret = 0; } } else { - DeviceAPIManager::Get(ctx)->GetAttr(ctx, kind, ret); + DeviceAPIManager::Get(dev)->GetAttr(dev, kind, ret); } }); diff --git a/src/runtime/contrib/arm_compute_lib/acl_allocator.cc b/src/runtime/contrib/arm_compute_lib/acl_allocator.cc index f9a67010e6e2d..b843841f57556 100644 --- a/src/runtime/contrib/arm_compute_lib/acl_allocator.cc +++ b/src/runtime/contrib/arm_compute_lib/acl_allocator.cc @@ -30,10 +30,10 @@ namespace contrib { void* ACLAllocator::allocate(size_t size, size_t alignment) { ICHECK_GT(size, 0) << "Cannot allocate size less than or equal to zero"; - return this->device_api_->AllocWorkspace(this->ctx_, size, {}); + return this->device_api_->AllocWorkspace(this->device_, size, {}); } -void ACLAllocator::free(void* ptr) { this->device_api_->FreeWorkspace(this->ctx_, ptr); } +void ACLAllocator::free(void* ptr) { this->device_api_->FreeWorkspace(this->device_, ptr); } std::unique_ptr ACLAllocator::make_region(size_t size, size_t alignment) { @@ -43,7 +43,7 @@ std::unique_ptr ACLAllocator::make_region(size_t siz ACLMemoryRegion::ACLMemoryRegion(size_t size, size_t alignment) : IMemoryRegion(size), ptr_(nullptr) { if (size != 0) { - this->ptr_ = this->device_api_->AllocDataSpace(this->ctx_, size, alignment, {}); + this->ptr_ = this->device_api_->AllocDataSpace(this->device_, size, alignment, {}); } } @@ -56,7 +56,7 @@ ACLMemoryRegion::ACLMemoryRegion(void* ptr, size_t size) ACLMemoryRegion::~ACLMemoryRegion() { if (this->ptr_ != nullptr && !is_subregion_) { - this->device_api_->FreeDataSpace(this->ctx_, this->ptr_); + this->device_api_->FreeDataSpace(this->device_, this->ptr_); } } diff --git a/src/runtime/contrib/arm_compute_lib/acl_allocator.h b/src/runtime/contrib/arm_compute_lib/acl_allocator.h index 49d0d0c764e81..d4e72a73314ff 100644 --- a/src/runtime/contrib/arm_compute_lib/acl_allocator.h +++ b/src/runtime/contrib/arm_compute_lib/acl_allocator.h @@ -74,9 +74,9 @@ class ACLAllocator : public arm_compute::IAllocator { private: /*! \brief Always allocate data in the context of the current CPU. */ - const TVMContext ctx_{kDLCPU, 0}; + const Device device_{kDLCPU, 0}; /*! \brief Device API which allows requests for memory from TVM. */ - runtime::DeviceAPI* device_api_ = runtime::DeviceAPI::Get(ctx_); + runtime::DeviceAPI* device_api_ = runtime::DeviceAPI::Get(device_); }; /*! 
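
The ACL allocator changes above funnel through the same two DeviceAPI entry points, now keyed by Device. A minimal sketch of that workspace round trip, assuming tvm/runtime/device_api.h and the post-rename signatures shown earlier in this patch; the function name and the 1024-byte size are illustrative:

    #include <tvm/runtime/device_api.h>

    void workspace_sketch() {
      tvm::Device dev{kDLCPU, 0};
      // Resolve the per-device-type API singleton, as ACLAllocator does.
      tvm::runtime::DeviceAPI* api = tvm::runtime::DeviceAPI::Get(dev);
      // The empty DLDataType hint mirrors the allocator calls above,
      // which pass {} when no dtype is known.
      void* scratch = api->AllocWorkspace(dev, 1024, {});
      // ... fill and consume the scratch buffer ...
      api->FreeWorkspace(dev, scratch);
    }
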
@@ -125,9 +125,9 @@ class ACLMemoryRegion : public arm_compute::IMemoryRegion { /*! \brief A subregion doesn't manage TVM memory so we don't need to free it. */ bool is_subregion_ = false; /*! \brief Always allocate data in the context of the current CPU. */ - const TVMContext ctx_{kDLCPU, 0}; + const Device device_{kDLCPU, 0}; /*! \brief Device API which allows requests for memory from TVM. */ - runtime::DeviceAPI* device_api_ = runtime::DeviceAPI::Get(ctx_); + runtime::DeviceAPI* device_api_ = runtime::DeviceAPI::Get(device_); }; } // namespace contrib diff --git a/src/runtime/contrib/coreml/coreml_runtime.mm b/src/runtime/contrib/coreml/coreml_runtime.mm index 18d4f735a55e6..5aef10ed8adf9 100644 --- a/src/runtime/contrib/coreml/coreml_runtime.mm +++ b/src/runtime/contrib/coreml/coreml_runtime.mm @@ -98,11 +98,11 @@ LOG(FATAL) << "unexpected data type " << data_desc.dataType; } MLMultiArray* src = [output_ featureValueForName:name].multiArrayValue; - TVMContext cpu_ctx = { + Device cpu_dev = { .device_type = kDLCPU, .device_id = 0, }; - NDArray ret = NDArray::Empty(shape, dtype, cpu_ctx); + NDArray ret = NDArray::Empty(shape, dtype, cpu_dev); ret.CopyFromBytes(src.dataPointer, size); return ret; diff --git a/src/runtime/contrib/cudnn/conv_forward.cc b/src/runtime/contrib/cudnn/conv_forward.cc index 223a5b4fe435a..ad3b959338bb1 100644 --- a/src/runtime/contrib/cudnn/conv_forward.cc +++ b/src/runtime/contrib/cudnn/conv_forward.cc @@ -41,8 +41,8 @@ void ConvolutionForward(int mode, int format, int algo, int dims, int groups, co entry_ptr->conv_entry.tensor_format = static_cast(format); // Set Algo entry_ptr->conv_entry.fwd_algo = static_cast(algo); - // Set Ctx - entry_ptr->conv_entry.ctx = x->ctx; + // Set Device + entry_ptr->conv_entry.device = x->device; // Set Data Type entry_ptr->conv_entry.data_type = CuDNNDataType::DLTypeToCuDNNType(String2DLDataType(conv_dtype)); cudnnDataType_t data_type = CuDNNDataType::DLTypeToCuDNNType(x->dtype); diff --git a/src/runtime/contrib/cudnn/cudnn_utils.cc b/src/runtime/contrib/cudnn/cudnn_utils.cc index cd934bcb70818..006064e57a19b 100644 --- a/src/runtime/contrib/cudnn/cudnn_utils.cc +++ b/src/runtime/contrib/cudnn/cudnn_utils.cc @@ -133,12 +133,12 @@ void ConvEntry::UpdateWorkspace(const size_t wsize) { CleanWorkspace(); } workspace_size = wsize; - workspace = cuda_api->AllocWorkspace(ctx, workspace_size); + workspace = cuda_api->AllocWorkspace(device, workspace_size); } } void ConvEntry::CleanWorkspace() { - if (workspace) cuda_api->FreeWorkspace(ctx, workspace); + if (workspace) cuda_api->FreeWorkspace(device, workspace); workspace_size = 0; } diff --git a/src/runtime/contrib/cudnn/cudnn_utils.h b/src/runtime/contrib/cudnn/cudnn_utils.h index 9b8e9fb33f985..72380b64121aa 100644 --- a/src/runtime/contrib/cudnn/cudnn_utils.h +++ b/src/runtime/contrib/cudnn/cudnn_utils.h @@ -72,7 +72,7 @@ struct ConvEntry { cudnnTensorDescriptor_t output_desc; cudnnConvolutionFwdAlgo_t fwd_algo; // cudnnMathType_t math_type; - TVMContext ctx; + Device device; runtime::DeviceAPI* cuda_api; void* workspace{nullptr}; size_t workspace_size{0}; diff --git a/src/runtime/contrib/edgetpu/edgetpu_runtime.cc b/src/runtime/contrib/edgetpu/edgetpu_runtime.cc index 13b3c34a6b175..af6b3cc42bfd2 100644 --- a/src/runtime/contrib/edgetpu/edgetpu_runtime.cc +++ b/src/runtime/contrib/edgetpu/edgetpu_runtime.cc @@ -31,7 +31,7 @@ namespace tvm { namespace runtime { -void EdgeTPURuntime::Init(const std::string& tflite_model_bytes, TVMContext ctx) { +void EdgeTPURuntime::Init(const 
std::string& tflite_model_bytes, Device dev) {
   const char* buffer = tflite_model_bytes.c_str();
   size_t buffer_size = tflite_model_bytes.size();
   // Load compiled model as a FlatBufferModel
@@ -53,12 +53,12 @@ void EdgeTPURuntime::Init(const std::string& tflite_model_bytes, TVMContext ctx)
   status = interpreter_->AllocateTensors();
   CHECK_TFLITE_STATUS(status) << "Failed to allocate tensors.";
-  ctx_ = ctx;
+  device_ = dev;
 }
-Module EdgeTPURuntimeCreate(const std::string& tflite_model_bytes, TVMContext ctx) {
+Module EdgeTPURuntimeCreate(const std::string& tflite_model_bytes, Device dev) {
   auto exec = make_object();
-  exec->Init(tflite_model_bytes, ctx);
+  exec->Init(tflite_model_bytes, dev);
   return Module(exec);
 }
diff --git a/src/runtime/contrib/edgetpu/edgetpu_runtime.h b/src/runtime/contrib/edgetpu/edgetpu_runtime.h
index af3517ba76f3e..a7a57ff422e30 100644
--- a/src/runtime/contrib/edgetpu/edgetpu_runtime.h
+++ b/src/runtime/contrib/edgetpu/edgetpu_runtime.h
@@ -25,6 +25,8 @@
 #ifndef TVM_RUNTIME_CONTRIB_EDGETPU_EDGETPU_RUNTIME_H_
 #define TVM_RUNTIME_CONTRIB_EDGETPU_EDGETPU_RUNTIME_H_
+#include
+
 #include
 #include
@@ -47,11 +49,11 @@ class EdgeTPURuntime : public TFLiteRuntime {
   const char* type_key() const final { return "EdgeTPURuntime"; }
   /*!
-   * \brief Initialize the edge TPU tflite runtime with tflite model and context.
+   * \brief Initialize the edge TPU tflite runtime with tflite model and device.
    * \param tflite_model_bytes The tflite model.
-   * \param ctx The context where the tflite model will be executed on.
+   * \param dev The device on which the tflite model will be executed.
    */
-  void Init(const std::string& tflite_model_bytes, TVMContext ctx);
+  void Init(const std::string& tflite_model_bytes, Device dev);
  private:
   std::shared_ptr edgetpu_context_;
diff --git a/src/runtime/contrib/ethosn/ethosn_device.cc b/src/runtime/contrib/ethosn/ethosn_device.cc
index c51c4288b7095..0ffbee29032b2 100644
--- a/src/runtime/contrib/ethosn/ethosn_device.cc
+++ b/src/runtime/contrib/ethosn/ethosn_device.cc
@@ -190,7 +190,8 @@ TVM_REGISTER_GLOBAL("relay.ethos-n.test.infra.inference_result")
     for (int argc = 0; argc < args.size(); argc++) {
       const DLTensor* tensor = args[argc];
       auto shape = std::vector(tensor->shape, tensor->shape + tensor->ndim);
-      test_outputs.emplace_back(tvm::runtime::NDArray::Empty(shape, tensor->dtype, tensor->ctx));
+      test_outputs.emplace_back(
+          tvm::runtime::NDArray::Empty(shape, tensor->dtype, tensor->device));
       test_outputs[test_outputs.size() - 1].CopyFrom(tensor);
     }
   });
diff --git a/src/runtime/contrib/miopen/conv_forward.cc b/src/runtime/contrib/miopen/conv_forward.cc
index e44a0650024bd..139801feef15f 100644
--- a/src/runtime/contrib/miopen/conv_forward.cc
+++ b/src/runtime/contrib/miopen/conv_forward.cc
@@ -59,8 +59,8 @@ TVM_REGISTER_GLOBAL("tvm.contrib.miopen.conv2d.setup").set_body([](TVMArgs args,
   if (n_group > 1) assert(mode > 1 && "Group /Depthwise Conv mode when num of groups > 1");
   // Set Mode
   entry_ptr->conv_entry.mode = static_cast(mode);
-  // Set Ctx
-  entry_ptr->conv_entry.ctx = TVMContext{kDLROCM, 0};
+  // Set Device
+  entry_ptr->conv_entry.device = Device{kDLROCM, 0};
   // Set Data Type
   entry_ptr->conv_entry.data_type = static_cast(dtype);  // MIOpen supports fp32(miopenFloat), fp16(miopenHalf),
@@ -106,11 +106,11 @@ TVM_REGISTER_GLOBAL("tvm.contrib.miopen.conv2d.setup").set_body([](TVMArgs args,
   runtime::DeviceAPI* rocm_api = entry_ptr->conv_entry.rocm_api;
   float* input_buf = static_cast(
-      rocm_api->AllocWorkspace(entry_ptr->conv_entry.ctx, input_size *
sizeof(float))); + rocm_api->AllocWorkspace(entry_ptr->conv_entry.device, input_size * sizeof(float))); float* filter_buf = static_cast( - rocm_api->AllocWorkspace(entry_ptr->conv_entry.ctx, filter_size * sizeof(float))); + rocm_api->AllocWorkspace(entry_ptr->conv_entry.device, filter_size * sizeof(float))); float* output_buf = static_cast( - rocm_api->AllocWorkspace(entry_ptr->conv_entry.ctx, output_size * sizeof(float))); + rocm_api->AllocWorkspace(entry_ptr->conv_entry.device, output_size * sizeof(float))); const int request_algo_count = 4; const bool exhaustive_search = false; @@ -125,9 +125,9 @@ TVM_REGISTER_GLOBAL("tvm.contrib.miopen.conv2d.setup").set_body([](TVMArgs args, entry_ptr->conv_entry.output_desc, output_buf, request_algo_count, &returned_algo_count, perfs, workspace, workspace_size, exhaustive_search)); - rocm_api->FreeWorkspace(entry_ptr->conv_entry.ctx, input_buf); - rocm_api->FreeWorkspace(entry_ptr->conv_entry.ctx, filter_buf); - rocm_api->FreeWorkspace(entry_ptr->conv_entry.ctx, output_buf); + rocm_api->FreeWorkspace(entry_ptr->conv_entry.device, input_buf); + rocm_api->FreeWorkspace(entry_ptr->conv_entry.device, filter_buf); + rocm_api->FreeWorkspace(entry_ptr->conv_entry.device, output_buf); const std::vector fwd_algo_names{ "miopenConvolutionFwdAlgoGEMM", @@ -166,8 +166,8 @@ TVM_REGISTER_GLOBAL("tvm.contrib.miopen.conv2d.forward") entry_ptr->conv_entry.fwd_algo = static_cast(algo); // Set Mode entry_ptr->conv_entry.mode = static_cast(mode); - // Set Ctx - entry_ptr->conv_entry.ctx = x->ctx; + // Set Device + entry_ptr->conv_entry.device = x->device; // Set Data Type entry_ptr->conv_entry.data_type = static_cast(dtype); // MIOpen supports fp32(miopenFloat), diff --git a/src/runtime/contrib/miopen/miopen_utils.cc b/src/runtime/contrib/miopen/miopen_utils.cc index a57918045d87b..426d2f24ddf53 100644 --- a/src/runtime/contrib/miopen/miopen_utils.cc +++ b/src/runtime/contrib/miopen/miopen_utils.cc @@ -80,12 +80,12 @@ void ConvEntry::UpdateWorkspace(const size_t wsize) { CleanWorkspace(); } workspace_size = wsize; - workspace = rocm_api->AllocWorkspace(ctx, workspace_size); + workspace = rocm_api->AllocWorkspace(device, workspace_size); } } void ConvEntry::CleanWorkspace() { - if (workspace) rocm_api->FreeWorkspace(ctx, workspace); + if (workspace) rocm_api->FreeWorkspace(device, workspace); workspace_size = 0; } diff --git a/src/runtime/contrib/miopen/miopen_utils.h b/src/runtime/contrib/miopen/miopen_utils.h index e5a769a974f02..d3a8c7b9ad644 100644 --- a/src/runtime/contrib/miopen/miopen_utils.h +++ b/src/runtime/contrib/miopen/miopen_utils.h @@ -52,7 +52,7 @@ struct ConvEntry { miopenTensorDescriptor_t input_desc; miopenTensorDescriptor_t output_desc; miopenConvFwdAlgorithm_t fwd_algo; - TVMContext ctx; + Device device; runtime::DeviceAPI* rocm_api; void* workspace{nullptr}; size_t workspace_size{0}; diff --git a/src/runtime/contrib/mps/conv.mm b/src/runtime/contrib/mps/conv.mm index b860ee29bdf58..84ca5e10f980d 100644 --- a/src/runtime/contrib/mps/conv.mm +++ b/src/runtime/contrib/mps/conv.mm @@ -31,10 +31,10 @@ id mtlbuf = (__bridge id)(buf->data); MetalThreadEntry* entry_ptr = MetalThreadEntry::ThreadLocal(); runtime::metal::MetalThreadEntry* rt = runtime::metal::MetalThreadEntry::ThreadLocal(); - id dev = entry_ptr->metal_api->GetDevice(buf->ctx); - id temp = rt->GetTempBuffer(buf->ctx, [mtlbuf length]); + id dev = entry_ptr->metal_api->GetDevice(buf->device); + id temp = rt->GetTempBuffer(buf->device, [mtlbuf length]); 
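The MIOpen hunks above all reduce to one pattern: workspace memory is obtained from and returned to the DeviceAPI singleton, now keyed by Device instead of TVMContext. Below is a minimal sketch of that pairing on a ROCm device, not the patch's own code; the function name is hypothetical, and it assumes the two-argument AllocWorkspace call shown in the hunk (type_hint defaulted) is available.

#include <cstddef>
#include <tvm/runtime/device_api.h>

// Hedged sketch of the AllocWorkspace/FreeWorkspace pairing used by the MIOpen
// contrib code after the TVMContext -> Device rename.
void RocmWorkspaceRoundTrip(size_t nbytes) {
  DLDevice dev = {kDLROCM, 0};  // was: TVMContext{kDLROCM, 0}
  tvm::runtime::DeviceAPI* rocm_api = tvm::runtime::DeviceAPI::Get(dev);
  void* buf = rocm_api->AllocWorkspace(dev, nbytes);
  // ... hand buf to the MIOpen find/forward calls ...
  rocm_api->FreeWorkspace(dev, buf);
}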
entry_ptr->metal_api->CopyDataFromTo((__bridge void*)mtlbuf, 0, (__bridge void*)temp, 0, - [mtlbuf length], buf -> ctx, buf -> ctx, buf -> dtype, + [mtlbuf length], buf -> device, buf -> device, buf -> dtype, nullptr); MPSImageDescriptor* desc = @@ -63,14 +63,14 @@ MPSImage* mpsimg = (__bridge MPSImage*)(img->data); MetalThreadEntry* entry_ptr = MetalThreadEntry::ThreadLocal(); runtime::metal::MetalThreadEntry* rt = runtime::metal::MetalThreadEntry::ThreadLocal(); - id temp = rt->GetTempBuffer(buf->ctx, [mtlbuf length]); + id temp = rt->GetTempBuffer(buf->device, [mtlbuf length]); [mpsimg readBytes:[temp contents] dataLayout:MPSDataLayoutHeightxWidthxFeatureChannels imageIndex:0]; entry_ptr->metal_api->CopyDataFromTo((__bridge void*)temp, 0, (__bridge void*)mtlbuf, 0, - [mtlbuf length], buf -> ctx, buf -> ctx, buf -> dtype, + [mtlbuf length], buf -> device, buf -> device, buf -> dtype, nullptr); }); @@ -102,8 +102,8 @@ // Get Metal device API MetalThreadEntry* entry_ptr = MetalThreadEntry::ThreadLocal(); runtime::metal::MetalThreadEntry* rt = runtime::metal::MetalThreadEntry::ThreadLocal(); - id dev = entry_ptr->metal_api->GetDevice(data->ctx); - id queue = entry_ptr->metal_api->GetCommandQueue(data->ctx); + id dev = entry_ptr->metal_api->GetDevice(data->device); + id queue = entry_ptr->metal_api->GetCommandQueue(data->device); id cb = [queue commandBuffer]; // data to MPSImage DLTensor tmp_in; @@ -111,10 +111,10 @@ MPSImage* tempA = (__bridge MPSImage*)tmp_in.data; // weight to temp memory id bufB = (__bridge id)(weight->data); - id tempB = rt->GetTempBuffer(weight->ctx, [bufB length]); + id tempB = rt->GetTempBuffer(weight->device, [bufB length]); entry_ptr->metal_api->CopyDataFromTo((__bridge void*)bufB, 0, (__bridge void*)tempB, 0, - [bufB length], weight -> ctx, weight -> ctx, tmp_in.dtype, - nullptr); + [bufB length], weight -> device, weight -> device, + tmp_in.dtype, nullptr); float* ptr_w = (float*)[tempB contents]; // output to MPSImage DLTensor tmp_out; diff --git a/src/runtime/contrib/mps/gemm.mm b/src/runtime/contrib/mps/gemm.mm index c1d80dbed7f39..db3a80ba49d87 100644 --- a/src/runtime/contrib/mps/gemm.mm +++ b/src/runtime/contrib/mps/gemm.mm @@ -42,10 +42,10 @@ ICHECK(TypeMatch(C->dtype, kDLFloat, 32)); // Get Metal device API MetalThreadEntry* entry_ptr = MetalThreadEntry::ThreadLocal(); - // ICHECK_EQ(A->ctx, B->ctx); - // ICHECK_EQ(A->ctx, C->ctx); - id dev = entry_ptr->metal_api->GetDevice(A->ctx); - id queue = entry_ptr->metal_api->GetCommandQueue(A->ctx); + // ICHECK_EQ(A->device, B->device); + // ICHECK_EQ(A->device, C->device); + id dev = entry_ptr->metal_api->GetDevice(A->device); + id queue = entry_ptr->metal_api->GetCommandQueue(A->device); id cb = [queue commandBuffer]; NSUInteger M = A->shape[0 + (transa ? 1 : 0)]; NSUInteger N = B->shape[1 - (transb ? 
1 : 0)]; diff --git a/src/runtime/contrib/nnpack/convolution.cc b/src/runtime/contrib/nnpack/convolution.cc index 0d63594959026..2362e31f92eef 100644 --- a/src/runtime/contrib/nnpack/convolution.cc +++ b/src/runtime/contrib/nnpack/convolution.cc @@ -99,12 +99,12 @@ TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_inference") // Division with rounding up, in case size is not multiple of sizeof(float) const size_t workspace_elements = (workspace_size + sizeof(float) - 1) / sizeof(float); - TVMContext ctx = input->ctx; + Device dev = input->device; DLDataType type_hint = input->dtype; - DeviceAPI* cpu_api = DeviceAPI::Get(ctx); + DeviceAPI* cpu_api = DeviceAPI::Get(dev); void* workspace_buffer = - cpu_api->AllocWorkspace(ctx, workspace_elements * sizeof(float), type_hint); + cpu_api->AllocWorkspace(dev, workspace_elements * sizeof(float), type_hint); ICHECK(workspace_buffer != nullptr); for (auto n = 0; n < input->shape[0]; ++n) { @@ -122,7 +122,7 @@ TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_inference") ICHECK_EQ(status, nnp_status_success); } - cpu_api->FreeWorkspace(ctx, workspace_buffer); + cpu_api->FreeWorkspace(dev, workspace_buffer); }); TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_inference_without_weight_transform") @@ -188,12 +188,12 @@ TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_inference_without_weight_tra // Division with rounding up, in case size is not multiple of sizeof(float) const size_t workspace_elements = (workspace_size + sizeof(float) - 1) / sizeof(float); - TVMContext ctx = input->ctx; + Device dev = input->device; DLDataType type_hint = input->dtype; - DeviceAPI* cpu_api = DeviceAPI::Get(ctx); + DeviceAPI* cpu_api = DeviceAPI::Get(dev); void* workspace_buffer = - cpu_api->AllocWorkspace(ctx, workspace_elements * sizeof(float), type_hint); + cpu_api->AllocWorkspace(dev, workspace_elements * sizeof(float), type_hint); ICHECK(workspace_buffer != nullptr); for (auto n = 0; n < input->shape[0]; ++n) { @@ -211,7 +211,7 @@ TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_inference_without_weight_tra ICHECK_EQ(status, nnp_status_success); } - cpu_api->FreeWorkspace(ctx, workspace_buffer); + cpu_api->FreeWorkspace(dev, workspace_buffer); }); TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_inference_weight_transform") diff --git a/src/runtime/contrib/random/mt_random_engine.cc b/src/runtime/contrib/random/mt_random_engine.cc index 81f46b2dd5d5f..a1c6dc2498c88 100644 --- a/src/runtime/contrib/random/mt_random_engine.cc +++ b/src/runtime/contrib/random/mt_random_engine.cc @@ -82,7 +82,7 @@ class RandomEngine { ICHECK(dtype.code == kDLFloat && dtype.bits == 32 && dtype.lanes == 1); - if (data->ctx.device_type == kDLCPU) { + if (data->device.device_type == kDLCPU) { std::uniform_real_distribution uniform_dist(low, high); std::generate_n(static_cast(data->data), size, [&]() { return uniform_dist(rnd_engine_); }); @@ -106,7 +106,7 @@ class RandomEngine { ICHECK(dtype.code == kDLFloat && dtype.bits == 32 && dtype.lanes == 1); - if (data->ctx.device_type == kDLCPU) { + if (data->device.device_type == kDLCPU) { std::normal_distribution normal_dist(loc, scale); std::generate_n(static_cast(data->data), size, [&]() { return normal_dist(rnd_engine_); }); @@ -121,7 +121,7 @@ class RandomEngine { size *= data->shape[i]; } - if (data->ctx.device_type == kDLCPU) { + if (data->device.device_type == kDLCPU) { FillData(data, size); } else { runtime::NDArray local = runtime::NDArray::Empty( diff --git a/src/runtime/contrib/random/random.cc 
b/src/runtime/contrib/random/random.cc index 2d111bc322ab7..2cb56b87fdf57 100644 --- a/src/runtime/contrib/random/random.cc +++ b/src/runtime/contrib/random/random.cc @@ -89,7 +89,7 @@ TVM_REGISTER_GLOBAL("tvm.contrib.random.randint").set_body([](TVMArgs args, TVMR low = std::max(low, numeric_low); high = std::min(high, numeric_high); - if (out->ctx.device_type == kDLCPU) { + if (out->device.device_type == kDLCPU) { // file the data with random byte std::generate_n(static_cast(out->data), size, [&]() { unsigned rint = entry->random_engine.GetRandInt(); diff --git a/src/runtime/contrib/tensorrt/tensorrt_builder.cc b/src/runtime/contrib/tensorrt/tensorrt_builder.cc index 09b36d720877d..e98413eacc7c9 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_builder.cc +++ b/src/runtime/contrib/tensorrt/tensorrt_builder.cc @@ -191,7 +191,7 @@ TensorRTEngineAndContext TensorRTBuilder::BuildEngine() { nvinfer1::Weights TensorRTBuilder::GetDLTensorAsWeights(const DLTensor* dptr, DLDeviceType src_device) { - ICHECK_EQ(dptr->ctx.device_type, src_device); + ICHECK_EQ(dptr->device.device_type, src_device); ICHECK(static_cast(dptr->dtype.code) == kDLFloat || static_cast(dptr->dtype.code) == kDLInt); const auto trt_dtype = static_cast(dptr->dtype.code) == kDLFloat @@ -248,7 +248,7 @@ void TensorRTBuilder::CleanUp() { void TensorRTBuilder::AllocateDeviceBuffer(nvinfer1::ICudaEngine* engine, const std::string& name, std::vector* device_buffers) { const uint32_t entry_id = entry_id_map_[name]; - if (data_entry_[entry_id]->ctx.device_type != kDLGPU) { + if (data_entry_[entry_id]->device.device_type != kDLGPU) { const int binding_index = engine->getBindingIndex(name.c_str()); ICHECK_NE(binding_index, -1); std::vector shape(data_entry_[entry_id]->shape, diff --git a/src/runtime/contrib/tensorrt/tensorrt_runtime.cc b/src/runtime/contrib/tensorrt/tensorrt_runtime.cc index e28c5a8c61d04..e6eb28c10af69 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_runtime.cc +++ b/src/runtime/contrib/tensorrt/tensorrt_runtime.cc @@ -135,7 +135,7 @@ class TensorRTRuntime : public JSONRuntimeBase { const std::string name = nodes_[nid].GetOpName() + "_" + std::to_string(j); int binding_index = engine->getBindingIndex(name.c_str()); ICHECK_NE(binding_index, -1); - if (data_entry_[eid]->ctx.device_type == kDLGPU) { + if (data_entry_[eid]->device.device_type == kDLGPU) { bindings[binding_index] = data_entry_[eid]->data; } else { device_buffers[binding_index].CopyFrom(data_entry_[eid]); @@ -150,7 +150,7 @@ class TensorRTRuntime : public JSONRuntimeBase { const std::string& name = engine_and_context.outputs[i]; int binding_index = engine->getBindingIndex(name.c_str()); ICHECK_NE(binding_index, -1); - if (data_entry_[eid]->ctx.device_type == kDLGPU) { + if (data_entry_[eid]->device.device_type == kDLGPU) { bindings[binding_index] = data_entry_[eid]->data; } else { bindings[binding_index] = device_buffers[binding_index]->data; @@ -173,7 +173,7 @@ class TensorRTRuntime : public JSONRuntimeBase { const std::string& name = engine_and_context.outputs[i]; int binding_index = engine->getBindingIndex(name.c_str()); ICHECK_NE(binding_index, -1); - if (data_entry_[eid]->ctx.device_type != kDLGPU) { + if (data_entry_[eid]->device.device_type != kDLGPU) { device_buffers[binding_index].CopyTo(const_cast(data_entry_[eid])); } } diff --git a/src/runtime/contrib/tflite/tflite_runtime.cc b/src/runtime/contrib/tflite/tflite_runtime.cc index 9a434fde29553..2d323ba51d98f 100644 --- a/src/runtime/contrib/tflite/tflite_runtime.cc +++ 
b/src/runtime/contrib/tflite/tflite_runtime.cc @@ -90,7 +90,7 @@ DataType TfLiteDType2TVMDType(TfLiteType dtype) { } } -void TFLiteRuntime::Init(const std::string& tflite_model_bytes, TVMContext ctx) { +void TFLiteRuntime::Init(const std::string& tflite_model_bytes, Device dev) { const char* buffer = tflite_model_bytes.c_str(); size_t buffer_size = tflite_model_bytes.size(); // The buffer used to construct the model must be kept alive for @@ -107,7 +107,7 @@ void TFLiteRuntime::Init(const std::string& tflite_model_bytes, TVMContext ctx) status = interpreter_->AllocateTensors(); CHECK_TFLITE_STATUS(status) << "Failed to allocate tensors."; - ctx_ = ctx; + device_ = dev; } void TFLiteRuntime::Invoke() { interpreter_->Invoke(); } @@ -140,7 +140,7 @@ NDArray TFLiteRuntime::GetOutput(int index) const { shape.push_back(dims->data[i]); size *= dims->data[i]; } - NDArray ret = NDArray::Empty(shape, dtype, ctx_); + NDArray ret = NDArray::Empty(shape, dtype, device_); TVM_DTYPE_DISPATCH(dtype, DType, { DType* dest = static_cast(ret->data); DType* src = interpreter_->typed_output_tensor(index); @@ -176,9 +176,9 @@ PackedFunc TFLiteRuntime::GetFunction(const std::string& name, } } -Module TFLiteRuntimeCreate(const std::string& tflite_model_bytes, TVMContext ctx) { +Module TFLiteRuntimeCreate(const std::string& tflite_model_bytes, Device dev) { auto exec = make_object(); - exec->Init(tflite_model_bytes, ctx); + exec->Init(tflite_model_bytes, dev); return Module(exec); } diff --git a/src/runtime/contrib/tflite/tflite_runtime.h b/src/runtime/contrib/tflite/tflite_runtime.h index 3311f10975bea..759be24b94ec7 100644 --- a/src/runtime/contrib/tflite/tflite_runtime.h +++ b/src/runtime/contrib/tflite/tflite_runtime.h @@ -67,11 +67,11 @@ class TFLiteRuntime : public ModuleNode { void Invoke(); /*! - * \brief Initialize the tflite runtime with tflite model and context. + * \brief Initialize the tflite runtime with a tflite model and a device. * \param tflite_model_bytes The tflite model. - * \param ctx The context where the tflite model will be executed on. + * \param dev The device on which the tflite model will be executed. */ - void Init(const std::string& tflite_model_bytes, TVMContext ctx); + void Init(const std::string& tflite_model_bytes, Device dev); /*! * \brief set index-th input to the model.
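For the caller's side of the rename above: a minimal sketch, assuming TVM was built with the TFLite contrib runtime enabled. model_bytes and CreateTFLiteModule are hypothetical names; the forward declaration simply restates the signature from the hunk (Device is TVM's alias for DLDevice).

#include <string>
#include <tvm/runtime/module.h>

namespace tvm {
namespace runtime {
// Restated from tflite_runtime.cc above.
Module TFLiteRuntimeCreate(const std::string& tflite_model_bytes, DLDevice dev);
}  // namespace runtime
}  // namespace tvm

// Hypothetical call site: build the runtime module for host (CPU) execution.
tvm::runtime::Module CreateTFLiteModule(const std::string& model_bytes) {
  DLDevice dev = {kDLCPU, 0};  // was: TVMContext ctx = {kDLCPU, 0};
  return tvm::runtime::TFLiteRuntimeCreate(model_bytes, dev);
}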
@@ -103,8 +103,8 @@ class TFLiteRuntime : public ModuleNode { std::unique_ptr flatBuffersBuffer_; // TFLite interpreter std::unique_ptr interpreter_; - // TVM context - TVMContext ctx_; + // TVM device + Device device_; }; } // namespace runtime diff --git a/src/runtime/cpu_device_api.cc b/src/runtime/cpu_device_api.cc index 133bb01d7d137..774335f5660be 100644 --- a/src/runtime/cpu_device_api.cc +++ b/src/runtime/cpu_device_api.cc @@ -38,14 +38,13 @@ namespace tvm { namespace runtime { class CPUDeviceAPI final : public DeviceAPI { public: - void SetDevice(TVMContext ctx) final {} - void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final { + void SetDevice(Device dev) final {} + void GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) final { if (kind == kExist) { *rv = 1; } } - void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, - DLDataType type_hint) final { + void* AllocDataSpace(Device dev, size_t nbytes, size_t alignment, DLDataType type_hint) final { void* ptr; #if _MSC_VER ptr = _aligned_malloc(nbytes, alignment); @@ -61,7 +60,7 @@ class CPUDeviceAPI final : public DeviceAPI { return ptr; } - void FreeDataSpace(TVMContext ctx, void* ptr) final { + void FreeDataSpace(Device dev, void* ptr) final { #if _MSC_VER _aligned_free(ptr); #else @@ -69,10 +68,10 @@ class CPUDeviceAPI final : public DeviceAPI { #endif } - void StreamSync(TVMContext ctx, TVMStreamHandle stream) final {} + void StreamSync(Device dev, TVMStreamHandle stream) final {} - void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final; - void FreeWorkspace(TVMContext ctx, void* data) final; + void* AllocWorkspace(Device dev, size_t size, DLDataType type_hint) final; + void FreeWorkspace(Device dev, void* data) final; static CPUDeviceAPI* Global() { // NOTE: explicitly use new to avoid exit-time destruction of global state @@ -83,7 +82,7 @@ class CPUDeviceAPI final : public DeviceAPI { protected: void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, - TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, + Device dev_from, Device dev_to, DLDataType type_hint, TVMStreamHandle stream) final { memcpy(static_cast(to) + to_offset, static_cast(from) + from_offset, size); } @@ -93,12 +92,12 @@ struct CPUWorkspacePool : public WorkspacePool { CPUWorkspacePool() : WorkspacePool(kDLCPU, CPUDeviceAPI::Global()) {} }; -void* CPUDeviceAPI::AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) { - return dmlc::ThreadLocalStore::Get()->AllocWorkspace(ctx, size); +void* CPUDeviceAPI::AllocWorkspace(Device dev, size_t size, DLDataType type_hint) { + return dmlc::ThreadLocalStore::Get()->AllocWorkspace(dev, size); } -void CPUDeviceAPI::FreeWorkspace(TVMContext ctx, void* data) { - dmlc::ThreadLocalStore::Get()->FreeWorkspace(ctx, data); +void CPUDeviceAPI::FreeWorkspace(Device dev, void* data) { + dmlc::ThreadLocalStore::Get()->FreeWorkspace(dev, data); } TVM_REGISTER_GLOBAL("device_api.cpu").set_body([](TVMArgs args, TVMRetValue* rv) { diff --git a/src/runtime/crt/common/crt_backend_api.c b/src/runtime/crt/common/crt_backend_api.c index 9656b07a76815..9a12bc28240a3 100644 --- a/src/runtime/crt/common/crt_backend_api.c +++ b/src/runtime/crt/common/crt_backend_api.c @@ -36,9 +36,9 @@ void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t nbytes, int dtype_bits_hint) { tvm_crt_error_t err = kTvmErrorNoError; void* ptr = 0; - DLContext ctx = {device_type, device_id}; + DLDevice dev = {device_type, 
device_id}; assert(nbytes > 0); - err = TVMPlatformMemoryAllocate(nbytes, ctx, &ptr); + err = TVMPlatformMemoryAllocate(nbytes, dev, &ptr); CHECK_EQ(err, kTvmErrorNoError, "TVMBackendAllocWorkspace(%d, %d, %" PRIu64 ", %d, %d) -> %" PRId32, device_type, device_id, nbytes, dtype_code_hint, dtype_bits_hint, err); @@ -47,8 +47,8 @@ void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t nbytes, int TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr) { tvm_crt_error_t err = kTvmErrorNoError; - DLContext ctx = {device_type, device_id}; - err = TVMPlatformMemoryFree(ptr, ctx); + DLDevice dev = {device_type, device_id}; + err = TVMPlatformMemoryFree(ptr, dev); return err; } diff --git a/src/runtime/crt/common/crt_runtime_api.c b/src/runtime/crt/common/crt_runtime_api.c index c2eb1ff903e3e..6634cfa7db89c 100644 --- a/src/runtime/crt/common/crt_runtime_api.c +++ b/src/runtime/crt/common/crt_runtime_api.c @@ -65,11 +65,11 @@ int TVMArrayAlloc(const tvm_index_t* shape, int ndim, int dtype_code, int dtype_ dtype.code = dtype_code; dtype.bits = dtype_bits; dtype.lanes = dtype_lanes; - DLContext ctx; - ctx.device_type = (DLDeviceType)device_type; - ctx.device_id = device_id; + DLDevice dev; + dev.device_type = (DLDeviceType)device_type; + dev.device_id = device_id; TVMNDArray arr; - int status = TVMNDArray_Empty(ndim, shape, dtype, ctx, &arr); + int status = TVMNDArray_Empty(ndim, shape, dtype, dev, &arr); if (status != 0) { return status; } @@ -83,16 +83,16 @@ int TVMArrayFree(TVMArrayHandle handle) { return TVMNDArray_Release(&arr); } -int TVMDeviceAllocDataSpace(DLContext ctx, size_t nbytes, size_t alignment, DLDataType type_hint, +int TVMDeviceAllocDataSpace(DLDevice dev, size_t nbytes, size_t alignment, DLDataType type_hint, void** out_data) { if (alignment != 1) { nbytes = (nbytes + alignment - 1) / alignment * alignment; } - return TVMPlatformMemoryAllocate(nbytes, ctx, out_data); + return TVMPlatformMemoryAllocate(nbytes, dev, out_data); } -int TVMDeviceAllocDataSpaceWithScope(DLContext ctx, int ndim, const int64_t* shape, - DLDataType dtype, const char* mem_scope, void** out_data) { +int TVMDeviceAllocDataSpaceWithScope(DLDevice dev, int ndim, const int64_t* shape, DLDataType dtype, + const char* mem_scope, void** out_data) { size_t nbytes = 1; for (int i = 0; i < ndim; ++i) { nbytes *= shape[i]; @@ -102,10 +102,10 @@ int TVMDeviceAllocDataSpaceWithScope(DLContext ctx, int ndim, const int64_t* sha int kAllocAlignment = 128; size_t align = (dtype.bits / 8) * dtype.lanes; if (align < kAllocAlignment) align = kAllocAlignment; - return TVMDeviceAllocDataSpace(ctx, nbytes, align, dtype, out_data); + return TVMDeviceAllocDataSpace(dev, nbytes, align, dtype, out_data); } -int TVMDeviceFreeDataSpace(TVMContext ctx, void* ptr) { return TVMPlatformMemoryFree(ptr, ctx); } +int TVMDeviceFreeDataSpace(DLDevice dev, void* ptr) { return TVMPlatformMemoryFree(ptr, dev); } static bool IsContiguous(const DLTensor* arr) { if (arr->strides == NULL) return true; @@ -237,13 +237,13 @@ static int DecodeFunctionHandle(TVMFunctionHandle handle, tvm_module_index_t* mo } int TVMByteArrayFree(TVMByteArray* arr) { - DLContext ctx = {kDLCPU, 0}; - int to_return = TVMPlatformMemoryFree((void*)arr->data, ctx); + DLDevice dev = {kDLCPU, 0}; + int to_return = TVMPlatformMemoryFree((void*)arr->data, dev); if (to_return != 0) { return to_return; } - return TVMPlatformMemoryFree((void*)arr, ctx); + return TVMPlatformMemoryFree((void*)arr, dev); } tvm_crt_error_t RunTimeEvaluator(tvm_function_index_t 
function_index, TVMValue* args, @@ -376,18 +376,18 @@ tvm_crt_error_t TVMInitializeRuntime() { tvm_crt_error_t error = kTvmErrorNoError; void* func_registry_memory = NULL; - DLContext ctx = {kDLCPU, 0}; - error = TVMPlatformMemoryAllocate(TVM_CRT_GLOBAL_FUNC_REGISTRY_SIZE_BYTES, ctx, + DLDevice dev = {kDLCPU, 0}; + error = TVMPlatformMemoryAllocate(TVM_CRT_GLOBAL_FUNC_REGISTRY_SIZE_BYTES, dev, &func_registry_memory); if (error != kTvmErrorNoError) { return error; } void* registry_backing_memory; - error = TVMPlatformMemoryAllocate(TVM_CRT_GLOBAL_FUNC_REGISTRY_SIZE_BYTES, ctx, + error = TVMPlatformMemoryAllocate(TVM_CRT_GLOBAL_FUNC_REGISTRY_SIZE_BYTES, dev, ®istry_backing_memory); if (error != kTvmErrorNoError) { - TVMPlatformMemoryFree(func_registry_memory, ctx); + TVMPlatformMemoryFree(func_registry_memory, dev); return error; } @@ -412,8 +412,8 @@ tvm_crt_error_t TVMInitializeRuntime() { } if (error != kTvmErrorNoError) { - TVMPlatformMemoryFree(registry_backing_memory, ctx); - TVMPlatformMemoryFree(func_registry_memory, ctx); + TVMPlatformMemoryFree(registry_backing_memory, dev); + TVMPlatformMemoryFree(func_registry_memory, dev); } return error; @@ -422,7 +422,7 @@ tvm_crt_error_t TVMInitializeRuntime() { typedef struct { uint16_t function_index; TVMFunctionHandle func_to_time; - TVMContext ctx; + DLDevice device; int number; int repeat; int min_repeat_ms; @@ -447,8 +447,8 @@ int RPCTimeEvaluator(TVMValue* args, int* type_codes, int num_args, TVMValue* re TVMModuleHandle mod = (TVMModuleHandle)args[0].v_handle; const char* name = args[1].v_str; - g_time_evaluator_state.ctx.device_type = args[2].v_int64; - g_time_evaluator_state.ctx.device_id = args[3].v_int64; + g_time_evaluator_state.device.device_type = args[2].v_int64; + g_time_evaluator_state.device.device_id = args[3].v_int64; g_time_evaluator_state.number = args[4].v_int64; g_time_evaluator_state.repeat = args[5].v_int64; g_time_evaluator_state.min_repeat_ms = args[6].v_int64; @@ -474,16 +474,16 @@ tvm_crt_error_t RunTimeEvaluator(tvm_function_index_t function_index, TVMValue* } // TODO(areusch): should *really* rethink needing to return doubles - DLContext result_byte_ctx = {kDLCPU, 0}; + DLDevice result_byte_dev = {kDLCPU, 0}; TVMByteArray* result_byte_arr = NULL; tvm_crt_error_t err = - TVMPlatformMemoryAllocate(sizeof(TVMByteArray), result_byte_ctx, (void*)&result_byte_arr); + TVMPlatformMemoryAllocate(sizeof(TVMByteArray), result_byte_dev, (void*)&result_byte_arr); if (err != kTvmErrorNoError) { goto release_and_return; } result_byte_arr->data = NULL; size_t data_size = sizeof(double) * g_time_evaluator_state.repeat; - err = TVMPlatformMemoryAllocate(data_size, result_byte_ctx, (void*)&result_byte_arr->data); + err = TVMPlatformMemoryAllocate(data_size, result_byte_dev, (void*)&result_byte_arr->data); if (err != kTvmErrorNoError) { goto release_and_return; } @@ -527,9 +527,9 @@ tvm_crt_error_t RunTimeEvaluator(tvm_function_index_t function_index, TVMValue* release_and_return : { tvm_crt_error_t release_err = - TVMPlatformMemoryFree((void*)&result_byte_arr->data, result_byte_ctx); + TVMPlatformMemoryFree((void*)&result_byte_arr->data, result_byte_dev); if (release_err != kTvmErrorNoError) { - release_err = TVMPlatformMemoryFree((void*)&result_byte_arr, result_byte_ctx); + release_err = TVMPlatformMemoryFree((void*)&result_byte_arr, result_byte_dev); } if (err == kTvmErrorNoError && release_err != kTvmErrorNoError) { diff --git a/src/runtime/crt/common/ndarray.c b/src/runtime/crt/common/ndarray.c index 
c90a4667903c4..fb8fc8022f430 100644 --- a/src/runtime/crt/common/ndarray.c +++ b/src/runtime/crt/common/ndarray.c @@ -30,26 +30,26 @@ #include "crt_config.h" -int TVMNDArray_Create(int32_t ndim, const tvm_index_t* shape, DLDataType dtype, DLContext ctx, +int TVMNDArray_Create(int32_t ndim, const tvm_index_t* shape, DLDataType dtype, DLDevice dev, TVMNDArray* array) { memset(array, 0, sizeof(TVMNDArray)); array->dl_tensor.ndim = ndim; tvm_crt_error_t err; - DLContext dlctx = {kDLCPU, 0}; - err = TVMPlatformMemoryAllocate(sizeof(int64_t) * ndim, dlctx, (void*)&array->dl_tensor.shape); + DLDevice dldev = {kDLCPU, 0}; + err = TVMPlatformMemoryAllocate(sizeof(int64_t) * ndim, dldev, (void*)&array->dl_tensor.shape); if (err != kTvmErrorNoError) { return -1; } memcpy(array->dl_tensor.shape, shape, sizeof(int64_t) * ndim); array->dl_tensor.dtype = dtype; - array->dl_tensor.ctx = ctx; + array->dl_tensor.device = dev; array->dl_tensor.data = 0; return 0; } -int TVMNDArray_Empty(int32_t ndim, const tvm_index_t* shape, DLDataType dtype, DLContext ctx, +int TVMNDArray_Empty(int32_t ndim, const tvm_index_t* shape, DLDataType dtype, DLDevice dev, TVMNDArray* array) { - int status = TVMNDArray_Create(ndim, shape, dtype, ctx, array); + int status = TVMNDArray_Create(ndim, shape, dtype, dev, array); if (status != 0) { return status; } @@ -76,11 +76,11 @@ int TVMNDArray_Load(TVMNDArray* ret, const char** strm) { } memcpy(&reserved, *strm, sizeof(reserved)); *strm += sizeof(reserved); - DLContext ctx; + DLDevice dev; int ndim; // sizeof ndim should match dlpack DLDataType dtype; - memcpy(&ctx, *strm, sizeof(ctx)); - *strm += sizeof(ctx); + memcpy(&dev, *strm, sizeof(dev)); + *strm += sizeof(dev); memcpy(&ndim, *strm, sizeof(ndim)); *strm += sizeof(ndim); memcpy(&dtype, *strm, sizeof(dtype)); @@ -89,8 +89,8 @@ int TVMNDArray_Load(TVMNDArray* ret, const char** strm) { fprintf(stderr, "Invalid ndim=%d: expected to be 0 ~ %d.\n", ndim, TVM_CRT_MAX_NDIM); status = -1; } - if (ctx.device_type != kDLCPU) { - fprintf(stderr, "Invalid DLTensor context: can only save as CPU tensor\n"); + if (dev.device_type != kDLCPU) { + fprintf(stderr, "Invalid DLTensor device: can only save as CPU tensor\n"); status = -1; } int64_t shape[TVM_CRT_MAX_NDIM] = {0}; @@ -101,7 +101,7 @@ int TVMNDArray_Load(TVMNDArray* ret, const char** strm) { *strm += sizeof(shape[idx]); } } - status = TVMNDArray_Empty(ndim, shape, dtype, ctx, ret); + status = TVMNDArray_Empty(ndim, shape, dtype, dev, ret); if (status != 0) { return status; } @@ -128,7 +128,7 @@ int TVMNDArray_Load(TVMNDArray* ret, const char** strm) { int TVMNDArray_CreateView(TVMNDArray* arr, const tvm_index_t* shape, int32_t ndim, DLDataType dtype, TVMNDArray* array_view) { - int status = TVMNDArray_Create(ndim, shape, dtype, arr->dl_tensor.ctx, array_view); + int status = TVMNDArray_Create(ndim, shape, dtype, arr->dl_tensor.device, array_view); if (status != 0) { return status; } @@ -138,15 +138,15 @@ int TVMNDArray_CreateView(TVMNDArray* arr, const tvm_index_t* shape, int32_t ndi int TVMNDArray_Release(TVMNDArray* arr) { tvm_crt_error_t err; - DLContext ctx = {kDLCPU, 0}; + DLDevice dev = {kDLCPU, 0}; - err = TVMPlatformMemoryFree(arr->dl_tensor.data, ctx); + err = TVMPlatformMemoryFree(arr->dl_tensor.data, dev); if (err != kTvmErrorNoError) { return err; } arr->dl_tensor.data = 0; - err = TVMPlatformMemoryFree(arr->dl_tensor.shape, ctx); + err = TVMPlatformMemoryFree(arr->dl_tensor.shape, dev); if (err != kTvmErrorNoError) { return err; } diff --git 
a/src/runtime/crt/graph_runtime/graph_runtime.c b/src/runtime/crt/graph_runtime/graph_runtime.c index a79be56636598..f0a1eb2da870b 100644 --- a/src/runtime/crt/graph_runtime/graph_runtime.c +++ b/src/runtime/crt/graph_runtime/graph_runtime.c @@ -137,9 +137,9 @@ int TVMGraphRuntimeNode_Load(TVMGraphRuntimeNode* node, JSONReader* reader) { fprintf(stderr, "error determining inputs array length\n"); break; } - DLContext ctx = {kDLCPU, 0}; + DLDevice dev = {kDLCPU, 0}; tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(TVMGraphRuntimeNodeEntry) * num_inputs, - ctx, (void**)&node->inputs); + dev, (void**)&node->inputs); if (err != kTvmErrorNoError) { fprintf(stderr, "memory allocate error: %08x", err); return -1; @@ -214,8 +214,8 @@ int TVMGraphRuntimeNodeRelease(TVMGraphRuntimeNode* node) { return 0; } if (node->inputs) { - DLContext ctx = {kDLCPU, 0}; - tvm_crt_error_t err = TVMPlatformMemoryFree(node->inputs, ctx); + DLDevice dev = {kDLCPU, 0}; + tvm_crt_error_t err = TVMPlatformMemoryFree(node->inputs, dev); node->inputs = 0; if (err != kTvmErrorNoError) { return -1; @@ -264,9 +264,9 @@ int TVMGraphRuntimeGraphAttr_Load(TVMGraphRuntimeGraphAttr* attr, JSONReader* re status = -1; break; } - DLContext ctx = {kDLCPU, 0}; + DLDevice dev = {kDLCPU, 0}; tvm_crt_error_t err = - TVMPlatformMemoryAllocate(TVM_CRT_STRLEN_DLTYPE * num_items, ctx, (void**)&attr->dltype); + TVMPlatformMemoryAllocate(TVM_CRT_STRLEN_DLTYPE * num_items, dev, (void**)&attr->dltype); if (err != kTvmErrorNoError) { fprintf(stderr, "memory allocate error: %08x", err); return -1; @@ -322,9 +322,9 @@ int TVMGraphRuntimeGraphAttr_Load(TVMGraphRuntimeGraphAttr* attr, JSONReader* re status = -1; break; } - DLContext ctx = {kDLCPU, 0}; + DLDevice dev = {kDLCPU, 0}; tvm_crt_error_t err = - TVMPlatformMemoryAllocate(sizeof(uint32_t) * num_items, ctx, (void**)&attr->storage_id); + TVMPlatformMemoryAllocate(sizeof(uint32_t) * num_items, dev, (void**)&attr->storage_id); if (err != kTvmErrorNoError) { fprintf(stderr, "memory allocate error: %08x", err); return -1; @@ -374,15 +374,15 @@ int TVMGraphRuntimeGraphAttr_Load(TVMGraphRuntimeGraphAttr* attr, JSONReader* re status = -1; break; } - DLContext ctx = {kDLCPU, 0}; + DLDevice dev = {kDLCPU, 0}; tvm_crt_error_t err = TVMPlatformMemoryAllocate( - sizeof(int64_t) * TVM_CRT_MAX_NDIM * num_items, ctx, (void**)&attr->shape); + sizeof(int64_t) * TVM_CRT_MAX_NDIM * num_items, dev, (void**)&attr->shape); if (err != kTvmErrorNoError) { fprintf(stderr, "memory allocate error: %08x", err); status = -1; break; } - err = TVMPlatformMemoryAllocate(sizeof(uint32_t) * num_items, ctx, (void**)&attr->ndim); + err = TVMPlatformMemoryAllocate(sizeof(uint32_t) * num_items, dev, (void**)&attr->ndim); if (err != kTvmErrorNoError) { fprintf(stderr, "memory allocate error: %08x", err); status = -1; @@ -449,9 +449,9 @@ int TVMGraphRuntimeGraphAttr_Load(TVMGraphRuntimeGraphAttr* attr, JSONReader* re status = -1; break; } - DLContext ctx = {kDLCPU, 0}; + DLDevice dev = {kDLCPU, 0}; tvm_crt_error_t err = - TVMPlatformMemoryAllocate(sizeof(uint32_t) * num_items, ctx, (void**)&attr->device_index); + TVMPlatformMemoryAllocate(sizeof(uint32_t) * num_items, dev, (void**)&attr->device_index); if (err != kTvmErrorNoError) { fprintf(stderr, "memory allocate error: %08x", err); status = -1; @@ -525,40 +525,40 @@ int TVMGraphRuntimeGraphAttr_Release(TVMGraphRuntimeGraphAttr* attr) { return 0; } if (attr->storage_id) { - DLContext ctx = {kDLCPU, 0}; - tvm_crt_error_t err = TVMPlatformMemoryFree(attr->storage_id, ctx); + 
DLDevice dev = {kDLCPU, 0}; + tvm_crt_error_t err = TVMPlatformMemoryFree(attr->storage_id, dev); attr->storage_id = 0; if (err != kTvmErrorNoError) { return -1; } } if (attr->device_index) { - DLContext ctx = {kDLCPU, 0}; - tvm_crt_error_t err = TVMPlatformMemoryFree(attr->device_index, ctx); + DLDevice dev = {kDLCPU, 0}; + tvm_crt_error_t err = TVMPlatformMemoryFree(attr->device_index, dev); attr->device_index = 0; if (err != kTvmErrorNoError) { return -1; } } if (attr->dltype) { - DLContext ctx = {kDLCPU, 0}; - tvm_crt_error_t err = TVMPlatformMemoryFree(attr->dltype, ctx); + DLDevice dev = {kDLCPU, 0}; + tvm_crt_error_t err = TVMPlatformMemoryFree(attr->dltype, dev); attr->dltype = 0; if (err != kTvmErrorNoError) { return -1; } } if (attr->shape) { - DLContext ctx = {kDLCPU, 0}; - tvm_crt_error_t err = TVMPlatformMemoryFree(attr->shape, ctx); + DLDevice dev = {kDLCPU, 0}; + tvm_crt_error_t err = TVMPlatformMemoryFree(attr->shape, dev); attr->shape = 0; if (err != kTvmErrorNoError) { return -1; } } if (attr->ndim) { - DLContext ctx = {kDLCPU, 0}; - tvm_crt_error_t err = TVMPlatformMemoryFree(attr->ndim, ctx); + DLDevice dev = {kDLCPU, 0}; + tvm_crt_error_t err = TVMPlatformMemoryFree(attr->ndim, dev); attr->ndim = 0; if (err != kTvmErrorNoError) { return -1; @@ -582,8 +582,8 @@ int TVMGraphRuntime_Load(TVMGraphRuntime* runtime, JSONReader* reader) { status = -1; break; } - DLContext ctx = {kDLCPU, 0}; - tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(TVMGraphRuntimeNode) * num_items, ctx, + DLDevice dev = {kDLCPU, 0}; + tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(TVMGraphRuntimeNode) * num_items, dev, (void**)&runtime->nodes); if (err != kTvmErrorNoError) { fprintf(stderr, "memory allocate error: %08x", err); @@ -617,8 +617,8 @@ int TVMGraphRuntime_Load(TVMGraphRuntime* runtime, JSONReader* reader) { status = -1; break; } - DLContext ctx = {kDLCPU, 0}; - tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(uint32_t) * num_items, ctx, + DLDevice dev = {kDLCPU, 0}; + tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(uint32_t) * num_items, dev, (void**)&runtime->input_nodes); if (err != kTvmErrorNoError) { fprintf(stderr, "memory allocate error: %08x", err); @@ -644,8 +644,8 @@ int TVMGraphRuntime_Load(TVMGraphRuntime* runtime, JSONReader* reader) { status = -1; break; } - DLContext ctx = {kDLCPU, 0}; - tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(uint32_t) * num_items, ctx, + DLDevice dev = {kDLCPU, 0}; + tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(uint32_t) * num_items, dev, (void**)&runtime->node_row_ptr); if (err != kTvmErrorNoError) { fprintf(stderr, "memory allocate error: %08x", err); @@ -672,9 +672,9 @@ int TVMGraphRuntime_Load(TVMGraphRuntime* runtime, JSONReader* reader) { status = -1; break; } - DLContext ctx = {kDLCPU, 0}; + DLDevice dev = {kDLCPU, 0}; tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(TVMGraphRuntimeNodeEntry) * num_items, - ctx, (void**)&runtime->outputs); + dev, (void**)&runtime->outputs); if (err != kTvmErrorNoError) { fprintf(stderr, "memory allocate error: %08x", err); status = -1; @@ -788,9 +788,9 @@ int TVMGraphRuntime_LoadParams(TVMGraphRuntime* runtime, const char* param_blob, // read names char* names = NULL; - DLContext ctx = {kDLCPU, 0}; + DLDevice dev = {kDLCPU, 0}; tvm_crt_error_t err = - TVMPlatformMemoryAllocate(TVM_CRT_STRLEN_NAME * runtime->nodes_count, ctx, (void**)&names); + TVMPlatformMemoryAllocate(TVM_CRT_STRLEN_NAME * runtime->nodes_count, dev, (void**)&names); if (err != 
kTvmErrorNoError) { fprintf(stderr, "memory allocate error: %08x", err); status = -1; @@ -835,14 +835,14 @@ int TVMGraphRuntime_LoadParams(TVMGraphRuntime* runtime, const char* param_blob, } if (runtime->data_entry[eid].dl_tensor.shape) { - err = TVMPlatformMemoryFree(runtime->data_entry[eid].dl_tensor.shape, ctx); + err = TVMPlatformMemoryFree(runtime->data_entry[eid].dl_tensor.shape, dev); if (err != kTvmErrorNoError) { status = -1; } runtime->data_entry[eid].dl_tensor.shape = 0; } if (runtime->data_entry[eid].dl_tensor.data) { - err = TVMPlatformMemoryFree(runtime->data_entry[eid].dl_tensor.data, ctx); + err = TVMPlatformMemoryFree(runtime->data_entry[eid].dl_tensor.data, dev); if (err != kTvmErrorNoError) { status = -1; } @@ -858,7 +858,7 @@ int TVMGraphRuntime_LoadParams(TVMGraphRuntime* runtime, const char* param_blob, } // Release memory - err = TVMPlatformMemoryFree(names, ctx); + err = TVMPlatformMemoryFree(names, dev); if (err != kTvmErrorNoError) { status = -1; return status; @@ -926,9 +926,9 @@ int TVMGraphRuntime_SetupStorage(TVMGraphRuntime* runtime) { // Grab saved optimization plan from graph. TVMGraphRuntimeGraphAttr* attrs = &(runtime->attrs); DLDataType* vtype = NULL; - DLContext alloc_ctx = {kDLCPU, 0}; + DLDevice alloc_dev = {kDLCPU, 0}; tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(DLDataType) * attrs->dltype_count, - alloc_ctx, (void**)&vtype); + alloc_dev, (void**)&vtype); if (err != kTvmErrorNoError) { fprintf(stderr, "memory allocate error: %08x", err); return -1; @@ -940,7 +940,7 @@ int TVMGraphRuntime_SetupStorage(TVMGraphRuntime* runtime) { // Size and device type of each storage pool entry. TVMGraphRuntimePoolEntry* pool_entry = NULL; err = TVMPlatformMemoryAllocate(sizeof(TVMGraphRuntimePoolEntry) * runtime->nodes_count, - alloc_ctx, (void**)&pool_entry); + alloc_dev, (void**)&pool_entry); if (err != kTvmErrorNoError) { fprintf(stderr, "memory allocate error: %08x", err); return -1; @@ -951,7 +951,7 @@ int TVMGraphRuntime_SetupStorage(TVMGraphRuntime* runtime) { for (idx = 0; idx < attrs->shape_count; idx++) { int storage_id = attrs->storage_id[idx]; // Use the fallback device if no device index is available. - int device_type = runtime->ctxs[0].device_type; + int device_type = runtime->devices[0].device_type; uint32_t size = Shape_Accumulate(attrs->shape + idx * TVM_CRT_MAX_NDIM, attrs->ndim[idx]); DLDataType t = vtype[idx]; uint32_t bits = t.bits * t.lanes; @@ -967,7 +967,7 @@ int TVMGraphRuntime_SetupStorage(TVMGraphRuntime* runtime) { } // Allocate the space. 
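Every allocation in these graph-runtime hunks follows one pairing, which the rename leaves structurally untouched: the CRT manages host memory only, so the DLDevice argument is always {kDLCPU, 0}. A minimal standalone sketch of that pairing (the function name is hypothetical; the signatures match the platform hooks shown in main.cc further below):

#include <dlpack/dlpack.h>
#include <tvm/runtime/crt/error_codes.h>
#include <tvm/runtime/crt/platform.h>

// Hedged sketch of the CRT allocate/use/free pairing after the rename.
tvm_crt_error_t AllocUseFree(size_t nbytes) {
  DLDevice dev = {kDLCPU, 0};
  void* ptr = NULL;
  tvm_crt_error_t err = TVMPlatformMemoryAllocate(nbytes, dev, &ptr);
  if (err != kTvmErrorNoError) {
    return err;
  }
  /* ... use ptr ... */
  return TVMPlatformMemoryFree(ptr, dev);
}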
- err = TVMPlatformMemoryAllocate(sizeof(TVMGraphRuntimeStorageEntry) * pool_entry_count, alloc_ctx, + err = TVMPlatformMemoryAllocate(sizeof(TVMGraphRuntimeStorageEntry) * pool_entry_count, alloc_dev, (void**)&runtime->storage_pool); if (err != kTvmErrorNoError) { fprintf(stderr, "memory allocate error: %08x", err); @@ -975,7 +975,7 @@ int TVMGraphRuntime_SetupStorage(TVMGraphRuntime* runtime) { } for (idx = 0; idx < pool_entry_count; idx++) { TVMGraphRuntimePoolEntry pit = pool_entry[idx]; - TVMContext ctx = runtime->ctxs[0]; + DLDevice dev = runtime->devices[0]; uint8_t did_find_linked_param = 0; if (lookup_linked_param_valid) { lookup_linked_param.args.values[0].v_int64 = idx; @@ -986,7 +986,7 @@ int TVMGraphRuntime_SetupStorage(TVMGraphRuntime* runtime) { runtime->storage_pool[runtime->storage_pool_count].is_linked_param = 1; DLTensor* tensor = &runtime->storage_pool[runtime->storage_pool_count].array.dl_tensor; tensor->data = linked_param_data; - tensor->ctx = ctx; + tensor->device = dev; tensor->ndim = attrs->ndim[pit.entry_id]; tensor->shape = attrs->shape + idx * TVM_CRT_MAX_NDIM; tensor->strides = NULL; @@ -1000,7 +1000,7 @@ int TVMGraphRuntime_SetupStorage(TVMGraphRuntime* runtime) { 0, }; shape[0] = (pit.size + 3) / 4; - int status = TVMNDArray_Empty(1, shape, dtype, ctx, + int status = TVMNDArray_Empty(1, shape, dtype, dev, &runtime->storage_pool[runtime->storage_pool_count].array); CHECK_EQ(status, 0, "fail to create storage_pool with idx=%d\n", idx); } @@ -1011,7 +1011,7 @@ int TVMGraphRuntime_SetupStorage(TVMGraphRuntime* runtime) { // memory assignment for each node entry. The allocated memory on each device // is mapped to this pool. runtime->data_entry_count = runtime->node_row_ptr[runtime->node_row_ptr_count - 1]; - err = TVMPlatformMemoryAllocate(sizeof(TVMNDArray) * runtime->data_entry_count, alloc_ctx, + err = TVMPlatformMemoryAllocate(sizeof(TVMNDArray) * runtime->data_entry_count, alloc_dev, (void**)&runtime->data_entry); if (err != kTvmErrorNoError) { fprintf(stderr, "memory allocate error: %08x", err); @@ -1027,13 +1027,13 @@ int TVMGraphRuntime_SetupStorage(TVMGraphRuntime* runtime) { } // Release memory - err = TVMPlatformMemoryFree(vtype, alloc_ctx); + err = TVMPlatformMemoryFree(vtype, alloc_dev); if (err != kTvmErrorNoError) { fprintf(stderr, "memory free error: %08x", err); return err; } - err = TVMPlatformMemoryFree(pool_entry, alloc_ctx); + err = TVMPlatformMemoryFree(pool_entry, alloc_dev); if (err != kTvmErrorNoError) { fprintf(stderr, "memory free error: %08x", err); return -1; @@ -1046,9 +1046,9 @@ int TVMGraphRuntime_SetupOpExecs(TVMGraphRuntime* runtime) { int status = 0; uint32_t nid, idx; runtime->op_execs_count = runtime->nodes_count; - DLContext ctx = {kDLCPU, 0}; + DLDevice dev = {kDLCPU, 0}; tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(TVMPackedFunc) * runtime->op_execs_count, - ctx, (void**)&runtime->op_execs); + dev, (void**)&runtime->op_execs); if (err != kTvmErrorNoError) { fprintf(stderr, "memory allocate error: %08x", err); status = -1; @@ -1143,16 +1143,16 @@ int32_t TVMGraphRuntime_CreateTVMOp(TVMGraphRuntime* runtime, const TVMOpParam* } /*! - * \brief Initialize the graph executor with graph and context. + * \brief Initialize the graph executor with graph and device. * \param graph_json The execution graph. * \param module_handle The module containing the compiled functions for the host * processor. 
- * \param ctxs The context of the host and devices where graph nodes will be + * \param devs The device of the host and devices where graph nodes will be * executed on. * \return 0 on success. */ int TVMGraphRuntime_Init(TVMGraphRuntime* runtime, const char* graph_json, - TVMModuleHandle module_handle, const TVMContext* ctxs) { + TVMModuleHandle module_handle, const DLDevice* devs) { JSONReader reader; tvm_crt_error_t err = JSONReader_Create(graph_json, &reader); if (err != kTvmErrorNoError) { @@ -1165,7 +1165,7 @@ int TVMGraphRuntime_Init(TVMGraphRuntime* runtime, const char* graph_json, return -1; } runtime->module_handle = module_handle; - runtime->ctxs[0] = ctxs[0]; + runtime->devices[0] = devs[0]; int status; status = TVMGraphRuntime_SetupStorage(runtime); @@ -1185,9 +1185,9 @@ int TVMGraphRuntime_Init(TVMGraphRuntime* runtime, const char* graph_json, } int TVMGraphRuntime_Create(const char* sym_json, TVMModuleHandle module_handle, - const TVMContext* ctxs, TVMGraphRuntime** runtime) { - DLContext ctx = {kDLCPU, 0}; - tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(TVMGraphRuntime), ctx, (void**)runtime); + const DLDevice* devs, TVMGraphRuntime** runtime) { + DLDevice dev = {kDLCPU, 0}; + tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(TVMGraphRuntime), dev, (void**)runtime); if (err != kTvmErrorNoError) { fprintf(stderr, "memory allocate error: %08x", err); return -1; @@ -1195,7 +1195,7 @@ int TVMGraphRuntime_Create(const char* sym_json, TVMModuleHandle module_handle, memset(*runtime, 0, sizeof(TVMGraphRuntime)); // init - return TVMGraphRuntime_Init(*runtime, sym_json, module_handle, ctxs); + return TVMGraphRuntime_Init(*runtime, sym_json, module_handle, devs); } int TVMGraphRuntime_Release(TVMGraphRuntime** pptr) { @@ -1208,8 +1208,8 @@ int TVMGraphRuntime_Release(TVMGraphRuntime** pptr) { return status; } } - DLContext ctx = {kDLCPU, 0}; - status = TVMPlatformMemoryFree(runtime->nodes, ctx); + DLDevice dev = {kDLCPU, 0}; + status = TVMPlatformMemoryFree(runtime->nodes, dev); if (status != 0) { return status; } @@ -1226,42 +1226,42 @@ int TVMGraphRuntime_Release(TVMGraphRuntime** pptr) { } } for (idx = 0; idx < runtime->data_entry_count; ++idx) { - status = TVMPlatformMemoryFree(runtime->data_entry[idx].dl_tensor.shape, ctx); + status = TVMPlatformMemoryFree(runtime->data_entry[idx].dl_tensor.shape, dev); if (status != 0) { return status; } } - status = TVMPlatformMemoryFree(runtime->input_nodes, ctx); + status = TVMPlatformMemoryFree(runtime->input_nodes, dev); if (status != 0) { return status; } - status = TVMPlatformMemoryFree(runtime->node_row_ptr, ctx); + status = TVMPlatformMemoryFree(runtime->node_row_ptr, dev); if (status != 0) { return status; } - status = TVMPlatformMemoryFree(runtime->outputs, ctx); + status = TVMPlatformMemoryFree(runtime->outputs, dev); if (status != 0) { return status; } - status = TVMPlatformMemoryFree(runtime->storage_pool, ctx); + status = TVMPlatformMemoryFree(runtime->storage_pool, dev); if (status != 0) { return status; } - status = TVMPlatformMemoryFree(runtime->data_entry, ctx); + status = TVMPlatformMemoryFree(runtime->data_entry, dev); if (status != 0) { return status; } - status = TVMPlatformMemoryFree(runtime->op_execs, ctx); + status = TVMPlatformMemoryFree(runtime->op_execs, dev); if (status != 0) { return status; } - status = TVMPlatformMemoryFree(*pptr, ctx); + status = TVMPlatformMemoryFree(*pptr, dev); if (status != 0) { return status; } if (g_fexecs) { - status = TVMPlatformMemoryFree(g_fexecs, ctx); + status = 
TVMPlatformMemoryFree(g_fexecs, dev); g_fexecs = 0; if (status != 0) { return status; diff --git a/src/runtime/crt/graph_runtime/load_json.c b/src/runtime/crt/graph_runtime/load_json.c index 3d1fb601a355c..65dcb3eccb445 100644 --- a/src/runtime/crt/graph_runtime/load_json.c +++ b/src/runtime/crt/graph_runtime/load_json.c @@ -85,15 +85,15 @@ void SeqPop(Seq* seq) { } tvm_crt_error_t SeqCreate(uint64_t len, Seq** seq) { - DLContext ctx = {kDLCPU, 0}; - tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(Seq), ctx, (void**)seq); + DLDevice dev = {kDLCPU, 0}; + tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(Seq), dev, (void**)seq); if (err != kTvmErrorNoError) { return err; } memset(*seq, 0, sizeof(Seq)); (*seq)->allocated = len; - err = TVMPlatformMemoryAllocate(sizeof(uint32_t) * len, ctx, (void**)&(*seq)->data); + err = TVMPlatformMemoryAllocate(sizeof(uint32_t) * len, dev, (void**)&(*seq)->data); if (err != kTvmErrorNoError) { return err; } @@ -104,12 +104,12 @@ tvm_crt_error_t SeqCreate(uint64_t len, Seq** seq) { } tvm_crt_error_t SeqRelease(Seq* seq) { - DLContext ctx = {kDLCPU, 0}; - tvm_crt_error_t err = TVMPlatformMemoryFree(seq->data, ctx); + DLDevice dev = {kDLCPU, 0}; + tvm_crt_error_t err = TVMPlatformMemoryFree(seq->data, dev); if (err != kTvmErrorNoError) { return err; } - return TVMPlatformMemoryFree(seq, ctx); + return TVMPlatformMemoryFree(seq, dev); } // implementations of JSONReader @@ -472,8 +472,8 @@ tvm_crt_error_t JSONReader_Create(const char* is, JSONReader* reader) { reader->NextObjectItem = JSONReader_NextObjectItem; reader->ArrayLength = JSONReader_ArrayLength; - DLContext ctx = {kDLCPU, 0}; - err = TVMPlatformMemoryAllocate(strlen(is) + 1, ctx, (void**)&reader->is_); + DLDevice dev = {kDLCPU, 0}; + err = TVMPlatformMemoryAllocate(strlen(is) + 1, dev, (void**)&reader->is_); if (err != kTvmErrorNoError) { return err; } @@ -490,6 +490,6 @@ tvm_crt_error_t JSONReader_Release(JSONReader* reader) { return err; } - DLContext ctx = {kDLCPU, 0}; - return TVMPlatformMemoryFree(reader->is_, ctx); + DLDevice dev = {kDLCPU, 0}; + return TVMPlatformMemoryFree(reader->is_, dev); } diff --git a/src/runtime/crt/graph_runtime_module/graph_runtime_module.c b/src/runtime/crt/graph_runtime_module/graph_runtime_module.c index 8f479e9108f31..4a61b89528adb 100644 --- a/src/runtime/crt/graph_runtime_module/graph_runtime_module.c +++ b/src/runtime/crt/graph_runtime_module/graph_runtime_module.c @@ -57,9 +57,9 @@ int32_t TVMGraphRuntimeModule_Create(TVMValue* args, int* tcodes, int nargs, TVM return kTvmErrorGraphModuleBadContext; } - TVMContext ctx = {(DLDeviceType)args[2].v_int64, (int)args[3].v_int64}; + DLDevice dev = {(DLDeviceType)args[2].v_int64, (int)args[3].v_int64}; int ret_value = - TVMGraphRuntime_Create(args[0].v_str, args[1].v_handle, &ctx, &graph_runtime.runtime); + TVMGraphRuntime_Create(args[0].v_str, args[1].v_handle, &dev, &graph_runtime.runtime); if (ret_value != 0) { return ret_value; } diff --git a/src/runtime/crt/host/main.cc b/src/runtime/crt/host/main.cc index bf36deacb938c..15e696b59f46a 100644 --- a/src/runtime/crt/host/main.cc +++ b/src/runtime/crt/host/main.cc @@ -61,12 +61,12 @@ void TVMPlatformAbort(tvm_crt_error_t error_code) { MemoryManagerInterface* memory_manager; -tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLContext ctx, void** out_ptr) { - return memory_manager->Allocate(memory_manager, num_bytes, ctx, out_ptr); +tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void** out_ptr) { + return 
memory_manager->Allocate(memory_manager, num_bytes, dev, out_ptr); } -tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLContext ctx) { - return memory_manager->Free(memory_manager, ptr, ctx); +tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLDevice dev) { + return memory_manager->Free(memory_manager, ptr, dev); } steady_clock::time_point g_utvm_start_time; diff --git a/src/runtime/crt/include/tvm/runtime/crt/internal/common/ndarray.h b/src/runtime/crt/include/tvm/runtime/crt/internal/common/ndarray.h index bfe83b59b5bc3..f878477e7b421 100644 --- a/src/runtime/crt/include/tvm/runtime/crt/internal/common/ndarray.h +++ b/src/runtime/crt/include/tvm/runtime/crt/internal/common/ndarray.h @@ -41,10 +41,10 @@ typedef struct TVMNDArray { DLTensor dl_tensor; } TVMNDArray; -int TVMNDArray_Create(int32_t ndim, const tvm_index_t* shape, DLDataType dtype, DLContext ctx, +int TVMNDArray_Create(int32_t ndim, const tvm_index_t* shape, DLDataType dtype, DLDevice dev, TVMNDArray* array); -int TVMNDArray_Empty(int32_t ndim, const tvm_index_t* shape, DLDataType dtype, DLContext ctx, +int TVMNDArray_Empty(int32_t ndim, const tvm_index_t* shape, DLDataType dtype, DLDevice dev, TVMNDArray* array); int TVMNDArray_Load(TVMNDArray* ret, const char** strm); diff --git a/src/runtime/crt/include/tvm/runtime/crt/internal/graph_runtime/graph_runtime.h b/src/runtime/crt/include/tvm/runtime/crt/internal/graph_runtime/graph_runtime.h index 8e0faaa4f1992..a64076ab61e00 100644 --- a/src/runtime/crt/include/tvm/runtime/crt/internal/graph_runtime/graph_runtime.h +++ b/src/runtime/crt/include/tvm/runtime/crt/internal/graph_runtime/graph_runtime.h @@ -91,8 +91,8 @@ typedef struct TVMGraphRuntime { /*! \brief The code module that contains both host and device code. */ TVMModuleHandle module_handle; /*! \brief Execution context of all devices including the host. */ - TVMContext ctxs[1]; - uint32_t ctxs_count; + DLDevice devices[1]; + uint32_t devices_count; /*! \brief Common storage pool for all devices. */ TVMGraphRuntimeStorageEntry* storage_pool; uint32_t storage_pool_count; diff --git a/src/runtime/crt/memory/memory.c b/src/runtime/crt/memory/memory.c index 105d07870842a..ed18544c2181f 100644 --- a/src/runtime/crt/memory/memory.c +++ b/src/runtime/crt/memory/memory.c @@ -124,7 +124,7 @@ void MultiMap_Insert(struct MultiMap* map, uint32_t npage, Page* p) { * \return The virtual address */ tvm_crt_error_t MemoryManager_Allocate(MemoryManagerInterface* interface, size_t num_bytes, - DLContext ctx, void** out_ptr) { + DLDevice dev, void** out_ptr) { MemoryManager* mgr = (MemoryManager*)interface; *out_ptr = 0; @@ -256,10 +256,10 @@ tvm_crt_error_t MemoryManager_Realloc(MemoryManagerInterface* interface, void** * \brief Free the memory. * \param interface Pointer to this structure. * \param ptr A pointer returned from TVMPlatformMemoryAllocate which should be free'd. - * \param ctx Execution context passed to TVMPlatformMemoryAllocate. Fixed to {kDLCPU, 0}. + * \param dev Execution device passed to TVMPlatformMemoryAllocate. Fixed to {kDLCPU, 0}. * \return kTvmErrorNoError if successful; a descriptive error code otherwise. 
*/ -tvm_crt_error_t MemoryManager_Free(MemoryManagerInterface* interface, void* ptr, DLContext ctx) { +tvm_crt_error_t MemoryManager_Free(MemoryManagerInterface* interface, void* ptr, DLDevice dev) { MemoryManager* mgr = (MemoryManager*)interface; TLB* pmap = &(mgr->pmap); diff --git a/src/runtime/crt/utvm_rpc_server/rpc_server.cc b/src/runtime/crt/utvm_rpc_server/rpc_server.cc index 0b9e96cd660fd..8b7c0eb01840a 100644 --- a/src/runtime/crt/utvm_rpc_server/rpc_server.cc +++ b/src/runtime/crt/utvm_rpc_server/rpc_server.cc @@ -208,15 +208,15 @@ utvm_rpc_server_t UTvmRpcServerInit(utvm_rpc_channel_write_t write_func, void* w TVMPlatformAbort(err); } - DLContext ctx = {kDLCPU, 0}; + DLDevice dev = {kDLCPU, 0}; void* receive_buffer_memory; - err = TVMPlatformMemoryAllocate(TVM_CRT_MAX_PACKET_SIZE_BYTES, ctx, &receive_buffer_memory); + err = TVMPlatformMemoryAllocate(TVM_CRT_MAX_PACKET_SIZE_BYTES, dev, &receive_buffer_memory); if (err != kTvmErrorNoError) { TVMPlatformAbort(err); } auto receive_buffer = new (receive_buffer_memory) uint8_t[TVM_CRT_MAX_PACKET_SIZE_BYTES]; void* rpc_server_memory; - err = TVMPlatformMemoryAllocate(sizeof(tvm::runtime::micro_rpc::MicroRPCServer), ctx, + err = TVMPlatformMemoryAllocate(sizeof(tvm::runtime::micro_rpc::MicroRPCServer), dev, &rpc_server_memory); if (err != kTvmErrorNoError) { TVMPlatformAbort(err); diff --git a/src/runtime/cuda/cuda_device_api.cc b/src/runtime/cuda/cuda_device_api.cc index f156d68d283ea..5d6ff2b263c2a 100644 --- a/src/runtime/cuda/cuda_device_api.cc +++ b/src/runtime/cuda/cuda_device_api.cc @@ -37,56 +37,56 @@ namespace runtime { class CUDADeviceAPI final : public DeviceAPI { public: - void SetDevice(TVMContext ctx) final { CUDA_CALL(cudaSetDevice(ctx.device_id)); } - void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final { + void SetDevice(Device dev) final { CUDA_CALL(cudaSetDevice(dev.device_id)); } + void GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) final { int value = 0; switch (kind) { case kExist: - value = (cudaDeviceGetAttribute(&value, cudaDevAttrMaxThreadsPerBlock, ctx.device_id) == + value = (cudaDeviceGetAttribute(&value, cudaDevAttrMaxThreadsPerBlock, dev.device_id) == cudaSuccess); break; case kMaxThreadsPerBlock: { - CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrMaxThreadsPerBlock, ctx.device_id)); + CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrMaxThreadsPerBlock, dev.device_id)); break; } case kWarpSize: { - CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrWarpSize, ctx.device_id)); + CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrWarpSize, dev.device_id)); break; } case kMaxSharedMemoryPerBlock: { CUDA_CALL( - cudaDeviceGetAttribute(&value, cudaDevAttrMaxSharedMemoryPerBlock, ctx.device_id)); + cudaDeviceGetAttribute(&value, cudaDevAttrMaxSharedMemoryPerBlock, dev.device_id)); break; } case kComputeVersion: { std::ostringstream os; - CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrComputeCapabilityMajor, ctx.device_id)); + CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrComputeCapabilityMajor, dev.device_id)); os << value << "."; - CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrComputeCapabilityMinor, ctx.device_id)); + CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrComputeCapabilityMinor, dev.device_id)); os << value; *rv = os.str(); return; } case kDeviceName: { std::string name(256, 0); - CUDA_DRIVER_CALL(cuDeviceGetName(&name[0], name.size(), ctx.device_id)); + CUDA_DRIVER_CALL(cuDeviceGetName(&name[0], name.size(), dev.device_id)); 
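The GetAttr switch above keeps every DeviceAttrKind key (kExist, kMaxThreadsPerBlock, ...); only the handle parameter's type changes. A minimal query sketch under that assumption (function name hypothetical):

#include <tvm/runtime/device_api.h>

// Hedged sketch: query one attribute through the renamed interface.
int MaxThreadsPerBlock(DLDevice dev) {
  tvm::runtime::TVMRetValue rv;
  tvm::runtime::DeviceAPI::Get(dev)->GetAttr(dev, tvm::runtime::kMaxThreadsPerBlock, &rv);
  return rv;  // TVMRetValue converts to int for integer-valued attributes
}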
name.resize(strlen(name.c_str())); *rv = std::move(name); return; } case kMaxClockRate: { - CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrClockRate, ctx.device_id)); + CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrClockRate, dev.device_id)); break; } case kMultiProcessorCount: { - CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrMultiProcessorCount, ctx.device_id)); + CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrMultiProcessorCount, dev.device_id)); break; } case kMaxThreadDimensions: { int dims[3]; - CUDA_CALL(cudaDeviceGetAttribute(&dims[0], cudaDevAttrMaxBlockDimX, ctx.device_id)); - CUDA_CALL(cudaDeviceGetAttribute(&dims[1], cudaDevAttrMaxBlockDimY, ctx.device_id)); - CUDA_CALL(cudaDeviceGetAttribute(&dims[2], cudaDevAttrMaxBlockDimZ, ctx.device_id)); + CUDA_CALL(cudaDeviceGetAttribute(&dims[0], cudaDevAttrMaxBlockDimX, dev.device_id)); + CUDA_CALL(cudaDeviceGetAttribute(&dims[1], cudaDevAttrMaxBlockDimY, dev.device_id)); + CUDA_CALL(cudaDeviceGetAttribute(&dims[2], cudaDevAttrMaxBlockDimZ, dev.device_id)); std::stringstream ss; // use json string to return multiple int values; ss << "[" << dims[0] << ", " << dims[1] << ", " << dims[2] << "]"; @@ -94,7 +94,7 @@ class CUDADeviceAPI final : public DeviceAPI { return; } case kMaxRegistersPerBlock: { - CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrMaxRegistersPerBlock, ctx.device_id)); + CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrMaxRegistersPerBlock, dev.device_id)); break; } case kGcnArch: @@ -106,62 +106,61 @@ class CUDADeviceAPI final : public DeviceAPI { } *rv = value; } - void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, - DLDataType type_hint) final { + void* AllocDataSpace(Device dev, size_t nbytes, size_t alignment, DLDataType type_hint) final { ICHECK_EQ(256 % alignment, 0U) << "CUDA space is aligned at 256 bytes"; void* ret; - if (ctx.device_type == kDLCPUPinned) { + if (dev.device_type == kDLCPUPinned) { CUDA_CALL(cudaMallocHost(&ret, nbytes)); } else { - CUDA_CALL(cudaSetDevice(ctx.device_id)); + CUDA_CALL(cudaSetDevice(dev.device_id)); CUDA_CALL(cudaMalloc(&ret, nbytes)); } return ret; } - void FreeDataSpace(TVMContext ctx, void* ptr) final { - if (ctx.device_type == kDLCPUPinned) { + void FreeDataSpace(Device dev, void* ptr) final { + if (dev.device_type == kDLCPUPinned) { CUDA_CALL(cudaFreeHost(ptr)); } else { - CUDA_CALL(cudaSetDevice(ctx.device_id)); + CUDA_CALL(cudaSetDevice(dev.device_id)); CUDA_CALL(cudaFree(ptr)); } } protected: void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, - TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, + Device dev_from, Device dev_to, DLDataType type_hint, TVMStreamHandle stream) final { cudaStream_t cu_stream = static_cast(stream); from = static_cast(from) + from_offset; to = static_cast(to) + to_offset; - if (ctx_from.device_type == kDLCPUPinned) { - ctx_from.device_type = kDLCPU; + if (dev_from.device_type == kDLCPUPinned) { + dev_from.device_type = kDLCPU; } - if (ctx_to.device_type == kDLCPUPinned) { - ctx_to.device_type = kDLCPU; + if (dev_to.device_type == kDLCPUPinned) { + dev_to.device_type = kDLCPU; } // In case there is a copy from host mem to host mem */ - if (ctx_to.device_type == kDLCPU && ctx_from.device_type == kDLCPU) { + if (dev_to.device_type == kDLCPU && dev_from.device_type == kDLCPU) { memcpy(to, from, size); return; } - if (ctx_from.device_type == kDLGPU && ctx_to.device_type == kDLGPU) { - CUDA_CALL(cudaSetDevice(ctx_from.device_id)); - if 
(ctx_from.device_id == ctx_to.device_id) { + if (dev_from.device_type == kDLGPU && dev_to.device_type == kDLGPU) { + CUDA_CALL(cudaSetDevice(dev_from.device_id)); + if (dev_from.device_id == dev_to.device_id) { GPUCopy(from, to, size, cudaMemcpyDeviceToDevice, cu_stream); } else { - cudaMemcpyPeerAsync(to, ctx_to.device_id, from, ctx_from.device_id, size, cu_stream); + cudaMemcpyPeerAsync(to, dev_to.device_id, from, dev_from.device_id, size, cu_stream); } - } else if (ctx_from.device_type == kDLGPU && ctx_to.device_type == kDLCPU) { - CUDA_CALL(cudaSetDevice(ctx_from.device_id)); + } else if (dev_from.device_type == kDLGPU && dev_to.device_type == kDLCPU) { + CUDA_CALL(cudaSetDevice(dev_from.device_id)); GPUCopy(from, to, size, cudaMemcpyDeviceToHost, cu_stream); - } else if (ctx_from.device_type == kDLCPU && ctx_to.device_type == kDLGPU) { - CUDA_CALL(cudaSetDevice(ctx_to.device_id)); + } else if (dev_from.device_type == kDLCPU && dev_to.device_type == kDLGPU) { + CUDA_CALL(cudaSetDevice(dev_to.device_id)); GPUCopy(from, to, size, cudaMemcpyHostToDevice, cu_stream); } else { LOG(FATAL) << "expect copy from/to GPU or between GPU"; @@ -169,21 +168,21 @@ class CUDADeviceAPI final : public DeviceAPI { } public: - TVMStreamHandle CreateStream(TVMContext ctx) { - CUDA_CALL(cudaSetDevice(ctx.device_id)); + TVMStreamHandle CreateStream(Device dev) { + CUDA_CALL(cudaSetDevice(dev.device_id)); cudaStream_t retval; CUDA_CALL(cudaStreamCreate(&retval)); return static_cast(retval); } - void FreeStream(TVMContext ctx, TVMStreamHandle stream) { - CUDA_CALL(cudaSetDevice(ctx.device_id)); + void FreeStream(Device dev, TVMStreamHandle stream) { + CUDA_CALL(cudaSetDevice(dev.device_id)); cudaStream_t cu_stream = static_cast(stream); CUDA_CALL(cudaStreamDestroy(cu_stream)); } - void SyncStreamFromTo(TVMContext ctx, TVMStreamHandle event_src, TVMStreamHandle event_dst) { - CUDA_CALL(cudaSetDevice(ctx.device_id)); + void SyncStreamFromTo(Device dev, TVMStreamHandle event_src, TVMStreamHandle event_dst) { + CUDA_CALL(cudaSetDevice(dev.device_id)); cudaStream_t src_stream = static_cast(event_src); cudaStream_t dst_stream = static_cast(event_dst); cudaEvent_t evt; @@ -193,21 +192,21 @@ class CUDADeviceAPI final : public DeviceAPI { CUDA_CALL(cudaEventDestroy(evt)); } - void StreamSync(TVMContext ctx, TVMStreamHandle stream) final { - CUDA_CALL(cudaSetDevice(ctx.device_id)); + void StreamSync(Device dev, TVMStreamHandle stream) final { + CUDA_CALL(cudaSetDevice(dev.device_id)); CUDA_CALL(cudaStreamSynchronize(static_cast(stream))); } - void SetStream(TVMContext ctx, TVMStreamHandle stream) final { + void SetStream(Device dev, TVMStreamHandle stream) final { CUDAThreadEntry::ThreadLocal()->stream = static_cast(stream); } - void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final { - return CUDAThreadEntry::ThreadLocal()->pool.AllocWorkspace(ctx, size); + void* AllocWorkspace(Device dev, size_t size, DLDataType type_hint) final { + return CUDAThreadEntry::ThreadLocal()->pool.AllocWorkspace(dev, size); } - void FreeWorkspace(TVMContext ctx, void* data) final { - CUDAThreadEntry::ThreadLocal()->pool.FreeWorkspace(ctx, data); + void FreeWorkspace(Device dev, void* data) final { + CUDAThreadEntry::ThreadLocal()->pool.FreeWorkspace(dev, data); } static CUDADeviceAPI* Global() { @@ -275,7 +274,7 @@ class GPUTimerNode : public TimerNode { TVM_REGISTER_OBJECT_TYPE(GPUTimerNode); -TVM_REGISTER_GLOBAL("profiling.timer.gpu").set_body_typed([](TVMContext ctx) { 
+TVM_REGISTER_GLOBAL("profiling.timer.gpu").set_body_typed([](Device dev) { return Timer(make_object()); }); diff --git a/src/runtime/graph/cuda_graph/graph_runtime_cuda_graph.cc b/src/runtime/graph/cuda_graph/graph_runtime_cuda_graph.cc index ee5e50a3b9d4b..3ad8453c4903b 100644 --- a/src/runtime/graph/cuda_graph/graph_runtime_cuda_graph.cc +++ b/src/runtime/graph/cuda_graph/graph_runtime_cuda_graph.cc @@ -45,10 +45,10 @@ class GraphRuntimeCudaGraph : public GraphRuntime { * \brief Begin CUDA graph capture on stream, the stream enters capture mode. */ void StartCapture() { - const TVMContext& ctx = data_entry_[entry_id(0, 0)]->ctx; + const Device& dev = data_entry_[entry_id(0, 0)]->device; - TVMStreamCreate(ctx.device_type, ctx.device_id, &capture_stream_); - TVMSetStream(ctx.device_type, ctx.device_id, capture_stream_); + TVMStreamCreate(dev.device_type, dev.device_id, &capture_stream_); + TVMSetStream(dev.device_type, dev.device_id, capture_stream_); CUDA_CALL(cudaStreamBeginCapture(static_cast(capture_stream_), cudaStreamCaptureModeGlobal)); @@ -109,10 +109,10 @@ PackedFunc GraphRuntimeCudaGraph::GetFunction(const std::string& name, } Module GraphRuntimeCudaGraphCreate(const std::string& sym_json, const tvm::runtime::Module& m, - const std::vector& ctxs, + const std::vector& devs, PackedFunc lookup_linked_param_func) { auto exec = make_object(); - exec->Init(sym_json, m, ctxs, lookup_linked_param_func); + exec->Init(sym_json, m, devs, lookup_linked_param_func); return Module(exec); } @@ -122,13 +122,13 @@ TVM_REGISTER_GLOBAL("tvm.graph_runtime_cuda_graph.create") "at least 4, but it has " << args.num_args; PackedFunc lookup_linked_param_func; - int ctx_start_arg = 2; + int dev_start_arg = 2; if (args[2].type_code() == kTVMPackedFuncHandle) { lookup_linked_param_func = args[2]; - ctx_start_arg++; + dev_start_arg++; } - *rv = GraphRuntimeCudaGraphCreate(args[0], args[1], GetAllContext(args, ctx_start_arg), + *rv = GraphRuntimeCudaGraphCreate(args[0], args[1], GetAllDevice(args, dev_start_arg), lookup_linked_param_func); }); } // namespace runtime diff --git a/src/runtime/graph/debug/graph_runtime_debug.cc b/src/runtime/graph/debug/graph_runtime_debug.cc index 0e3003aa42c33..fedaf4f890bcd 100644 --- a/src/runtime/graph/debug/graph_runtime_debug.cc +++ b/src/runtime/graph/debug/graph_runtime_debug.cc @@ -133,7 +133,7 @@ class GraphRuntimeDebug : public GraphRuntime { return 0; } - const TVMContext& ctx = data_entry_[entry_id(index, 0)]->ctx; + const Device& dev = data_entry_[entry_id(index, 0)]->device; TVMOpParam param = nodes_[index].param; std::string name = param.func_name; uint32_t num_inputs = param.num_inputs; @@ -141,8 +141,8 @@ class GraphRuntimeDebug : public GraphRuntime { PackedFunc time_eval = runtime::Registry::Get("runtime.RPCTimeEvaluator") -> - operator()(module_, name, static_cast(ctx.device_type), - ctx.device_id, number, repeat, min_repeat_ms, ""); + operator()(module_, name, static_cast(dev.device_type), + dev.device_id, number, repeat, min_repeat_ms, ""); int num_flat_args = num_inputs + num_outputs; std::unique_ptr values(new TVMValue[num_flat_args]); @@ -171,8 +171,8 @@ class GraphRuntimeDebug : public GraphRuntime { } Timer RunOpHost(int index) { - const TVMContext& ctx = data_entry_[entry_id(index, 0)]->ctx; - Timer t = Timer::Start(ctx); + const Device& dev = data_entry_[entry_id(index, 0)]->device; + Timer t = Timer::Start(dev); op_execs_[index](); t->Stop(); return t; @@ -269,13 +269,13 @@ PackedFunc GraphRuntimeDebug::GetFunction(const std::string& name, * 
\brief GraphRuntimeDebugCreate: create a debug graph runtime module from the inputs.
 * \param sym_json The graph symbol in json format.
 * \param m Compiled module which will be loaded.
- * \param ctxs All device contexts.
+ * \param devs All devices.
 */
Module GraphRuntimeDebugCreate(const std::string& sym_json, const tvm::runtime::Module& m,
-                               const std::vector<TVMContext>& ctxs,
+                               const std::vector<Device>& devs,
                                PackedFunc lookup_linked_param_func) {
  auto exec = make_object<GraphRuntimeDebug>();
-  exec->Init(sym_json, m, ctxs, lookup_linked_param_func);
+  exec->Init(sym_json, m, devs, lookup_linked_param_func);
  return Module(exec);
}

@@ -284,13 +284,13 @@ TVM_REGISTER_GLOBAL("tvm.graph_runtime_debug.create").set_body([](TVMArgs args,
                        "at least 4, but it has "
                     << args.num_args;
  PackedFunc lookup_linked_param_func;
-  int ctx_start_arg = 2;
+  int dev_start_arg = 2;
  if (args[2].type_code() == kTVMPackedFuncHandle) {
    lookup_linked_param_func = args[2];
-    ctx_start_arg++;
+    dev_start_arg++;
  }
-  *rv = GraphRuntimeDebugCreate(args[0], args[1], GetAllContext(args, ctx_start_arg),
+  *rv = GraphRuntimeDebugCreate(args[0], args[1], GetAllDevice(args, dev_start_arg),
                                 lookup_linked_param_func);
});
}  // namespace runtime
diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc
index b11a57352f547..a4320ee3b9c03 100644
--- a/src/runtime/graph/graph_runtime.cc
+++ b/src/runtime/graph/graph_runtime.cc
@@ -60,22 +60,22 @@ void GraphRuntime::Run() {
  }
}
/*!
- * \brief Initialize the graph executor with graph and context.
+ * \brief Initialize the graph executor with graph and device.
 * \param graph_json The execution graph.
 * \param module The module containing the compiled functions for the host
 * processor.
- * \param ctxs The contexts of the host and devices where graph nodes will be
+ * \param devs The devices of the host and devices where graph nodes will be
 * executed.
 * \param lookup_linked_param_func Linked parameter lookup function. Default is nullptr.
 */
void GraphRuntime::Init(const std::string& graph_json, tvm::runtime::Module module,
-                        const std::vector<TVMContext>& ctxs,
+                        const std::vector<Device>& devs,
                        const PackedFunc lookup_linked_param_func) {
  std::istringstream is(graph_json);
  dmlc::JSONReader reader(&is);
  this->Load(&reader);
  module_ = module;
-  ctxs_ = ctxs;
+  devices_ = devs;
  lookup_linked_param_ = lookup_linked_param_func;
  if (lookup_linked_param_ == nullptr) {
    lookup_linked_param_ = PackedFunc(
@@ -125,8 +125,8 @@ void GraphRuntime::SetInputZeroCopy(int index, DLTensor* data_ref) {
  ICHECK_EQ(data_alignment_[eid], details::GetDataAlignment(*data_ref));
  ICHECK_EQ(reinterpret_cast<size_t>(data_ref->data) % kAllocAlignment, 0);
  ICHECK_EQ(old_t->ndim, static_cast<int64_t>(data_ref->ndim));
-  ICHECK_EQ(old_t->ctx.device_type, data_ref->ctx.device_type);
-  ICHECK_EQ(old_t->ctx.device_id, data_ref->ctx.device_id);
+  ICHECK_EQ(old_t->device.device_type, data_ref->device.device_type);
+  ICHECK_EQ(old_t->device.device_id, data_ref->device.device_id);
  for (auto i = 0; i < data_ref->ndim; ++i) {
    ICHECK_EQ(old_t->shape[i], data_ref->shape[i]);
  }
@@ -243,7 +243,7 @@ void GraphRuntime::DefaultLookupLinkedParam(TVMArgs args, TVMRetValue* rv) {
  Module mod = args[0];
  int64_t storage_id = args[1];
  DLTensor* template_tensor = args[2];
-  TVMContext ctx = args[3];
+  Device dev = args[3];
  // Get pre-linked parameter lookup function, if it was generated. When pf == nullptr, no linked
  // params are present.
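  // (Editor's illustrative note, not part of this change.) A linked-parameter
  // lookup PackedFunc takes (Module, storage_id, DLTensor*, Device) and returns
  // either an NDArray aliasing the linked parameter or nullptr; SetupStorage
  // below invokes it as, e.g.:
  //   TVMRetValue lookup_rv = lookup_linked_param_(module_, sid, &template_tensor, devices_[0]);
  //   if (lookup_rv.type_code() != kTVMNullptr) { /* storage is a linked param */ }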
if (!module_lookup_linked_param_valid_) { @@ -265,7 +265,7 @@ void GraphRuntime::DefaultLookupLinkedParam(TVMArgs args, TVMRetValue* rv) { template_tensor->shape + template_tensor->ndim}; std::unique_ptr container{new NDArray::Container( - static_cast(opaque_handle), shape_vec, template_tensor->dtype, ctx)}; + static_cast(opaque_handle), shape_vec, template_tensor->dtype, dev)}; container->SetDeleter(GraphRuntime::LinkedNDArrayDeleter); *rv = NDArray(GetObjectPtr(container.release())); } @@ -283,7 +283,7 @@ void GraphRuntime::SetupStorage() { for (size_t i = 0; i < attrs_.shape.size(); ++i) { int storage_id = attrs_.storage_id[i]; // Use the fallback device if no device index is available. - int device_type = static_cast(ctxs_[0].device_type); + int device_type = static_cast(devices_[0].device_type); if (!attrs_.device_index.empty()) { device_type = attrs_.device_index[i]; } @@ -307,10 +307,10 @@ void GraphRuntime::SetupStorage() { TVMRetValue lookup_rv; { std::vector shape_vec{attrs_.shape[i].begin(), attrs_.shape[i].end()}; - DLTensor template_tensor{nullptr, TVMContext{kDLCPU, 0}, static_cast(shape_vec.size()), - vtype[i], shape_vec.data(), nullptr, + DLTensor template_tensor{nullptr, Device{kDLCPU, 0}, static_cast(shape_vec.size()), + vtype[i], shape_vec.data(), nullptr, 0}; - lookup_rv = lookup_linked_param_(module_, sid, &template_tensor, ctxs_[0]); + lookup_rv = lookup_linked_param_(module_, sid, &template_tensor, devices_[0]); } if (lookup_rv.type_code() != kTVMNullptr) { pool_entry[sid].linked_param = lookup_rv; @@ -324,16 +324,16 @@ void GraphRuntime::SetupStorage() { for (const auto& pit : pool_entry) { // This for loop is very fast since there are usually only a couple of // devices available on the same hardware. - const auto& cit = std::find_if(ctxs_.begin(), ctxs_.end(), [&pit](const TVMContext& c) { - return pit.device_type == static_cast(c.device_type); + const auto& cit = std::find_if(devices_.begin(), devices_.end(), [&pit](const Device& d) { + return pit.device_type == static_cast(d.device_type); }); - TVMContext ctx = cit == ctxs_.end() ? ctxs_[0] : *cit; + Device dev = cit == devices_.end() ? devices_[0] : *cit; if (pit.linked_param.defined()) { storage_pool_.push_back(pit.linked_param); } else { std::vector shape; shape.push_back(static_cast(pit.size + 3) / 4); - storage_pool_.push_back(NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, ctx)); + storage_pool_.push_back(NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, dev)); } } @@ -505,23 +505,23 @@ PackedFunc GraphRuntime::GetFunction(const std::string& name, } Module GraphRuntimeCreate(const std::string& sym_json, const tvm::runtime::Module& m, - const std::vector& ctxs, + const std::vector& devs, const PackedFunc lookup_linked_param_func) { auto exec = make_object(); - exec->Init(sym_json, m, ctxs, lookup_linked_param_func); + exec->Init(sym_json, m, devs, lookup_linked_param_func); return Module(exec); } -// Get all context for the host and other runtime devices. -std::vector GetAllContext(const TVMArgs& args, int ctx_start_arg) { +// Get all devices for the host and other runtime devices. +std::vector GetAllDevice(const TVMArgs& args, int dev_start_arg) { // Reserve the first item as the fallback device. 
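  // Worked example (editor's illustration): a call packed as
  //   create(graph_json, mod, 2 /*kDLGPU*/, 0, 1 /*kDLCPU*/, 0)
  // with dev_start_arg == 2 yields {Device{kDLGPU, 0}, Device{kDLCPU, 0}},
  // and the first entry (the GPU) becomes the fallback device.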
-  std::vector<TVMContext> ret;
-  TVMContext ctx;
-  for (int i = ctx_start_arg; i < args.num_args; i += 2) {
+  std::vector<Device> ret;
+  Device dev;
+  for (int i = dev_start_arg; i < args.num_args; i += 2) {
    int dev_type = args[i];
-    ctx.device_type = static_cast<DLDeviceType>(dev_type);
-    ctx.device_id = args[i + 1];
-    ret.push_back(ctx);
+    dev.device_type = static_cast<DLDeviceType>(dev_type);
+    dev.device_id = args[i + 1];
+    ret.push_back(dev);
  }
  return ret;
}
@@ -530,19 +530,19 @@ std::vector<TVMContext> GetAllContext(const TVMArgs& args, int ctx_start_arg) {
// from tvm4j and javascript, since they don't have heterogeneous
// execution support yet. For heterogeneous execution, at least 5 arguments will
// be passed in. The third one is the number of devices.
-// Eventually, we will probably only pass TVMContext for all the languages.
+// Eventually, we will probably only pass Device for all the languages.
TVM_REGISTER_GLOBAL("tvm.graph_runtime.create").set_body([](TVMArgs args, TVMRetValue* rv) {
  ICHECK_GE(args.num_args, 4) << "The expected number of arguments for graph_runtime.create is "
                                 "at least 4, but it has "
                              << args.num_args;
  PackedFunc lookup_linked_param_func;
-  int ctx_start_arg = 2;
+  int dev_start_arg = 2;
  if (args[2].type_code() == kTVMPackedFuncHandle) {
    lookup_linked_param_func = args[2];
-    ctx_start_arg++;
+    dev_start_arg++;
  }
-  const auto& contexts = GetAllContext(args, ctx_start_arg);
-  *rv = GraphRuntimeCreate(args[0], args[1], contexts, lookup_linked_param_func);
+  const auto& devices = GetAllDevice(args, dev_start_arg);
+  *rv = GraphRuntimeCreate(args[0], args[1], devices, lookup_linked_param_func);
});
}  // namespace runtime
}  // namespace tvm
diff --git a/src/runtime/graph/graph_runtime.h b/src/runtime/graph/graph_runtime.h
index e417d2aa4bfcc..aeaee9e3483ed 100644
--- a/src/runtime/graph/graph_runtime.h
+++ b/src/runtime/graph/graph_runtime.h
@@ -85,11 +85,11 @@ class TVM_DLL GraphRuntime : public ModuleNode {
  void Run();
  /*!
-   * \brief Initialize the graph executor with graph and context.
+   * \brief Initialize the graph executor with graph and device.
   * \param graph_json The execution graph.
   * \param module The module containing the compiled functions for the host
   * processor.
-   * \param ctxs The contexts of the host and devices where graph nodes will be
+   * \param devs The devices of the host and devices where graph nodes will be
   * executed.
   * \param lookup_linked_param_func If given, a PackedFunc invoked to lookup linked parameters
   * by storage_id. If not given, linked parameters are looked-up using an internal implementation,
   */
  void Init(const std::string& graph_json, tvm::runtime::Module module,
-            const std::vector<TVMContext>& ctxs,
-            const PackedFunc lookup_linked_param_func = nullptr);
+            const std::vector<Device>& devs, const PackedFunc lookup_linked_param_func = nullptr);

  /*!
   * \brief Get the input index given the name of input.
@@ -407,7 +406,7 @@ class TVM_DLL GraphRuntime : public ModuleNode {
  /*! \brief The code module that contains both host and device code. */
  tvm::runtime::Module module_;
  /*! \brief Execution context of all devices including the host. */
-  std::vector<TVMContext> ctxs_;
+  std::vector<Device> devices_;
  /*! \brief Common storage pool for all devices. */
  std::vector<NDArray> storage_pool_;
  /*! \brief Data entry of each node.
 */
@@ -427,7 +426,7 @@ class TVM_DLL GraphRuntime : public ModuleNode {
  bool module_lookup_linked_param_valid_;
};

-std::vector<TVMContext> GetAllContext(const TVMArgs& args, int ctx_start_arg);
+std::vector<Device> GetAllDevice(const TVMArgs& args, int dev_start_arg);
}  // namespace runtime
}  // namespace tvm
diff --git a/src/runtime/graph/graph_runtime_factory.cc b/src/runtime/graph/graph_runtime_factory.cc
index 1682afa8464a0..a6f87bac8a6e1 100644
--- a/src/runtime/graph/graph_runtime_factory.cc
+++ b/src/runtime/graph/graph_runtime_factory.cc
@@ -47,22 +47,22 @@ PackedFunc GraphRuntimeFactory::GetFunction(
    const std::string& name, const tvm::runtime::ObjectPtr<Object>& sptr_to_self) {
  if (name == module_name_) {
    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
-      std::vector<TVMContext> contexts;
+      std::vector<Device> devices;
      for (int i = 0; i < args.num_args; ++i) {
-        contexts.emplace_back(args[i].operator TVMContext());
+        devices.emplace_back(args[i].operator Device());
      }
-      *rv = this->RuntimeCreate(contexts);
+      *rv = this->RuntimeCreate(devices);
    });
  } else if (name == "debug_create") {
    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
      ICHECK_GE(args.size(), 2);
      std::string module_name = args[0].operator String();
      ICHECK(module_name == module_name_) << "Currently we only support a single model.";
-      std::vector<TVMContext> contexts;
+      std::vector<Device> devices;
      for (int i = 1; i < args.num_args; ++i) {
-        contexts.emplace_back(args[i].operator TVMContext());
+        devices.emplace_back(args[i].operator Device());
      }
-      *rv = this->DebugRuntimeCreate(contexts);
+      *rv = this->DebugRuntimeCreate(devices);
    });
  } else if (name == "remove_params") {
    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
@@ -74,11 +74,11 @@ PackedFunc GraphRuntimeFactory::GetFunction(
    });
  } else if (name == "cuda_graph_create") {
    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
-      std::vector<TVMContext> contexts;
+      std::vector<Device> devices;
      for (int i = 0; i < args.num_args; ++i) {
-        contexts.emplace_back(args[i].operator TVMContext());
+        devices.emplace_back(args[i].operator Device());
      }
-      *rv = this->CudaGraphRuntimeCreate(contexts);
+      *rv = this->CudaGraphRuntimeCreate(devices);
    });
  } else {
    return PackedFunc();
@@ -103,32 +103,32 @@ void GraphRuntimeFactory::SaveToBinary(dmlc::Stream* stream) {
  stream->Write(module_name_);
}

-Module GraphRuntimeFactory::RuntimeCreate(const std::vector<TVMContext>& ctxs) {
+Module GraphRuntimeFactory::RuntimeCreate(const std::vector<Device>& devs) {
  auto exec = make_object<GraphRuntime>();
-  exec->Init(this->graph_json_, this->imports_[0], ctxs, PackedFunc());
+  exec->Init(this->graph_json_, this->imports_[0], devs, PackedFunc());
  // set params
  SetParams(exec.get(), this->params_);
  return Module(exec);
}

-Module GraphRuntimeFactory::DebugRuntimeCreate(const std::vector<TVMContext>& ctxs) {
+Module GraphRuntimeFactory::DebugRuntimeCreate(const std::vector<Device>& devs) {
  const PackedFunc* pf = tvm::runtime::Registry::Get("tvm.graph_runtime_debug.create");
  ICHECK(pf != nullptr) << "Cannot find function tvm.graph_runtime_debug.create in registry. "
                           "Did you enable the debug graph runtime build?";
-  // The debug runtime create packed function will call GetAllContext, so we unpack the ctxs.
-  std::vector<int> unpacked_ctxs;
-  for (const auto& ctx : ctxs) {
-    unpacked_ctxs.emplace_back(ctx.device_type);
-    unpacked_ctxs.emplace_back(ctx.device_id);
+  // The debug runtime create packed function will call GetAllDevice, so we unpack the devs.
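+  // Editor's illustration: devs = {Device{kDLGPU, 0}} unpacks to {2, 0},
+  // i.e. device_type first, then device_id, matching GetAllDevice's convention.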
+ std::vector unpacked_devs; + for (const auto& dev : devs) { + unpacked_devs.emplace_back(dev.device_type); + unpacked_devs.emplace_back(dev.device_id); } - size_t args_size = unpacked_ctxs.size() + 2; + size_t args_size = unpacked_devs.size() + 2; std::vector values(args_size); std::vector codes(args_size); runtime::TVMArgsSetter setter(values.data(), codes.data()); setter(0, this->graph_json_); setter(1, this->imports_[0]); - for (size_t i = 0; i < unpacked_ctxs.size(); ++i) { - setter(i + 2, unpacked_ctxs[i]); + for (size_t i = 0; i < unpacked_devs.size(); ++i) { + setter(i + 2, unpacked_devs[i]); } TVMRetValue rv; pf->CallPacked(TVMArgs(values.data(), codes.data(), args_size), &rv); @@ -138,23 +138,23 @@ Module GraphRuntimeFactory::DebugRuntimeCreate(const std::vector& ct return mod; } -Module GraphRuntimeFactory::CudaGraphRuntimeCreate(const std::vector& ctxs) { +Module GraphRuntimeFactory::CudaGraphRuntimeCreate(const std::vector& devs) { const PackedFunc* pf = tvm::runtime::Registry::Get("tvm.graph_runtime_cuda_graph.create"); ICHECK(pf != nullptr) << "Cannot find function tvm.graph_runtime_cuda_graph.create in registry. " "Did you set(USE_GRAPH_RUNTIME_CUGRAPH=ON)?"; - std::vector unpacked_ctxs; - for (const auto& ctx : ctxs) { - unpacked_ctxs.emplace_back(ctx.device_type); - unpacked_ctxs.emplace_back(ctx.device_id); + std::vector unpacked_devs; + for (const auto& dev : devs) { + unpacked_devs.emplace_back(dev.device_type); + unpacked_devs.emplace_back(dev.device_id); } - size_t args_size = unpacked_ctxs.size() + 2; + size_t args_size = unpacked_devs.size() + 2; std::vector values(args_size); std::vector codes(args_size); runtime::TVMArgsSetter setter(values.data(), codes.data()); setter(0, this->graph_json_); setter(1, this->imports_[0]); - for (size_t i = 0; i < unpacked_ctxs.size(); ++i) { - setter(i + 2, unpacked_ctxs[i]); + for (size_t i = 0; i < unpacked_devs.size(); ++i) { + setter(i + 2, unpacked_devs[i]); } TVMRetValue rv; pf->CallPacked(TVMArgs(values.data(), codes.data(), args_size), &rv); diff --git a/src/runtime/graph/graph_runtime_factory.h b/src/runtime/graph/graph_runtime_factory.h index f2f11ee66802a..86958218a0f75 100644 --- a/src/runtime/graph/graph_runtime_factory.h +++ b/src/runtime/graph/graph_runtime_factory.h @@ -75,27 +75,27 @@ class TVM_DLL GraphRuntimeFactory : public runtime::ModuleNode { /*! * \brief Create a specific runtime module - * \param ctxs The context of the host and devices where graph nodes will be + * \param devs The device of the host and devices where graph nodes will be * executed on. * \return created runtime module */ - Module RuntimeCreate(const std::vector& ctxs); + Module RuntimeCreate(const std::vector& devs); /*! * \brief Create a specific debug runtime module - * \param ctxs The context of the host and devices where graph nodes will be + * \param devs The device of the host and devices where graph nodes will be * executed on. * \return created debug runtime module */ - Module DebugRuntimeCreate(const std::vector& ctxs); + Module DebugRuntimeCreate(const std::vector& devs); /*! * \brief Create a specific cuda graph runtime module - * \param ctxs The context of the host and devices where graph nodes will be + * \param devs The device of the host and devices where graph nodes will be * executed on. * \return created cuda graph runtime module */ - Module CudaGraphRuntimeCreate(const std::vector& ctx); + Module CudaGraphRuntimeCreate(const std::vector& devs); /*! * \brief Set params. 
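Illustrative usage of the renamed factory interface (an editor's sketch, not part of the patch; the shared-library name net.so and the exported module name "default" are assumptions):

    #include <tvm/runtime/module.h>
    #include <tvm/runtime/packed_func.h>

    int main() {
      // Load a compiled graph runtime factory module (hypothetical file name).
      tvm::runtime::Module factory = tvm::runtime::Module::LoadFromFile("net.so");
      // After this patch, callers hand the factory Device values instead of TVMContext.
      DLDevice dev{kDLCPU, 0};
      // The factory function forwards each device to RuntimeCreate.
      tvm::runtime::Module gmod = factory.GetFunction("default")(dev);
      tvm::runtime::PackedFunc run = gmod.GetFunction("run");
      run();
      return 0;
    }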
diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc index a01c9def5d5de..a07a7c683026b 100644 --- a/src/runtime/hexagon/hexagon_device_api.cc +++ b/src/runtime/hexagon/hexagon_device_api.cc @@ -31,13 +31,13 @@ namespace runtime { class HexagonDeviceAPI : public DeviceAPI { public: - void SetDevice(TVMContext ctx) final; - void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final; - void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, DLDataType type_hint) final; - void FreeDataSpace(TVMContext ctx, void* ptr) final; - void StreamSync(TVMContext ctx, TVMStreamHandle stream) final; - void* AllocWorkspace(TVMContext ctx, size_t nbytes, DLDataType type_hint = {}) final; - void FreeWorkspace(TVMContext ctx, void* ptr) final; + void SetDevice(Device dev) final; + void GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) final; + void* AllocDataSpace(Device dev, size_t nbytes, size_t alignment, DLDataType type_hint) final; + void FreeDataSpace(Device dev, void* ptr) final; + void StreamSync(Device dev, TVMStreamHandle stream) final; + void* AllocWorkspace(Device dev, size_t nbytes, DLDataType type_hint = {}) final; + void FreeWorkspace(Device dev, void* ptr) final; static HexagonDeviceAPI* Global() { // NOTE: explicitly use new to avoid destruction of global state @@ -48,33 +48,33 @@ class HexagonDeviceAPI : public DeviceAPI { protected: void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, - size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to, - DLDataType type_hint, TVMStreamHandle stream) final; + size_t num_bytes, Device dev_from, Device dev_to, DLDataType type_hint, + TVMStreamHandle stream) final; }; // HexagonDeviceAPI. -inline void HexagonDeviceAPI::SetDevice(TVMContext ctx) {} +inline void HexagonDeviceAPI::SetDevice(Device dev) {} -inline void HexagonDeviceAPI::GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) { +inline void HexagonDeviceAPI::GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) { if (kind == kExist) *rv = 1; } -inline void* HexagonDeviceAPI::AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, +inline void* HexagonDeviceAPI::AllocDataSpace(Device dev, size_t nbytes, size_t alignment, DLDataType type_hint) { - ICHECK(hexagon::Device::ValidateDeviceId(ctx.device_id)); + ICHECK(hexagon::Device::ValidateDeviceId(dev.device_id)); return hexagon::Device::Global()->Alloc(nbytes, alignment); } -inline void HexagonDeviceAPI::FreeDataSpace(TVMContext ctx, void* ptr) { - ICHECK(hexagon::Device::ValidateDeviceId(ctx.device_id)); +inline void HexagonDeviceAPI::FreeDataSpace(Device dev, void* ptr) { + ICHECK(hexagon::Device::ValidateDeviceId(dev.device_id)); hexagon::Device::Global()->Free(ptr); } inline void HexagonDeviceAPI::CopyDataFromTo(const void* from, size_t from_offset, void* to, - size_t to_offset, size_t num_bytes, - TVMContext ctx_from, TVMContext ctx_to, - DLDataType type_hint, TVMStreamHandle stream) { + size_t to_offset, size_t num_bytes, Device dev_from, + Device dev_to, DLDataType type_hint, + TVMStreamHandle stream) { const char* src = static_cast(from) + from_offset; char* dst = static_cast(to) + to_offset; @@ -83,45 +83,45 @@ inline void HexagonDeviceAPI::CopyDataFromTo(const void* from, size_t from_offse }; (void)Is32bit; - if (ctx_from.device_type == ctx_to.device_type) { - if (ctx_from.device_type == kDLCPU) { + if (dev_from.device_type == dev_to.device_type) { + if (dev_from.device_type == kDLCPU) { memmove(dst, 
src, num_bytes); - } else if (static_cast(ctx_from.device_type) == kDLHexagon) { - ICHECK(hexagon::Device::ValidateDeviceId(ctx_from.device_id)); - ICHECK_EQ(ctx_from.device_id, ctx_to.device_id); + } else if (static_cast(dev_from.device_type) == kDLHexagon) { + ICHECK(hexagon::Device::ValidateDeviceId(dev_from.device_id)); + ICHECK_EQ(dev_from.device_id, dev_to.device_id); ICHECK(Is32bit(dst) && Is32bit(src)); hexagon::Device::Global()->CopyDeviceToDevice(dst, src, num_bytes); } } else { - if (ctx_from.device_type == kDLCPU) { - ICHECK_EQ(static_cast(ctx_to.device_type), kDLHexagon); + if (dev_from.device_type == kDLCPU) { + ICHECK_EQ(static_cast(dev_to.device_type), kDLHexagon); ICHECK(Is32bit(dst)); - ICHECK(hexagon::Device::ValidateDeviceId(ctx_to.device_id)); + ICHECK(hexagon::Device::ValidateDeviceId(dev_to.device_id)); hexagon::Device::Global()->CopyHostToDevice(dst, src, num_bytes); } else { - ICHECK_EQ(static_cast(ctx_from.device_type), kDLHexagon); - ICHECK_EQ(ctx_to.device_type, kDLCPU); + ICHECK_EQ(static_cast(dev_from.device_type), kDLHexagon); + ICHECK_EQ(dev_to.device_type, kDLCPU); ICHECK(Is32bit(src)); - ICHECK(hexagon::Device::ValidateDeviceId(ctx_from.device_id)); + ICHECK(hexagon::Device::ValidateDeviceId(dev_from.device_id)); hexagon::Device::Global()->CopyDeviceToHost(dst, src, num_bytes); } } } -inline void HexagonDeviceAPI::StreamSync(TVMContext ctx, TVMStreamHandle stream) {} +inline void HexagonDeviceAPI::StreamSync(Device dev, TVMStreamHandle stream) {} -inline void* HexagonDeviceAPI::AllocWorkspace(TVMContext ctx, size_t nbytes, DLDataType type_hint) { - ICHECK(hexagon::Device::ValidateDeviceId(ctx.device_id)); +inline void* HexagonDeviceAPI::AllocWorkspace(Device dev, size_t nbytes, DLDataType type_hint) { + ICHECK(hexagon::Device::ValidateDeviceId(dev.device_id)); if (type_hint.code == 100) { size_t align = std::min(nbytes, 2048lu); return hexagon::Device::Global()->AllocVtcm(nbytes, align); } - return DeviceAPI::AllocWorkspace(ctx, nbytes, type_hint); + return DeviceAPI::AllocWorkspace(dev, nbytes, type_hint); } -inline void HexagonDeviceAPI::FreeWorkspace(TVMContext ctx, void* ptr) { - ICHECK(hexagon::Device::ValidateDeviceId(ctx.device_id)); - DeviceAPI::FreeWorkspace(ctx, ptr); +inline void HexagonDeviceAPI::FreeWorkspace(Device dev, void* ptr) { + ICHECK(hexagon::Device::ValidateDeviceId(dev.device_id)); + DeviceAPI::FreeWorkspace(dev, ptr); } TVM_REGISTER_GLOBAL("device_api.hexagon").set_body([](TVMArgs args, TVMRetValue* rv) { diff --git a/src/runtime/hexagon/hexagon_module.cc b/src/runtime/hexagon/hexagon_module.cc index f6a57ff55355b..73c830fe5fc96 100644 --- a/src/runtime/hexagon/hexagon_module.cc +++ b/src/runtime/hexagon/hexagon_module.cc @@ -379,7 +379,7 @@ void HexagonModuleNode::RemapArgs(const TVMArgs& args, std::vector& va case kTVMNDArrayHandle: case kTVMDLTensorHandle: { DLTensor* t = static_cast(a); - assert(TVMDeviceExtType(t->ctx.device_type) == kDLHexagon); + assert(TVMDeviceExtType(t->device.device_type) == kDLHexagon); TVMValue v; v.v_handle = CreateRemoteTensor(t); remote_tensors.push_back(v.v_handle); @@ -401,25 +401,25 @@ void* HexagonModuleNode::CreateRemoteTensor(const DLTensor* t) const { Layout of the DLTensor structure on Hexagon. 
DLTensor: Size offset - data void* 4 0 - ctx.device_type enum 1 4 - 3 5 - ctx.device_id int 4 8 - ndim int 4 12 - dtype.code uint8_t 1 16 - dtype.bits uint8_t 1 17 - dtype.lanes uint16_t 2 18 - shape int64_t* 4 20 - strides int64_t* 4 24 - 4 28 - byte_offset uint64_t 8 32 + data void* 4 0 + device.device_type enum 1 4 + 3 5 + device.device_id int 4 8 + ndim int 4 12 + dtype.code uint8_t 1 16 + dtype.bits uint8_t 1 17 + dtype.lanes uint16_t 2 18 + shape int64_t* 4 20 + strides int64_t* 4 24 + 4 28 + byte_offset uint64_t 8 32 .. end ................................ 40 */ struct __attribute__((packed)) HexagonDLTensor { uint32_t data; - uint8_t ctx_device_type; + uint8_t device_type; uint8_t pad0[3]; // MUST BE ZERO! - int32_t ctx_device_id; + int32_t device_id; int32_t ndim; uint8_t dtype_code; uint8_t dtype_bits; @@ -444,9 +444,9 @@ void* HexagonModuleNode::CreateRemoteTensor(const DLTensor* t) const { HexagonDLTensor local; local.data = static_cast(reinterpret_cast(t->data)); - local.ctx_device_type = uint8_t(t->ctx.device_type); + local.device_type = uint8_t(t->device.device_type); local.pad0[0] = local.pad0[1] = local.pad0[2] = 0; - local.ctx_device_id = t->ctx.device_id; + local.device_id = t->device.device_id; local.ndim = t->ndim; local.dtype_code = t->dtype.code; local.dtype_bits = t->dtype.bits; diff --git a/src/runtime/hexagon/hexagon_module.h b/src/runtime/hexagon/hexagon_module.h index 02ed7d2541c25..1288b933410c6 100644 --- a/src/runtime/hexagon/hexagon_module.h +++ b/src/runtime/hexagon/hexagon_module.h @@ -143,7 +143,7 @@ class Device { virtual ~Device() = 0; static std::shared_ptr Global(); - static bool ValidateDeviceId(decltype(DLContext::device_id) device_id) { + static bool ValidateDeviceId(decltype(DLDevice::device_id) device_id) { // Only supporting a single device for now. return device_id == 0; } diff --git a/src/runtime/metal/metal_common.h b/src/runtime/metal/metal_common.h index b5d06192396b2..55f9022a6b96d 100644 --- a/src/runtime/metal/metal_common.h +++ b/src/runtime/metal/metal_common.h @@ -62,57 +62,57 @@ class MetalWorkspace final : public DeviceAPI { std::mutex mutex; // Destructor ~MetalWorkspace(); - // Get command queue for given context. - id GetCommandQueue(TVMContext ctx) { - ICHECK_EQ(ctx.device_type, kDLMetal); - ICHECK(ctx.device_id >= 0 && static_cast(ctx.device_id) < queues.size()) - << "Invalid Metal device_id=" << ctx.device_id; - return queues[ctx.device_id]; + // Get command queue for given device. + id GetCommandQueue(Device dev) { + ICHECK_EQ(dev.device_type, kDLMetal); + ICHECK(dev.device_id >= 0 && static_cast(dev.device_id) < queues.size()) + << "Invalid Metal device_id=" << dev.device_id; + return queues[dev.device_id]; } - // Get device for given context - id GetDevice(TVMContext ctx) { - ICHECK_EQ(ctx.device_type, kDLMetal); - ICHECK(ctx.device_id >= 0 && static_cast(ctx.device_id) < devices.size()) - << "Invalid Metal device_id=" << ctx.device_id; - return devices[ctx.device_id]; + // Get device for given device + id GetDevice(Device dev) { + ICHECK_EQ(dev.device_type, kDLMetal); + ICHECK(dev.device_id >= 0 && static_cast(dev.device_id) < devices.size()) + << "Invalid Metal device_id=" << dev.device_id; + return devices[dev.device_id]; } // Initialize workspace // Return false if already initialized, otherwise return true. 
void Init(); // override device API - void SetDevice(TVMContext ctx) final; - void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final; - void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, DLDataType type_hint) final; - void FreeDataSpace(TVMContext ctx, void* ptr) final; - void StreamSync(TVMContext ctx, TVMStreamHandle stream) final; - void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final; - void FreeWorkspace(TVMContext ctx, void* data) final; + void SetDevice(Device dev) final; + void GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) final; + void* AllocDataSpace(Device dev, size_t nbytes, size_t alignment, DLDataType type_hint) final; + void FreeDataSpace(Device dev, void* ptr) final; + void StreamSync(Device dev, TVMStreamHandle stream) final; + void* AllocWorkspace(Device dev, size_t size, DLDataType type_hint) final; + void FreeWorkspace(Device dev, void* data) final; // get the global workspace static MetalWorkspace* Global(); protected: void CopyDataFromTo(const void* from, size_t from_size, void* to, size_t to_size, size_t size, - TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, + Device dev_from, Device dev_to, DLDataType type_hint, TVMStreamHandle stream) final; }; /*! \brief Thread local workspace */ class MetalThreadEntry { public: - /*! \brief The current context */ - TVMContext context; + /*! \brief The current device */ + Device device; /*! \brief The shared buffer used for copy. */ std::vector > temp_buffer_; /*! \brief workspace pool */ WorkspacePool pool; // constructor MetalThreadEntry() : pool(static_cast(kDLMetal), MetalWorkspace::Global()) { - context.device_id = 0; - context.device_type = static_cast(kDLMetal); + device.device_id = 0; + device.device_type = static_cast(kDLMetal); } ~MetalThreadEntry(); - // Get temp buffer with at least size under ctx. - id GetTempBuffer(TVMContext ctx, size_t size); + // Get temp buffer with at least size under dev. 
+ id GetTempBuffer(Device dev, size_t size); // get the global workspace static MetalThreadEntry* ThreadLocal(); }; diff --git a/src/runtime/metal/metal_device_api.mm b/src/runtime/metal/metal_device_api.mm index 3d7abd1340353..cf8520864e99e 100644 --- a/src/runtime/metal/metal_device_api.mm +++ b/src/runtime/metal/metal_device_api.mm @@ -38,10 +38,10 @@ } } -void MetalWorkspace::GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) { +void MetalWorkspace::GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) { @autoreleasepool { this->Init(); - size_t index = static_cast(ctx.device_id); + size_t index = static_cast(dev.device_id); if (kind == kExist) { *rv = int(index < devices.size()); return; @@ -49,7 +49,7 @@ ICHECK_LT(index, devices.size()) << "Invalid device id " << index; switch (kind) { case kMaxThreadsPerBlock: { - *rv = static_cast([devices[ctx.device_id] maxThreadsPerThreadgroup].width); + *rv = static_cast([devices[dev.device_id] maxThreadsPerThreadgroup].width); break; } case kWarpSize: { @@ -149,15 +149,15 @@ int GetWarpSize(id dev) { #endif } -void MetalWorkspace::SetDevice(TVMContext ctx) { - MetalThreadEntry::ThreadLocal()->context.device_id = ctx.device_id; +void MetalWorkspace::SetDevice(Device dev) { + MetalThreadEntry::ThreadLocal()->device.device_id = dev.device_id; } -void* MetalWorkspace::AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, +void* MetalWorkspace::AllocDataSpace(Device device, size_t nbytes, size_t alignment, DLDataType type_hint) { @autoreleasepool { this->Init(); - id dev = GetDevice(ctx); + id dev = GetDevice(device); // GPU memory only MTLResourceOptions storage_mode = MTLResourceStorageModePrivate; /* @@ -173,7 +173,7 @@ int GetWarpSize(id dev) { } } -void MetalWorkspace::FreeDataSpace(TVMContext ctx, void* ptr) { +void MetalWorkspace::FreeDataSpace(Device dev, void* ptr) { @autoreleasepool { // MTLBuffer PurgeableState should be set to empty before manual // release in order to prevent memory leak @@ -184,21 +184,20 @@ int GetWarpSize(id dev) { } void MetalWorkspace::CopyDataFromTo(const void* from, size_t from_offset, void* to, - size_t to_offset, size_t size, TVMContext ctx_from, - TVMContext ctx_to, DLDataType type_hint, - TVMStreamHandle stream) { + size_t to_offset, size_t size, Device dev_from, Device dev_to, + DLDataType type_hint, TVMStreamHandle stream) { @autoreleasepool { this->Init(); ICHECK(stream == nullptr); - TVMContext ctx = ctx_from; - if (ctx_from.device_type == kDLCPU) ctx = ctx_to; - id queue = GetCommandQueue(ctx); + Device dev = dev_from; + if (dev_from.device_type == kDLCPU) dev = dev_to; + id queue = GetCommandQueue(dev); id cb = [queue commandBuffer]; - int from_dev_type = static_cast(ctx_from.device_type); - int to_dev_type = static_cast(ctx_to.device_type); + int from_dev_type = static_cast(dev_from.device_type); + int to_dev_type = static_cast(dev_to.device_type); if (from_dev_type == kDLMetal && to_dev_type == kDLMetal) { - ICHECK_EQ(ctx_from.device_id, ctx_to.device_id) << "Metal disallow cross device copy."; + ICHECK_EQ(dev_from.device_id, dev_to.device_id) << "Metal disallow cross device copy."; id encoder = [cb blitCommandEncoder]; [encoder copyFromBuffer:(id)(from) sourceOffset:from_offset @@ -210,7 +209,7 @@ int GetWarpSize(id dev) { // copy to a local buffer before get into global buffer. 
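      // (Editor's note, illustrative) Buffers allocated with
      // MTLResourceStorageModePrivate are not host visible, so a Metal-to-CPU
      // copy stages through a shared temp buffer:
      //   private buffer --blit--> shared temp --memcpy--> host destination.
      // The CPU-to-Metal branch below mirrors this: memcpy into the temp
      // buffer, then blit into the destination.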
id from_buf = (id)(from); if (from_buf.storageMode != MTLStorageModeShared) { - id temp = MetalThreadEntry::ThreadLocal()->GetTempBuffer(ctx_from, size); + id temp = MetalThreadEntry::ThreadLocal()->GetTempBuffer(dev_from, size); id encoder = [cb blitCommandEncoder]; [encoder copyFromBuffer:from_buf sourceOffset:from_offset @@ -228,7 +227,7 @@ int GetWarpSize(id dev) { } else if (from_dev_type == kDLCPU && to_dev_type == kDLMetal) { id to_buf = (id)(to); if (to_buf.storageMode != MTLStorageModeShared) { - id temp = MetalThreadEntry::ThreadLocal()->GetTempBuffer(ctx_to, size); + id temp = MetalThreadEntry::ThreadLocal()->GetTempBuffer(dev_to, size); memcpy([temp contents], static_cast(from) + from_offset, size); id encoder = [cb blitCommandEncoder]; [encoder copyFromBuffer:temp @@ -250,23 +249,23 @@ int GetWarpSize(id dev) { } } -void MetalWorkspace::StreamSync(TVMContext ctx, TVMStreamHandle stream) { +void MetalWorkspace::StreamSync(Device dev, TVMStreamHandle stream) { @autoreleasepool { ICHECK(stream == nullptr); // commit an empty command buffer and wait until it completes. - id queue = GetCommandQueue(ctx); + id queue = GetCommandQueue(dev); id cb = [queue commandBuffer]; [cb commit]; [cb waitUntilCompleted]; } } -void* MetalWorkspace::AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) { - return MetalThreadEntry::ThreadLocal()->pool.AllocWorkspace(ctx, size); +void* MetalWorkspace::AllocWorkspace(Device dev, size_t size, DLDataType type_hint) { + return MetalThreadEntry::ThreadLocal()->pool.AllocWorkspace(dev, size); } -void MetalWorkspace::FreeWorkspace(TVMContext ctx, void* data) { - MetalThreadEntry::ThreadLocal()->pool.FreeWorkspace(ctx, data); +void MetalWorkspace::FreeWorkspace(Device dev, void* data) { + MetalThreadEntry::ThreadLocal()->pool.FreeWorkspace(dev, data); } MetalThreadEntry::~MetalThreadEntry() { @@ -278,19 +277,19 @@ int GetWarpSize(id dev) { } } -id MetalThreadEntry::GetTempBuffer(TVMContext ctx, size_t size) { - if (temp_buffer_.size() <= static_cast(ctx.device_id)) { - temp_buffer_.resize(ctx.device_id + 1, nil); +id MetalThreadEntry::GetTempBuffer(Device dev, size_t size) { + if (temp_buffer_.size() <= static_cast(dev.device_id)) { + temp_buffer_.resize(dev.device_id + 1, nil); } - if (temp_buffer_[ctx.device_id] == nil || temp_buffer_[ctx.device_id].length < size) { - id dev = MetalWorkspace::Global()->GetDevice(ctx); - if (temp_buffer_[ctx.device_id] != nil) { - [temp_buffer_[ctx.device_id] setPurgeableState:MTLPurgeableStateEmpty]; - [temp_buffer_[ctx.device_id] release]; + if (temp_buffer_[dev.device_id] == nil || temp_buffer_[dev.device_id].length < size) { + id mtl_dev = MetalWorkspace::Global()->GetDevice(dev); + if (temp_buffer_[dev.device_id] != nil) { + [temp_buffer_[dev.device_id] setPurgeableState:MTLPurgeableStateEmpty]; + [temp_buffer_[dev.device_id] release]; } - temp_buffer_[ctx.device_id] = [dev newBufferWithLength:size options:MTLStorageModeShared]; + temp_buffer_[dev.device_id] = [mtl_dev newBufferWithLength:size options:MTLStorageModeShared]; } - return temp_buffer_[ctx.device_id]; + return temp_buffer_[dev.device_id]; } typedef dmlc::ThreadLocalStore MetalThreadStore; diff --git a/src/runtime/metal/metal_module.mm b/src/runtime/metal/metal_module.mm index c7e2d8b7b4bc7..a8b01815bf688 100644 --- a/src/runtime/metal/metal_module.mm +++ b/src/runtime/metal/metal_module.mm @@ -177,14 +177,14 @@ void Init(MetalModuleNode* m, ObjectPtr sptr, const std::string& func_na std::fill(scache_.begin(), scache_.end(), (id)nil); 
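    // (Editor's note) scache_ caches one compute pipeline state per device id:
    // seeded below for the calling thread's device and filled lazily in
    // operator() the first time any other device invokes this function.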
thread_axis_cfg_.Init(num_buffer_args + num_pack_args, thread_axis_tags); metal::MetalThreadEntry* t = metal::MetalThreadEntry::ThreadLocal(); - int dev_id = t->context.device_id; + int dev_id = t->device.device_id; scache_[dev_id] = m->GetPipelineState(dev_id, func_name); } // invoke the function with void arguments void operator()(TVMArgs args, TVMRetValue* rv, const ArgUnion64* pack_args) const { @autoreleasepool { metal::MetalThreadEntry* t = metal::MetalThreadEntry::ThreadLocal(); - int device_id = t->context.device_id; + int device_id = t->device.device_id; if (scache_[device_id] == nil) { scache_[device_id] = m_->GetPipelineState(device_id, func_name_); } @@ -192,7 +192,7 @@ void operator()(TVMArgs args, TVMRetValue* rv, const ArgUnion64* pack_args) cons int blockSize = wl.block_dim(0) * wl.block_dim(1) * wl.block_dim(2); auto maxTotalThreadsPerThreadgroup = scache_[device_id].maxTotalThreadsPerThreadgroup; CHECK_LE(blockSize, maxTotalThreadsPerThreadgroup); - id queue = w_->GetCommandQueue(t->context); + id queue = w_->GetCommandQueue(t->device); id cb = [queue commandBuffer]; id encoder = [cb computeCommandEncoder]; [encoder setComputePipelineState:scache_[device_id]]; diff --git a/src/runtime/micro/standalone/utvm_graph_runtime.cc b/src/runtime/micro/standalone/utvm_graph_runtime.cc index e19ee347a45eb..897a4f4b5c0ce 100644 --- a/src/runtime/micro/standalone/utvm_graph_runtime.cc +++ b/src/runtime/micro/standalone/utvm_graph_runtime.cc @@ -122,7 +122,7 @@ void ParseArgNodes(const picojson::array& jinput_nodes, DynArray* inpu NDArray::~NDArray() {} -NDArray NDArray::Empty(const DynArray& shape, DLDataType dtype, DLContext ctx) { +NDArray NDArray::Empty(const DynArray& shape, DLDataType dtype, DLDevice dev) { NDArray r; int64_t nbytes = (dtype.bits * dtype.lanes + 7) / 8; for (const auto& s : shape) { @@ -130,16 +130,16 @@ NDArray NDArray::Empty(const DynArray& shape, DLDataType dtype, DLConte } r.storage_ = std::shared_ptr( - TVMBackendAllocWorkspace(static_cast(ctx.device_type), static_cast(ctx.device_id), + TVMBackendAllocWorkspace(static_cast(dev.device_type), static_cast(dev.device_id), nbytes, dtype.code, dtype.bits), [=](void* ptr) { if (ptr) { - TVMBackendFreeWorkspace(ctx.device_type, ctx.device_id, ptr); + TVMBackendFreeWorkspace(dev.device_type, dev.device_id, ptr); } }); r.shape_ = shape; r.dtype_ = dtype; - r.ctx_ = ctx; + r.device_ = dev; return r; } @@ -148,7 +148,7 @@ NDArray NDArray::CreateView(const DynArray& shape, DLDataType dtype) { r.storage_ = storage_; r.shape_ = shape; r.dtype_ = dtype; - r.ctx_ = ctx_; + r.device_ = device_; return r; } @@ -156,7 +156,7 @@ DLTensor NDArray::ToDLTensor() { DLTensor r; r.data = storage_.get(); assert(r.data != nullptr); - r.ctx = ctx_; + r.device = device_; r.ndim = shape_.size(); r.dtype = dtype_; r.shape = shape_.data(); @@ -279,7 +279,7 @@ void MicroGraphRuntime::SetupStorage() { for (size_t i = 0; i < attrs_.shape.size(); ++i) { int storage_id = attrs_.storage_id[i]; // Use the fallback device if no device index is available. 
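    // (Editor's note) The standalone microTVM runtime executes everything on a
    // single device, device_{kDLCPU, 0}, so every storage entry resolves to it.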
- int device_type = static_cast(ctx_.device_type); + int device_type = static_cast(device_.device_type); size_t size = 1; for (int64_t sz : attrs_.shape[i]) { size *= static_cast(sz); @@ -306,7 +306,7 @@ void MicroGraphRuntime::SetupStorage() { const auto& pit = pool_entry[i]; DynArray shape(1); shape[0] = static_cast(pit.size + 3) / 4; - storage_pool_[i] = NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, ctx_); + storage_pool_[i] = NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, device_); } // Assign the pooled entries. A unified memory pool is used to simplify diff --git a/src/runtime/micro/standalone/utvm_graph_runtime.h b/src/runtime/micro/standalone/utvm_graph_runtime.h index b479193861bba..3a2519c98fd4e 100644 --- a/src/runtime/micro/standalone/utvm_graph_runtime.h +++ b/src/runtime/micro/standalone/utvm_graph_runtime.h @@ -93,8 +93,8 @@ struct Node { // Minimal NDArray abstraction class NDArray { public: - // initialize NDArray with shape/dtype/ctx - static NDArray Empty(const DynArray& shape, DLDataType dtype, DLContext ctx); + // initialize NDArray with shape/dtype/device + static NDArray Empty(const DynArray& shape, DLDataType dtype, DLDevice dev); // create a view of the NDArray storage, with the given shape/dtype NDArray CreateView(const DynArray& shape, DLDataType dtype); // Copy into the internal storage. @@ -112,8 +112,8 @@ class NDArray { DynArray shape_; // tensor dtype DLDataType dtype_; - // tensor context - DLContext ctx_; + // tensor device + DLDevice device_; }; // Minimal GraphRuntime implementation @@ -150,8 +150,8 @@ class MicroGraphRuntime { DynArray outputs_; // Additional graph attributes GraphAttr attrs_; - // Execution context - DLContext ctx_{kDLCPU, 0}; + // Execution device + DLDevice device_{kDLCPU, 0}; // Common storage pool DynArray storage_pool_; diff --git a/src/runtime/minrpc/minrpc_server.h b/src/runtime/minrpc/minrpc_server.h index 3b9772f2fb607..732e1e49d4a40 100644 --- a/src/runtime/minrpc/minrpc_server.h +++ b/src/runtime/minrpc/minrpc_server.h @@ -173,7 +173,7 @@ class MinRPCServer { uint64_t data_handle; this->Read(&data_handle); arr->data = reinterpret_cast(data_handle); - this->Read(&(arr->ctx)); + this->Read(&(arr->device)); this->Read(&(arr->ndim)); this->Read(&(arr->dtype)); arr->shape = this->ArenaAlloc(arr->ndim); @@ -186,13 +186,13 @@ class MinRPCServer { uint8_t* data_ptr; int call_ecode = 0; - if (arr->ctx.device_type == kDLCPU) { + if (arr->device.device_type == kDLCPU) { data_ptr = reinterpret_cast(data_handle) + arr->byte_offset; } else { data_ptr = this->ArenaAlloc(num_bytes); DLTensor temp; temp.data = reinterpret_cast(data_ptr); - temp.ctx = arr->ctx; + temp.device = arr->device; temp.ndim = arr->ndim; temp.dtype = arr->dtype; temp.shape = arr->shape; @@ -201,7 +201,7 @@ class MinRPCServer { call_ecode = TVMDeviceCopyDataFromTo(arr, &temp, nullptr); // need sync to make sure that the copy is completed. 
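      // (Editor's note) TVMDeviceCopyDataFromTo may only enqueue the copy on a
      // device stream; the TVMSynchronize call below blocks until the staged
      // bytes are valid before they are sent back over the RPC channel.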
if (call_ecode == 0) { - call_ecode = TVMSynchronize(arr->ctx.device_type, arr->ctx.device_id, nullptr); + call_ecode = TVMSynchronize(arr->device.device_type, arr->device.device_id, nullptr); } } @@ -224,7 +224,7 @@ class MinRPCServer { uint64_t data_handle; this->Read(&data_handle); arr->data = reinterpret_cast(data_handle); - this->Read(&(arr->ctx)); + this->Read(&(arr->device)); this->Read(&(arr->ndim)); this->Read(&(arr->dtype)); arr->shape = this->ArenaAlloc(arr->ndim); @@ -235,7 +235,7 @@ class MinRPCServer { this->Read(&num_bytes); int call_ecode = 0; - if (arr->ctx.device_type == kDLCPU) { + if (arr->device.device_type == kDLCPU) { uint8_t* dptr = reinterpret_cast(data_handle) + arr->byte_offset; this->ReadArray(dptr, num_bytes); } else { @@ -243,7 +243,7 @@ class MinRPCServer { this->ReadArray(temp_data, num_bytes); DLTensor temp; temp.data = temp_data; - temp.ctx = DLContext{kDLCPU, 0}; + temp.device = DLDevice{kDLCPU, 0}; temp.ndim = arr->ndim; temp.dtype = arr->dtype; temp.shape = arr->shape; @@ -252,7 +252,7 @@ class MinRPCServer { call_ecode = TVMDeviceCopyDataFromTo(&temp, arr, nullptr); // need sync to make sure that the copy is completed. if (call_ecode == 0) { - call_ecode = TVMSynchronize(arr->ctx.device_type, arr->ctx.device_id, nullptr); + call_ecode = TVMSynchronize(arr->device.device_type, arr->device.device_id, nullptr); } } @@ -390,18 +390,18 @@ class MinRPCServer { void SyscallDevAllocData(TVMValue* values, int* tcodes, int num_args) { MINRPC_CHECK(num_args == 4); - MINRPC_CHECK(tcodes[0] == kTVMContext); + MINRPC_CHECK(tcodes[0] == kDLDevice); MINRPC_CHECK(tcodes[1] == kDLInt); MINRPC_CHECK(tcodes[2] == kDLInt); MINRPC_CHECK(tcodes[3] == kTVMDataType); - TVMContext ctx = values[0].v_ctx; + DLDevice dev = values[0].v_device; int64_t nbytes = values[1].v_int64; int64_t alignment = values[2].v_int64; DLDataType type_hint = values[3].v_type; void* handle; - int call_ecode = TVMDeviceAllocDataSpace(ctx, nbytes, alignment, type_hint, &handle); + int call_ecode = TVMDeviceAllocDataSpace(dev, nbytes, alignment, type_hint, &handle); if (call_ecode == 0) { this->ReturnHandle(handle); @@ -418,8 +418,8 @@ class MinRPCServer { DLTensor* arr = reinterpret_cast(values[0].v_handle); const char* mem_scope = (tcodes[1] == kTVMNullptr ? 
nullptr : values[1].v_str); void* handle; - int call_ecode = TVMDeviceAllocDataSpaceWithScope(arr->ctx, arr->ndim, arr->shape, arr->dtype, - mem_scope, &handle); + int call_ecode = TVMDeviceAllocDataSpaceWithScope(arr->device, arr->ndim, arr->shape, + arr->dtype, mem_scope, &handle); if (call_ecode == 0) { this->ReturnHandle(handle); } else { @@ -429,13 +429,13 @@ class MinRPCServer { void SyscallDevFreeData(TVMValue* values, int* tcodes, int num_args) { MINRPC_CHECK(num_args == 2); - MINRPC_CHECK(tcodes[0] == kTVMContext); + MINRPC_CHECK(tcodes[0] == kDLDevice); MINRPC_CHECK(tcodes[1] == kTVMOpaqueHandle); - TVMContext ctx = values[0].v_ctx; + DLDevice dev = values[0].v_device; void* handle = values[1].v_handle; - int call_ecode = TVMDeviceFreeDataSpace(ctx, handle); + int call_ecode = TVMDeviceFreeDataSpace(dev, handle); if (call_ecode == 0) { this->ReturnVoid(); @@ -446,13 +446,13 @@ class MinRPCServer { void SyscallDevStreamSync(TVMValue* values, int* tcodes, int num_args) { MINRPC_CHECK(num_args == 2); - MINRPC_CHECK(tcodes[0] == kTVMContext); + MINRPC_CHECK(tcodes[0] == kDLDevice); MINRPC_CHECK(tcodes[1] == kTVMOpaqueHandle); - TVMContext ctx = values[0].v_ctx; + DLDevice dev = values[0].v_device; void* handle = values[1].v_handle; - int call_ecode = TVMSynchronize(ctx.device_type, ctx.device_id, handle); + int call_ecode = TVMSynchronize(dev.device_type, dev.device_id, handle); if (call_ecode == 0) { this->ReturnVoid(); @@ -511,7 +511,7 @@ class MinRPCServer { size_t npages = ((min_size + kPageSize - 1) / kPageSize); void* data; - if (TVMDeviceAllocDataSpace(DLContext{kDLCPU, 0}, npages * kPageSize, kPageAlign, + if (TVMDeviceAllocDataSpace(DLDevice{kDLCPU, 0}, npages * kPageSize, kPageAlign, DLDataType{kDLInt, 1, 1}, &data) != 0) { io_->Exit(static_cast(RPCServerStatus::kAllocError)); } @@ -523,7 +523,7 @@ class MinRPCServer { } void deallocate(ArenaPageHeader* page) { - if (TVMDeviceFreeDataSpace(DLContext{kDLCPU, 0}, page) != 0) { + if (TVMDeviceFreeDataSpace(DLDevice{kDLCPU, 0}, page) != 0) { io_->Exit(static_cast(RPCServerStatus::kAllocError)); } } diff --git a/src/runtime/minrpc/rpc_reference.h b/src/runtime/minrpc/rpc_reference.h index 07d13a7ff67b6..e42508a739596 100644 --- a/src/runtime/minrpc/rpc_reference.h +++ b/src/runtime/minrpc/rpc_reference.h @@ -223,15 +223,15 @@ struct RPCReference { template static void SendDLTensor(TChannelPtr channel, DLTensor* arr) { - TVMContext ctx; + DLDevice dev; uint64_t data; // When we return NDArray, we directly return // the space and the context // The client will be further wrapping - ctx = arr->ctx; + dev = arr->device; data = reinterpret_cast(arr->data); channel->Write(data); - channel->Write(ctx); + channel->Write(dev); channel->Write(arr->ndim); channel->Write(arr->dtype); channel->WriteArray(arr->shape, arr->ndim); @@ -249,7 +249,7 @@ struct RPCReference { DLTensor* arr = channel->template ArenaAlloc(1); DLTensor& tensor = *arr; tensor.data = reinterpret_cast(handle); - channel->Read(&(tensor.ctx)); + channel->Read(&(tensor.device)); channel->Read(&(tensor.ndim)); channel->Read(&(tensor.dtype)); tensor.shape = channel->template ArenaAlloc(tensor.ndim); @@ -306,8 +306,8 @@ struct RPCReference { channel->template Write(padding); break; } - case kTVMContext: { - channel->Write(value.v_ctx); + case kDLDevice: { + channel->Write(value.v_device); break; } @@ -408,8 +408,8 @@ struct RPCReference { channel->template Read(&padding); break; } - case kTVMContext: { - channel->Read(&(value.v_ctx)); + case kDLDevice: { + 
channel->Read(&(value.v_device)); break; } case kTVMPackedFuncHandle: diff --git a/src/runtime/ndarray.cc b/src/runtime/ndarray.cc index d46f0868a2ead..4b52a7d37ce2f 100644 --- a/src/runtime/ndarray.cc +++ b/src/runtime/ndarray.cc @@ -66,15 +66,15 @@ void ArrayCopyFromBytes(DLTensor* handle, const void* data, size_t nbytes) { DLTensor from; from.data = const_cast(data); - from.ctx = DLContext{kDLCPU, 0}; + from.device = Device{kDLCPU, 0}; from.ndim = handle->ndim; from.dtype = handle->dtype; from.shape = handle->shape; from.strides = nullptr; from.byte_offset = 0; - DeviceAPI::Get(handle->ctx)->CopyDataFromTo(&from, handle, nullptr); + DeviceAPI::Get(handle->device)->CopyDataFromTo(&from, handle, nullptr); // Synchronize in case data become unavailable later. - DeviceAPI::Get(handle->ctx)->StreamSync(handle->ctx, nullptr); + DeviceAPI::Get(handle->device)->StreamSync(handle->device, nullptr); } void ArrayCopyToBytes(const DLTensor* handle, void* data, size_t nbytes) { @@ -84,16 +84,16 @@ void ArrayCopyToBytes(const DLTensor* handle, void* data, size_t nbytes) { DLTensor to; to.data = const_cast(data); - to.ctx = DLContext{kDLCPU, 0}; + to.device = Device{kDLCPU, 0}; to.ndim = handle->ndim; to.dtype = handle->dtype; to.shape = handle->shape; to.strides = nullptr; to.byte_offset = 0; - DeviceAPI::Get(handle->ctx)->CopyDataFromTo(const_cast(handle), &to, nullptr); + DeviceAPI::Get(handle->device)->CopyDataFromTo(const_cast(handle), &to, nullptr); // Synchronize in case data become unavailable later. - DeviceAPI::Get(handle->ctx)->StreamSync(handle->ctx, nullptr); + DeviceAPI::Get(handle->device)->StreamSync(handle->device, nullptr); } struct NDArray::Internal { @@ -103,8 +103,8 @@ struct NDArray::Internal { if (ptr->manager_ctx != nullptr) { static_cast(ptr->manager_ctx)->DecRef(); } else if (ptr->dl_tensor.data != nullptr) { - tvm::runtime::DeviceAPI::Get(ptr->dl_tensor.ctx) - ->FreeDataSpace(ptr->dl_tensor.ctx, ptr->dl_tensor.data); + tvm::runtime::DeviceAPI::Get(ptr->dl_tensor.device) + ->FreeDataSpace(ptr->dl_tensor.device, ptr->dl_tensor.data); } delete ptr; } @@ -123,7 +123,7 @@ struct NDArray::Internal { } // Local create function which allocates tensor metadata // but does not allocate space for the data. 
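  // (Editor's illustration) NDArray::Empty below composes the two phases:
  //   NDArray ret = Internal::Create(shape, dtype, dev);          // metadata only
  //   ret.data = DeviceAPI::Get(dev)->AllocDataSpace(dev, ...);   // device storage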
- static NDArray Create(std::vector shape, DLDataType dtype, DLContext ctx) { + static NDArray Create(std::vector shape, DLDataType dtype, Device dev) { VerifyDataType(dtype); // critical zone: construct header @@ -138,8 +138,8 @@ struct NDArray::Internal { data->dl_tensor.ndim = static_cast(data->shape_.size()); // setup dtype data->dl_tensor.dtype = dtype; - // setup ctx - data->dl_tensor.ctx = ctx; + // setup device + data->dl_tensor.device = dev; return ret; } // Implementation of API function @@ -175,7 +175,7 @@ struct NDArray::Internal { NDArray NDArray::CreateView(std::vector shape, DLDataType dtype) { ICHECK(data_ != nullptr); ICHECK(get_mutable()->dl_tensor.strides == nullptr) << "Can only create view for compact tensor"; - NDArray ret = Internal::Create(shape, dtype, get_mutable()->dl_tensor.ctx); + NDArray ret = Internal::Create(shape, dtype, get_mutable()->dl_tensor.device); ret.get_mutable()->dl_tensor.byte_offset = this->get_mutable()->dl_tensor.byte_offset; size_t curr_size = GetDataSize(this->get_mutable()->dl_tensor); size_t view_size = GetDataSize(ret.get_mutable()->dl_tensor); @@ -190,11 +190,12 @@ NDArray NDArray::CreateView(std::vector shape, DLDataType dtype) { DLManagedTensor* NDArray::ToDLPack() const { return Internal::ToDLPack(get_mutable()); } -NDArray NDArray::Empty(std::vector shape, DLDataType dtype, DLContext ctx, +NDArray NDArray::Empty(std::vector shape, DLDataType dtype, Device dev, Optional mem_scope) { - NDArray ret = Internal::Create(shape, dtype, ctx); - ret.get_mutable()->dl_tensor.data = DeviceAPI::Get(ret->ctx)->AllocDataSpace( - ret->ctx, shape.size(), shape.data(), ret->dtype, mem_scope); + NDArray ret = Internal::Create(shape, dtype, dev); + ret.get_mutable()->dl_tensor.data = + DeviceAPI::Get(ret->device) + ->AllocDataSpace(ret->device, shape.size(), shape.data(), ret->dtype, mem_scope); return ret; } @@ -229,16 +230,16 @@ void NDArray::CopyFromTo(const DLTensor* from, DLTensor* to, TVMStreamHandle str size_t to_size = GetDataSize(*to); ICHECK_EQ(from_size, to_size) << "TVMArrayCopyFromTo: The size must exactly match"; - ICHECK(from->ctx.device_type == to->ctx.device_type || from->ctx.device_type == kDLCPU || - to->ctx.device_type == kDLCPU || from->ctx.device_type == kDLCPUPinned || - to->ctx.device_type == kDLCPUPinned) - << "Can not copy across different ctx types directly"; + ICHECK(from->device.device_type == to->device.device_type || from->device.device_type == kDLCPU || + to->device.device_type == kDLCPU || from->device.device_type == kDLCPUPinned || + to->device.device_type == kDLCPUPinned) + << "Can not copy across different device types directly"; - // Use the context that is *not* a cpu context to get the correct device + // Use the device that is *not* a cpu device to get the correct device // api manager. - TVMContext ctx = from->ctx.device_type != kDLCPU ? from->ctx : to->ctx; + Device dev = from->device.device_type != kDLCPU ? 
from->device : to->device; - DeviceAPI::Get(ctx)->CopyDataFromTo(const_cast(from), to, stream); + DeviceAPI::Get(dev)->CopyDataFromTo(const_cast(from), to, stream); } std::vector NDArray::Shape() const { return get_mutable()->shape_; } @@ -270,10 +271,10 @@ int TVMArrayAlloc(const tvm_index_t* shape, int ndim, int dtype_code, int dtype_ dtype.code = static_cast(dtype_code); dtype.bits = static_cast(dtype_bits); dtype.lanes = static_cast(dtype_lanes); - DLContext ctx; - ctx.device_type = static_cast(device_type); - ctx.device_id = device_id; - auto ndarray = NDArray::Empty(std::vector(shape, shape + ndim), dtype, ctx); + Device dev; + dev.device_type = static_cast(device_type); + dev.device_id = device_id; + auto ndarray = NDArray::Empty(std::vector(shape, shape + ndim), dtype, dev); *out = NDArray::Internal::MoveToFFIHandle(ndarray); API_END(); @@ -284,9 +285,9 @@ TVM_REGISTER_GLOBAL("runtime.TVMArrayAllocWithScope").set_body([](TVMArgs args, int ndim = args[1]; std::vector shape(shape_ptr, shape_ptr + ndim); DataType dtype = args[2]; - TVMContext ctx = args[3]; + Device dev = args[3]; Optional mem_scope = args[4]; - auto ndarray = NDArray::Empty(shape, dtype, ctx, mem_scope); + auto ndarray = NDArray::Empty(shape, dtype, dev, mem_scope); *ret = ndarray; }); diff --git a/src/runtime/opencl/aocl/aocl_common.h b/src/runtime/opencl/aocl/aocl_common.h index ae1a4a8cc31f1..448f5d0ac6d7b 100644 --- a/src/runtime/opencl/aocl/aocl_common.h +++ b/src/runtime/opencl/aocl/aocl_common.h @@ -39,7 +39,7 @@ class AOCLWorkspace final : public OpenCLWorkspace { public: // override OpenCL device API void Init() final; - bool IsOpenCLDevice(TVMContext ctx) final; + bool IsOpenCLDevice(Device dev) final; OpenCLThreadEntry* GetThreadEntry() final; // get the global workspace static OpenCLWorkspace* Global(); diff --git a/src/runtime/opencl/aocl/aocl_device_api.cc b/src/runtime/opencl/aocl/aocl_device_api.cc index 5432507087ca3..e407837f6a7ff 100644 --- a/src/runtime/opencl/aocl/aocl_device_api.cc +++ b/src/runtime/opencl/aocl/aocl_device_api.cc @@ -40,8 +40,8 @@ void AOCLWorkspace::Init() { OpenCLWorkspace::Init("aocl", "accelerator", "Intel(R) FPGA SDK for OpenCL(TM)"); } -bool AOCLWorkspace::IsOpenCLDevice(TVMContext ctx) { - return ctx.device_type == static_cast(kDLAOCL); +bool AOCLWorkspace::IsOpenCLDevice(Device dev) { + return dev.device_type == static_cast(kDLAOCL); } typedef dmlc::ThreadLocalStore AOCLThreadStore; diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h index 3fca368c758b1..b4377119e4c7e 100644 --- a/src/runtime/opencl/opencl_common.h +++ b/src/runtime/opencl/opencl_common.h @@ -27,6 +27,7 @@ #include #include #include +#include #include /* There are many OpenCL platforms that do not yet support OpenCL 2.0, @@ -218,23 +219,23 @@ class OpenCLWorkspace : public DeviceAPI { const std::string& platform_name = ""); virtual void Init() { Init("opencl", "gpu"); } // Check whether the context is OpenCL or not. 
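Stepping back to the C API patched earlier in this chunk: TVMArrayAlloc still receives the device as two plain integers rather than a Device struct, so C callers are unaffected by the rename. A hedged usage sketch of that entry point, with error handling reduced to the return code:

    #include <tvm/runtime/c_runtime_api.h>

    // Allocate a 2x3 float32 NDArray on CPU through the C API, then free it.
    int AllocExample() {
      tvm_index_t shape[2] = {2, 3};
      TVMArrayHandle handle = nullptr;
      int rc = TVMArrayAlloc(shape, /*ndim=*/2,
                             /*dtype_code=*/kDLFloat, /*dtype_bits=*/32,
                             /*dtype_lanes=*/1,
                             /*device_type=*/kDLCPU, /*device_id=*/0, &handle);
      if (rc == 0) rc = TVMArrayFree(handle);
      return rc;
    }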
- virtual bool IsOpenCLDevice(TVMContext ctx) { return ctx.device_type == kDLOpenCL; } - // get the queue of the context - cl_command_queue GetQueue(TVMContext ctx) { - ICHECK(IsOpenCLDevice(ctx)); + virtual bool IsOpenCLDevice(Device dev) { return dev.device_type == kDLOpenCL; } + // get the queue of the device + cl_command_queue GetQueue(Device dev) { + ICHECK(IsOpenCLDevice(dev)); this->Init(); - ICHECK(ctx.device_id >= 0 && static_cast(ctx.device_id) < queues.size()) - << "Invalid OpenCL device_id=" << ctx.device_id; - return queues[ctx.device_id]; + ICHECK(dev.device_id >= 0 && static_cast(dev.device_id) < queues.size()) + << "Invalid OpenCL device_id=" << dev.device_id; + return queues[dev.device_id]; } // override device API - void SetDevice(TVMContext ctx) final; - void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final; - void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment, DLDataType type_hint) final; - void FreeDataSpace(TVMContext ctx, void* ptr) final; - void StreamSync(TVMContext ctx, TVMStreamHandle stream) final; - void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final; - void FreeWorkspace(TVMContext ctx, void* data) final; + void SetDevice(Device dev) final; + void GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) final; + void* AllocDataSpace(Device dev, size_t size, size_t alignment, DLDataType type_hint) final; + void FreeDataSpace(Device dev, void* ptr) final; + void StreamSync(Device dev, TVMStreamHandle stream) final; + void* AllocWorkspace(Device dev, size_t size, DLDataType type_hint) final; + void FreeWorkspace(Device dev, void* data) final; /*! * \brief Get the thread local ThreadEntry @@ -246,7 +247,7 @@ class OpenCLWorkspace : public DeviceAPI { protected: void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, - TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, + Device dev_from, Device dev_to, DLDataType type_hint, TVMStreamHandle stream) final; }; @@ -260,16 +261,17 @@ class OpenCLThreadEntry { // timestamp used to recognize stale kernel size_t version{0}; }; - /*! \brief The current context */ - TVMContext context; + /*! \brief The current device */ + Device device; /*! \brief The thread-local kernel table */ std::vector kernel_table; /*! 
\brief workspace pool */ WorkspacePool pool; // constructor - OpenCLThreadEntry(DLDeviceType device_type, DeviceAPI* device) : pool(device_type, device) { - context.device_id = 0; - context.device_type = device_type; + OpenCLThreadEntry(DLDeviceType device_type, DeviceAPI* device_api) + : pool(device_type, device_api) { + device.device_id = 0; + device.device_type = device_type; } OpenCLThreadEntry() : OpenCLThreadEntry(kDLOpenCL, OpenCLWorkspace::Global()) {} diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index a3ec21e28f1d5..8f49279243c28 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -36,13 +36,11 @@ OpenCLWorkspace* OpenCLWorkspace::Global() { return inst; } -void OpenCLWorkspace::SetDevice(TVMContext ctx) { - GetThreadEntry()->context.device_id = ctx.device_id; -} +void OpenCLWorkspace::SetDevice(Device dev) { GetThreadEntry()->device.device_id = dev.device_id; } -void OpenCLWorkspace::GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) { +void OpenCLWorkspace::GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) { this->Init(); - size_t index = static_cast(ctx.device_id); + size_t index = static_cast(dev.device_id); if (kind == kExist) { *rv = static_cast(index < devices.size()); return; @@ -116,7 +114,7 @@ void OpenCLWorkspace::GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* } } -void* OpenCLWorkspace::AllocDataSpace(TVMContext ctx, size_t size, size_t alignment, +void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t size, size_t alignment, DLDataType type_hint) { this->Init(); ICHECK(context != nullptr) << "No OpenCL device"; @@ -126,53 +124,52 @@ void* OpenCLWorkspace::AllocDataSpace(TVMContext ctx, size_t size, size_t alignm return mptr; } -void OpenCLWorkspace::FreeDataSpace(TVMContext ctx, void* ptr) { +void OpenCLWorkspace::FreeDataSpace(Device dev, void* ptr) { // We have to make sure that the memory object is not in the command queue // for some OpenCL platforms. 
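SetDevice in this backend only mutates a per-thread record; the OpenCL queues and context stay process-wide. A plain-C++ sketch of that thread-local pattern, with `thread_local` standing in for the dmlc::ThreadLocalStore used by the real OpenCLThreadEntry:

    #include <dlpack/dlpack.h>

    // Per-thread "current device" record, as OpenCLThreadEntry keeps above.
    struct ThreadEntrySketch {
      DLDevice device{kDLOpenCL, 0};
    };

    inline ThreadEntrySketch* GetThreadEntrySketch() {
      static thread_local ThreadEntrySketch entry;  // one record per thread
      return &entry;
    }

    // SetDevice then reduces to updating the calling thread's record.
    inline void SetDeviceSketch(DLDevice dev) {
      GetThreadEntrySketch()->device.device_id = dev.device_id;
    }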
- OPENCL_CALL(clFinish(this->GetQueue(ctx))); + OPENCL_CALL(clFinish(this->GetQueue(dev))); cl_mem mptr = static_cast(ptr); OPENCL_CALL(clReleaseMemObject(mptr)); } void OpenCLWorkspace::CopyDataFromTo(const void* from, size_t from_offset, void* to, - size_t to_offset, size_t size, TVMContext ctx_from, - TVMContext ctx_to, DLDataType type_hint, - TVMStreamHandle stream) { + size_t to_offset, size_t size, Device dev_from, Device dev_to, + DLDataType type_hint, TVMStreamHandle stream) { this->Init(); ICHECK(stream == nullptr); - if (IsOpenCLDevice(ctx_from) && IsOpenCLDevice(ctx_to)) { - OPENCL_CALL(clEnqueueCopyBuffer(this->GetQueue(ctx_to), + if (IsOpenCLDevice(dev_from) && IsOpenCLDevice(dev_to)) { + OPENCL_CALL(clEnqueueCopyBuffer(this->GetQueue(dev_to), static_cast((void*)from), // NOLINT(*) static_cast(to), from_offset, to_offset, size, 0, nullptr, nullptr)); - } else if (IsOpenCLDevice(ctx_from) && ctx_to.device_type == kDLCPU) { - OPENCL_CALL(clEnqueueReadBuffer(this->GetQueue(ctx_from), + } else if (IsOpenCLDevice(dev_from) && dev_to.device_type == kDLCPU) { + OPENCL_CALL(clEnqueueReadBuffer(this->GetQueue(dev_from), static_cast((void*)from), // NOLINT(*) CL_FALSE, from_offset, size, static_cast(to) + to_offset, 0, nullptr, nullptr)); - OPENCL_CALL(clFinish(this->GetQueue(ctx_from))); - } else if (ctx_from.device_type == kDLCPU && IsOpenCLDevice(ctx_to)) { - OPENCL_CALL(clEnqueueWriteBuffer(this->GetQueue(ctx_to), static_cast(to), CL_FALSE, + OPENCL_CALL(clFinish(this->GetQueue(dev_from))); + } else if (dev_from.device_type == kDLCPU && IsOpenCLDevice(dev_to)) { + OPENCL_CALL(clEnqueueWriteBuffer(this->GetQueue(dev_to), static_cast(to), CL_FALSE, to_offset, size, static_cast(from) + from_offset, 0, nullptr, nullptr)); - OPENCL_CALL(clFinish(this->GetQueue(ctx_to))); + OPENCL_CALL(clFinish(this->GetQueue(dev_to))); } else { LOG(FATAL) << "Expect copy from/to OpenCL or between OpenCL"; } } -void OpenCLWorkspace::StreamSync(TVMContext ctx, TVMStreamHandle stream) { +void OpenCLWorkspace::StreamSync(Device dev, TVMStreamHandle stream) { ICHECK(stream == nullptr); - OPENCL_CALL(clFinish(this->GetQueue(ctx))); + OPENCL_CALL(clFinish(this->GetQueue(dev))); } -void* OpenCLWorkspace::AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) { - return GetThreadEntry()->pool.AllocWorkspace(ctx, size); +void* OpenCLWorkspace::AllocWorkspace(Device dev, size_t size, DLDataType type_hint) { + return GetThreadEntry()->pool.AllocWorkspace(dev, size); } -void OpenCLWorkspace::FreeWorkspace(TVMContext ctx, void* data) { - GetThreadEntry()->pool.FreeWorkspace(ctx, data); +void OpenCLWorkspace::FreeWorkspace(Device dev, void* data) { + GetThreadEntry()->pool.FreeWorkspace(dev, data); } typedef dmlc::ThreadLocalStore OpenCLThreadStore; diff --git a/src/runtime/opencl/opencl_module.cc b/src/runtime/opencl/opencl_module.cc index a4c61e47b376d..8c22c3c8cb235 100644 --- a/src/runtime/opencl/opencl_module.cc +++ b/src/runtime/opencl/opencl_module.cc @@ -65,7 +65,7 @@ class OpenCLWrappedFunc { for (cl_uint i = 0; i < arg_size_.size(); ++i) { OPENCL_CALL(clSetKernelArg(kernel, i, arg_size_[i], void_args[i])); } - cl_command_queue queue = w_->GetQueue(t->context); + cl_command_queue queue = w_->GetQueue(t->device); ThreadWorkLoad wl = thread_axis_cfg_.Extract(args); cl_uint work_dim = static_cast(thread_axis_cfg_.work_dim()); for (cl_uint i = 0; i < work_dim; ++i) { @@ -186,7 +186,7 @@ void OpenCLModuleNode::Init() { cl_kernel OpenCLModuleNode::InstallKernel(cl::OpenCLWorkspace* w, 
cl::OpenCLThreadEntry* t, const std::string& func_name, const KTRefEntry& e) { std::lock_guard lock(build_lock_); - int device_id = t->context.device_id; + int device_id = t->device.device_id; if (!device_built_flag_[device_id]) { // create program if (fmt_ == "cl") { diff --git a/src/runtime/opencl/sdaccel/sdaccel_common.h b/src/runtime/opencl/sdaccel/sdaccel_common.h index feeab0bc89ce1..80bc770cc0a4b 100644 --- a/src/runtime/opencl/sdaccel/sdaccel_common.h +++ b/src/runtime/opencl/sdaccel/sdaccel_common.h @@ -39,7 +39,7 @@ class SDAccelWorkspace final : public OpenCLWorkspace { public: // override OpenCL device API void Init() final; - bool IsOpenCLDevice(TVMContext ctx) final; + bool IsOpenCLDevice(Device dev) final; OpenCLThreadEntry* GetThreadEntry() final; // get the global workspace static OpenCLWorkspace* Global(); diff --git a/src/runtime/opencl/sdaccel/sdaccel_device_api.cc b/src/runtime/opencl/sdaccel/sdaccel_device_api.cc index ebe387b1ddb3c..7d4b673324a04 100644 --- a/src/runtime/opencl/sdaccel/sdaccel_device_api.cc +++ b/src/runtime/opencl/sdaccel/sdaccel_device_api.cc @@ -38,8 +38,8 @@ OpenCLWorkspace* SDAccelWorkspace::Global() { void SDAccelWorkspace::Init() { OpenCLWorkspace::Init("sdaccel", "accelerator", "Xilinx"); } -bool SDAccelWorkspace::IsOpenCLDevice(TVMContext ctx) { - return ctx.device_type == static_cast(kDLSDAccel); +bool SDAccelWorkspace::IsOpenCLDevice(Device dev) { + return dev.device_type == static_cast(kDLSDAccel); } typedef dmlc::ThreadLocalStore SDAccelThreadStore; diff --git a/src/runtime/profiling.cc b/src/runtime/profiling.cc index 3d204166986d9..6cf1034a3837b 100644 --- a/src/runtime/profiling.cc +++ b/src/runtime/profiling.cc @@ -34,30 +34,30 @@ namespace runtime { class DefaultTimerNode : public TimerNode { public: virtual void Start() { - TVMSynchronize(ctx_.device_type, ctx_.device_id, nullptr); + TVMSynchronize(device_.device_type, device_.device_id, nullptr); start_ = std::chrono::high_resolution_clock::now(); } virtual void Stop() { - TVMSynchronize(ctx_.device_type, ctx_.device_id, nullptr); + TVMSynchronize(device_.device_type, device_.device_id, nullptr); duration_ = std::chrono::high_resolution_clock::now() - start_; } virtual int64_t SyncAndGetElapsedNanos() { return duration_.count(); } virtual ~DefaultTimerNode() {} - explicit DefaultTimerNode(TVMContext ctx) : ctx_(ctx) {} + explicit DefaultTimerNode(Device dev) : device_(dev) {} static constexpr const char* _type_key = "DefaultTimerNode"; TVM_DECLARE_FINAL_OBJECT_INFO(DefaultTimerNode, TimerNode); private: std::chrono::high_resolution_clock::time_point start_; std::chrono::duration duration_; - TVMContext ctx_; + Device device_; }; TVM_REGISTER_OBJECT_TYPE(DefaultTimerNode); TVM_REGISTER_OBJECT_TYPE(TimerNode); -Timer DefaultTimer(TVMContext ctx) { return Timer(make_object(ctx)); } +Timer DefaultTimer(Device dev) { return Timer(make_object(dev)); } class CPUTimerNode : public TimerNode { public: @@ -75,18 +75,18 @@ class CPUTimerNode : public TimerNode { }; TVM_REGISTER_OBJECT_TYPE(CPUTimerNode); -TVM_REGISTER_GLOBAL("profiling.timer.cpu").set_body_typed([](TVMContext ctx) { +TVM_REGISTER_GLOBAL("profiling.timer.cpu").set_body_typed([](Device dev) { return Timer(make_object()); }); -Timer Timer::Start(TVMContext ctx) { - auto f = Registry::Get(std::string("profiling.timer.") + DeviceName(ctx.device_type)); +Timer Timer::Start(Device dev) { + auto f = Registry::Get(std::string("profiling.timer.") + DeviceName(dev.device_type)); if (f == nullptr) { - Timer t = DefaultTimer(ctx); + 
Timer t = DefaultTimer(dev); t->Start(); return t; } else { - Timer t = f->operator()(ctx); + Timer t = f->operator()(dev); t->Start(); return t; } diff --git a/src/runtime/rocm/rocm_device_api.cc b/src/runtime/rocm/rocm_device_api.cc index 5d03374a4571e..70c614551884c 100644 --- a/src/runtime/rocm/rocm_device_api.cc +++ b/src/runtime/rocm/rocm_device_api.cc @@ -36,15 +36,15 @@ namespace runtime { class ROCMDeviceAPI final : public DeviceAPI { public: - void SetDevice(TVMContext ctx) final { ROCM_CALL(hipSetDevice(ctx.device_id)); } - void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final { + void SetDevice(Device dev) final { ROCM_CALL(hipSetDevice(dev.device_id)); } + void GetAttr(Device device, DeviceAttrKind kind, TVMRetValue* rv) final { int value = 0; switch (kind) { case kExist: { if (hsa_init() == HSA_STATUS_SUCCESS) { int dev; ROCM_CALL(hipGetDeviceCount(&dev)); - value = dev > ctx.device_id ? 1 : 0; + value = dev > device.device_id ? 1 : 0; hsa_shut_down(); } else { value = 0; @@ -53,50 +53,53 @@ class ROCMDeviceAPI final : public DeviceAPI { } case kMaxThreadsPerBlock: { ROCM_CALL( - hipDeviceGetAttribute(&value, hipDeviceAttributeMaxThreadsPerBlock, ctx.device_id)); + hipDeviceGetAttribute(&value, hipDeviceAttributeMaxThreadsPerBlock, device.device_id)); break; } case kWarpSize: { - ROCM_CALL(hipDeviceGetAttribute(&value, hipDeviceAttributeWarpSize, ctx.device_id)); + ROCM_CALL(hipDeviceGetAttribute(&value, hipDeviceAttributeWarpSize, device.device_id)); break; } case kMaxSharedMemoryPerBlock: { ROCM_CALL(hipDeviceGetAttribute(&value, hipDeviceAttributeMaxSharedMemoryPerBlock, - ctx.device_id)); + device.device_id)); break; } case kComputeVersion: { std::ostringstream os; - ROCM_CALL( - hipDeviceGetAttribute(&value, hipDeviceAttributeComputeCapabilityMajor, ctx.device_id)); + ROCM_CALL(hipDeviceGetAttribute(&value, hipDeviceAttributeComputeCapabilityMajor, + device.device_id)); os << value << "."; - ROCM_CALL( - hipDeviceGetAttribute(&value, hipDeviceAttributeComputeCapabilityMinor, ctx.device_id)); + ROCM_CALL(hipDeviceGetAttribute(&value, hipDeviceAttributeComputeCapabilityMinor, + device.device_id)); os << value; *rv = os.str(); return; } case kDeviceName: { std::string name(256, 0); - ROCM_CALL(hipDeviceGetName(&name[0], name.size(), ctx.device_id)); + ROCM_CALL(hipDeviceGetName(&name[0], name.size(), device.device_id)); name.resize(strlen(name.c_str())); *rv = std::move(name); return; } case kMaxClockRate: { - ROCM_CALL(hipDeviceGetAttribute(&value, hipDeviceAttributeClockRate, ctx.device_id)); + ROCM_CALL(hipDeviceGetAttribute(&value, hipDeviceAttributeClockRate, device.device_id)); break; } case kMultiProcessorCount: { ROCM_CALL( - hipDeviceGetAttribute(&value, hipDeviceAttributeMultiprocessorCount, ctx.device_id)); + hipDeviceGetAttribute(&value, hipDeviceAttributeMultiprocessorCount, device.device_id)); break; } case kMaxThreadDimensions: { int dims[3]; - ROCM_CALL(hipDeviceGetAttribute(&dims[0], hipDeviceAttributeMaxBlockDimX, ctx.device_id)); - ROCM_CALL(hipDeviceGetAttribute(&dims[1], hipDeviceAttributeMaxBlockDimY, ctx.device_id)); - ROCM_CALL(hipDeviceGetAttribute(&dims[2], hipDeviceAttributeMaxBlockDimZ, ctx.device_id)); + ROCM_CALL( + hipDeviceGetAttribute(&dims[0], hipDeviceAttributeMaxBlockDimX, device.device_id)); + ROCM_CALL( + hipDeviceGetAttribute(&dims[1], hipDeviceAttributeMaxBlockDimY, device.device_id)); + ROCM_CALL( + hipDeviceGetAttribute(&dims[2], hipDeviceAttributeMaxBlockDimZ, device.device_id)); std::stringstream ss; ss << "[" 
<< dims[0] << ", " << dims[1] << ", " << dims[2] << "]"; @@ -104,12 +107,12 @@ class ROCMDeviceAPI final : public DeviceAPI { return; } case kMaxRegistersPerBlock: - ROCM_CALL( - hipDeviceGetAttribute(&value, hipDeviceAttributeMaxRegistersPerBlock, ctx.device_id)); + ROCM_CALL(hipDeviceGetAttribute(&value, hipDeviceAttributeMaxRegistersPerBlock, + device.device_id)); break; case kGcnArch: { hipDeviceProp_t prop; - ROCM_CALL(hipGetDeviceProperties(&prop, ctx.device_id)); + ROCM_CALL(hipGetDeviceProperties(&prop, device.device_id)); *rv = prop.gcnArch; return; } @@ -120,59 +123,58 @@ class ROCMDeviceAPI final : public DeviceAPI { } *rv = value; } - void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, - DLDataType type_hint) final { - ROCM_CALL(hipSetDevice(ctx.device_id)); + void* AllocDataSpace(Device dev, size_t nbytes, size_t alignment, DLDataType type_hint) final { + ROCM_CALL(hipSetDevice(dev.device_id)); ICHECK_EQ(256 % alignment, 0U) << "ROCM space is aligned at 256 bytes"; void* ret; ROCM_CALL(hipMalloc(&ret, nbytes)); return ret; } - void FreeDataSpace(TVMContext ctx, void* ptr) final { - ROCM_CALL(hipSetDevice(ctx.device_id)); + void FreeDataSpace(Device dev, void* ptr) final { + ROCM_CALL(hipSetDevice(dev.device_id)); ROCM_CALL(hipFree(ptr)); } void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, - TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, + Device dev_from, Device dev_to, DLDataType type_hint, TVMStreamHandle stream) final { hipStream_t hip_stream = static_cast(stream); from = static_cast(from) + from_offset; to = static_cast(to) + to_offset; - if (ctx_from.device_type == kDLROCM && ctx_to.device_type == kDLROCM) { - ROCM_CALL(hipSetDevice(ctx_from.device_id)); - if (ctx_from.device_id == ctx_to.device_id) { + if (dev_from.device_type == kDLROCM && dev_to.device_type == kDLROCM) { + ROCM_CALL(hipSetDevice(dev_from.device_id)); + if (dev_from.device_id == dev_to.device_id) { GPUCopy(from, to, size, hipMemcpyDeviceToDevice, hip_stream); } else { - hipMemcpyPeerAsync(to, ctx_to.device_id, from, ctx_from.device_id, size, hip_stream); + hipMemcpyPeerAsync(to, dev_to.device_id, from, dev_from.device_id, size, hip_stream); } - } else if (ctx_from.device_type == kDLROCM && ctx_to.device_type == kDLCPU) { - ROCM_CALL(hipSetDevice(ctx_from.device_id)); + } else if (dev_from.device_type == kDLROCM && dev_to.device_type == kDLCPU) { + ROCM_CALL(hipSetDevice(dev_from.device_id)); GPUCopy(from, to, size, hipMemcpyDeviceToHost, hip_stream); - } else if (ctx_from.device_type == kDLCPU && ctx_to.device_type == kDLROCM) { - ROCM_CALL(hipSetDevice(ctx_to.device_id)); + } else if (dev_from.device_type == kDLCPU && dev_to.device_type == kDLROCM) { + ROCM_CALL(hipSetDevice(dev_to.device_id)); GPUCopy(from, to, size, hipMemcpyHostToDevice, hip_stream); } else { LOG(FATAL) << "expect copy from/to GPU or between GPU"; } } - void StreamSync(TVMContext ctx, TVMStreamHandle stream) final { - ROCM_CALL(hipSetDevice(ctx.device_id)); + void StreamSync(Device dev, TVMStreamHandle stream) final { + ROCM_CALL(hipSetDevice(dev.device_id)); ROCM_CALL(hipStreamSynchronize(static_cast(stream))); } - void SetStream(TVMContext ctx, TVMStreamHandle stream) final { + void SetStream(Device dev, TVMStreamHandle stream) final { ROCMThreadEntry::ThreadLocal()->stream = static_cast(stream); } - void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final { - return ROCMThreadEntry::ThreadLocal()->pool.AllocWorkspace(ctx, size); 
+ void* AllocWorkspace(Device dev, size_t size, DLDataType type_hint) final { + return ROCMThreadEntry::ThreadLocal()->pool.AllocWorkspace(dev, size); } - void FreeWorkspace(TVMContext ctx, void* data) final { - ROCMThreadEntry::ThreadLocal()->pool.FreeWorkspace(ctx, data); + void FreeWorkspace(Device dev, void* data) final { + ROCMThreadEntry::ThreadLocal()->pool.FreeWorkspace(dev, data); } static ROCMDeviceAPI* Global() { @@ -233,7 +235,7 @@ class ROCMTimerNode : public TimerNode { TVM_REGISTER_OBJECT_TYPE(ROCMTimerNode); -TVM_REGISTER_GLOBAL("profiling.timer.rocm").set_body_typed([](TVMContext ctx) { +TVM_REGISTER_GLOBAL("profiling.timer.rocm").set_body_typed([](Device dev) { return Timer(make_object()); }); diff --git a/src/runtime/rpc/rpc_device_api.cc b/src/runtime/rpc/rpc_device_api.cc index cdeeb368f5a27..1d6fb85d94955 100644 --- a/src/runtime/rpc/rpc_device_api.cc +++ b/src/runtime/rpc/rpc_device_api.cc @@ -33,45 +33,44 @@ namespace runtime { class RPCDeviceAPI final : public DeviceAPI { public: - void SetDevice(TVMContext ctx) final { - auto remote_ctx = RemoveRPCSessionMask(ctx); - GetSess(ctx)->GetDeviceAPI(remote_ctx)->SetDevice(remote_ctx); + void SetDevice(Device dev) final { + auto remote_dev = RemoveRPCSessionMask(dev); + GetSess(dev)->GetDeviceAPI(remote_dev)->SetDevice(remote_dev); } - void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final { - auto remote_ctx = RemoveRPCSessionMask(ctx); - GetSess(ctx)->GetDeviceAPI(remote_ctx)->GetAttr(remote_ctx, kind, rv); + void GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) final { + auto remote_dev = RemoveRPCSessionMask(dev); + GetSess(dev)->GetDeviceAPI(remote_dev)->GetAttr(remote_dev, kind, rv); } - void* AllocDataSpace(TVMContext ctx, int ndim, const int64_t* shape, DLDataType dtype, + void* AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype, Optional mem_scope) final { - auto sess = GetSess(ctx); - auto remote_ctx = RemoveRPCSessionMask(ctx); + auto sess = GetSess(dev); + auto remote_dev = RemoveRPCSessionMask(dev); void* data = - sess->GetDeviceAPI(remote_ctx)->AllocDataSpace(remote_ctx, ndim, shape, dtype, mem_scope); + sess->GetDeviceAPI(remote_dev)->AllocDataSpace(remote_dev, ndim, shape, dtype, mem_scope); RemoteSpace* space = new RemoteSpace(); space->data = data; space->sess = std::move(sess); return space; } - void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, - DLDataType type_hint) final { - auto sess = GetSess(ctx); - auto remote_ctx = RemoveRPCSessionMask(ctx); + void* AllocDataSpace(Device dev, size_t nbytes, size_t alignment, DLDataType type_hint) final { + auto sess = GetSess(dev); + auto remote_dev = RemoveRPCSessionMask(dev); void* data = - sess->GetDeviceAPI(remote_ctx)->AllocDataSpace(remote_ctx, nbytes, alignment, type_hint); + sess->GetDeviceAPI(remote_dev)->AllocDataSpace(remote_dev, nbytes, alignment, type_hint); RemoteSpace* space = new RemoteSpace(); space->data = data; space->sess = std::move(sess); return space; } - void FreeDataSpace(TVMContext ctx, void* ptr) final { + void FreeDataSpace(Device dev, void* ptr) final { RemoteSpace* space = static_cast(ptr); - auto remote_ctx = RemoveRPCSessionMask(ctx); + auto remote_dev = RemoveRPCSessionMask(dev); try { - GetSess(ctx)->GetDeviceAPI(remote_ctx)->FreeDataSpace(remote_ctx, space->data); + GetSess(dev)->GetDeviceAPI(remote_dev)->FreeDataSpace(remote_dev, space->data); } catch (const Error& e) { // fault tolerance to remote close. 
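Every call in this RPC device API strips a session mask off the incoming device before reaching a real DeviceAPI. The convention behind IsRPCSessionDevice, GetRPCSessionIndex, and RemoveRPCSessionMask is sketched below, under the assumption (taken from TVM's device_api.h and reproduced here purely for illustration) that the mask stride is 128:

    #include <dlpack/dlpack.h>

    // Assumed encoding: masked_type = real_type + 128 * (session_index + 1).
    constexpr int kRPCSessMaskSketch = 128;

    inline bool IsRPCSessionDeviceSketch(DLDevice dev) {
      return static_cast<int>(dev.device_type) / kRPCSessMaskSketch > 0;
    }
    inline int GetRPCSessionIndexSketch(DLDevice dev) {
      return static_cast<int>(dev.device_type) / kRPCSessMaskSketch - 1;
    }
    inline DLDevice RemoveRPCSessionMaskSketch(DLDevice dev) {
      dev.device_type = static_cast<DLDeviceType>(
          static_cast<int>(dev.device_type) % kRPCSessMaskSketch);
      return dev;
    }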
 }
@@ -79,54 +78,54 @@ class RPCDeviceAPI final : public DeviceAPI {
   }

   void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) final {
-    DLContext ctx_from = from->ctx;
-    DLContext ctx_to = to->ctx;
-    if (IsRPCSessionContext(ctx_from) && IsRPCSessionContext(ctx_to)) {
-      ICHECK(ctx_from.device_type == ctx_to.device_type)
+    DLDevice dev_from = from->device;
+    DLDevice dev_to = to->device;
+    if (IsRPCSessionDevice(dev_from) && IsRPCSessionDevice(dev_to)) {
+      ICHECK(dev_from.device_type == dev_to.device_type)
           << "Cannot copy across two different remote sessions";
       DLTensor from_tensor = *from;
-      from_tensor.ctx = RemoveRPCSessionMask(ctx_from);
+      from_tensor.device = RemoveRPCSessionMask(dev_from);
       from_tensor.data = static_cast(from->data)->data;
       DLTensor to_tensor = *to;
-      to_tensor.ctx = RemoveRPCSessionMask(ctx_to);
+      to_tensor.device = RemoveRPCSessionMask(dev_to);
       to_tensor.data = static_cast(to->data)->data;
-      auto remote_ctx = from_tensor.ctx;
-      if (remote_ctx.device_type == kDLCPU) remote_ctx = to_tensor.ctx;
-      GetSess(ctx_from)->GetDeviceAPI(remote_ctx)->CopyDataFromTo(&from_tensor, &to_tensor, stream);
-    } else if (IsRPCSessionContext(ctx_from) && ctx_to.device_type == kDLCPU) {
+      auto remote_dev = from_tensor.device;
+      if (remote_dev.device_type == kDLCPU) remote_dev = to_tensor.device;
+      GetSess(dev_from)->GetDeviceAPI(remote_dev)->CopyDataFromTo(&from_tensor, &to_tensor, stream);
+    } else if (IsRPCSessionDevice(dev_from) && dev_to.device_type == kDLCPU) {
       DLTensor from_tensor = *from;
-      from_tensor.ctx = RemoveRPCSessionMask(ctx_from);
+      from_tensor.device = RemoveRPCSessionMask(dev_from);
       from_tensor.data = static_cast(from->data)->data;
       void* to_bytes = static_cast(to->data) + to->byte_offset;
       size_t nbytes = GetDataSize(*to);
-      GetSess(ctx_from)->CopyFromRemote(&from_tensor, to_bytes, nbytes);
-    } else if (ctx_from.device_type == kDLCPU && IsRPCSessionContext(ctx_to)) {
+      GetSess(dev_from)->CopyFromRemote(&from_tensor, to_bytes, nbytes);
+    } else if (dev_from.device_type == kDLCPU && IsRPCSessionDevice(dev_to)) {
       DLTensor to_tensor = *to;
-      to_tensor.ctx = RemoveRPCSessionMask(ctx_to);
+      to_tensor.device = RemoveRPCSessionMask(dev_to);
       to_tensor.data = static_cast(to->data)->data;
       void* from_bytes = static_cast(from->data) + from->byte_offset;
       size_t nbytes = GetDataSize(*from);
-      GetSess(ctx_to)->CopyToRemote(from_bytes, &to_tensor, nbytes);
+      GetSess(dev_to)->CopyToRemote(from_bytes, &to_tensor, nbytes);
     } else {
       LOG(FATAL) << "expect copy from/to remote or between remote";
     }
   }

-  void StreamSync(TVMContext ctx, TVMStreamHandle stream) final {
-    auto remote_ctx = RemoveRPCSessionMask(ctx);
-    GetSess(ctx)->GetDeviceAPI(remote_ctx)->StreamSync(remote_ctx, stream);
+  void StreamSync(Device dev, TVMStreamHandle stream) final {
+    auto remote_dev = RemoveRPCSessionMask(dev);
+    GetSess(dev)->GetDeviceAPI(remote_dev)->StreamSync(remote_dev, stream);
   }

 protected:
   void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset,
-                      size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to,
-                      DLDataType type_hint, TVMStreamHandle stream) final {
+                      size_t num_bytes, Device dev_from, Device dev_to, DLDataType type_hint,
+                      TVMStreamHandle stream) final {
     LOG(FATAL) << "Not implemented.";
   }

 private:
-  std::shared_ptr GetSess(TVMContext ctx) {
-    int tbl_index = GetRPCSessionIndex(ctx);
+  std::shared_ptr GetSess(Device dev) {
+    int tbl_index = GetRPCSessionIndex(dev);
     return RPCSession::Get(tbl_index);
   }
 };
diff --git a/src/runtime/rpc/rpc_endpoint.cc
b/src/runtime/rpc/rpc_endpoint.cc index 5e2bba88921e8..b5768146b3f76 100644 --- a/src/runtime/rpc/rpc_endpoint.cc +++ b/src/runtime/rpc/rpc_endpoint.cc @@ -176,10 +176,9 @@ class RPCEndpoint::EventHandler : public dmlc::Stream { if (tcode == kTVMObjectHandle || tcode == kTVMObjectRValueRefArg) { LOG(FATAL) << "ValueError: Cannot pass argument " << i << ", type " << args[i].AsObjectRef()->GetTypeKey() << " is not supported by RPC"; - } else if (tcode == kTVMContext) { - DLContext ctx = args[i]; - ICHECK(!IsRPCSessionContext(ctx)) - << "InternalError: cannot pass RPC context in the channel"; + } else if (tcode == kDLDevice) { + DLDevice dev = args[i]; + ICHECK(!IsRPCSessionDevice(dev)) << "InternalError: cannot pass RPC device in the channel"; } } } @@ -405,7 +404,7 @@ class RPCEndpoint::EventHandler : public dmlc::Stream { // When session is local, we can directly treat handle // as the cpu pointer without allocating a temp space. - if (arr->ctx.device_type == kDLCPU && sess->IsLocalSession() && DMLC_IO_NO_ENDIAN_SWAP) { + if (arr->device.device_type == kDLCPU && sess->IsLocalSession() && DMLC_IO_NO_ENDIAN_SWAP) { char* data_ptr = reinterpret_cast(arr->data) + arr->byte_offset; fcopyack(data_ptr, data_bytes); } else { @@ -438,7 +437,7 @@ class RPCEndpoint::EventHandler : public dmlc::Stream { // When session is local, we can directly treat handle // as the cpu pointer without allocating a temp space. - if (arr->ctx.device_type == kDLCPU && sess->IsLocalSession()) { + if (arr->device.device_type == kDLCPU && sess->IsLocalSession()) { char* dptr = reinterpret_cast(arr->data) + arr->byte_offset; this->ReadArray(dptr, data_bytes); @@ -550,11 +549,11 @@ class RPCEndpoint::EventHandler : public dmlc::Stream { void HandleSyscallStreamSync() { TVMArgs args = RecvPackedSeq(); try { - TVMContext ctx = args[0]; + Device dev = args[0]; TVMStreamHandle handle = args[1]; this->SwitchToState(kWaitForAsyncCallback); - GetServingSession()->AsyncStreamWait(ctx, handle, [this](RPCCode status, TVMArgs args) { + GetServingSession()->AsyncStreamWait(dev, handle, [this](RPCCode status, TVMArgs args) { if (status == RPCCode::kException) { this->ReturnException(args.values[0].v_str); } else { @@ -807,7 +806,7 @@ void RPCEndpoint::CopyToRemote(void* from_bytes, DLTensor* to, uint64_t nbytes) uint64_t to_data = reinterpret_cast(to->data); uint64_t shape_bytes = to->ndim * sizeof(int64_t); - uint64_t packet_nbytes = sizeof(code) + sizeof(to_data) + sizeof(to->ctx) + sizeof(to->ndim) + + uint64_t packet_nbytes = sizeof(code) + sizeof(to_data) + sizeof(to->device) + sizeof(to->ndim) + sizeof(to->dtype) + sizeof(to->byte_offset) + shape_bytes + sizeof(nbytes) + num_data_bytes; @@ -828,7 +827,7 @@ void RPCEndpoint::CopyFromRemote(DLTensor* from, void* to_bytes, uint64_t nbytes uint64_t from_data = reinterpret_cast(from->data); uint64_t shape_bytes = from->ndim * sizeof(int64_t); - uint64_t packet_nbytes = sizeof(code) + sizeof(from_data) + sizeof(from->ctx) + + uint64_t packet_nbytes = sizeof(code) + sizeof(from_data) + sizeof(from->device) + sizeof(from->ndim) + sizeof(from->dtype) + sizeof(from->byte_offset) + shape_bytes + sizeof(nbytes); @@ -855,37 +854,37 @@ void RPCFreeHandle(RPCSession* handler, TVMArgs args, TVMRetValue* rv) { } void RPCDevSetDevice(RPCSession* handler, TVMArgs args, TVMRetValue* rv) { - TVMContext ctx = args[0]; - handler->GetDeviceAPI(ctx)->SetDevice(ctx); + Device dev = args[0]; + handler->GetDeviceAPI(dev)->SetDevice(dev); } void RPCDevGetAttr(RPCSession* handler, TVMArgs args, TVMRetValue* 
rv) { - TVMContext ctx = args[0]; + Device dev = args[0]; DeviceAttrKind kind = static_cast(args[1].operator int()); if (kind == kExist) { - DeviceAPI* api = handler->GetDeviceAPI(ctx, true); + DeviceAPI* api = handler->GetDeviceAPI(dev, true); if (api != nullptr) { - api->GetAttr(ctx, kind, rv); + api->GetAttr(dev, kind, rv); } else { *rv = 0; } } else { - handler->GetDeviceAPI(ctx)->GetAttr(ctx, static_cast(kind), rv); + handler->GetDeviceAPI(dev)->GetAttr(dev, static_cast(kind), rv); } } void RPCDevAllocData(RPCSession* handler, TVMArgs args, TVMRetValue* rv) { - TVMContext ctx = args[0]; + Device dev = args[0]; uint64_t nbytes = args[1]; uint64_t alignment = args[2]; DLDataType type_hint = args[3]; - void* data = handler->GetDeviceAPI(ctx)->AllocDataSpace(ctx, nbytes, alignment, type_hint); + void* data = handler->GetDeviceAPI(dev)->AllocDataSpace(dev, nbytes, alignment, type_hint); *rv = data; } void RPCDevAllocDataWithScope(RPCSession* handler, TVMArgs args, TVMRetValue* rv) { DLTensor* arr = args[0]; - TVMContext ctx = arr->ctx; + Device dev = arr->device; int ndim = arr->ndim; int64_t* shape = arr->shape; DLDataType dtype = arr->dtype; @@ -896,14 +895,14 @@ void RPCDevAllocDataWithScope(RPCSession* handler, TVMArgs args, TVMRetValue* rv } else { ICHECK_EQ(tcode, kTVMNullptr); } - void* data = handler->GetDeviceAPI(ctx)->AllocDataSpace(ctx, ndim, shape, dtype, mem_scope); + void* data = handler->GetDeviceAPI(dev)->AllocDataSpace(dev, ndim, shape, dtype, mem_scope); *rv = data; } void RPCDevFreeData(RPCSession* handler, TVMArgs args, TVMRetValue* rv) { - TVMContext ctx = args[0]; + Device dev = args[0]; void* ptr = args[1]; - handler->GetDeviceAPI(ctx)->FreeDataSpace(ctx, ptr); + handler->GetDeviceAPI(dev)->FreeDataSpace(dev, ptr); } void RPCCopyAmongRemote(RPCSession* handler, TVMArgs args, TVMRetValue* rv) { @@ -911,14 +910,14 @@ void RPCCopyAmongRemote(RPCSession* handler, TVMArgs args, TVMRetValue* rv) { DLTensor* to = args[1]; TVMStreamHandle stream = args[2]; - TVMContext ctx = from->ctx; - if (ctx.device_type == kDLCPU) { - ctx = to->ctx; + Device dev = from->device; + if (dev.device_type == kDLCPU) { + dev = to->device; } else { - ICHECK(to->ctx.device_type == kDLCPU || to->ctx.device_type == from->ctx.device_type) - << "Can not copy across different ctx types directly"; + ICHECK(to->device.device_type == kDLCPU || to->device.device_type == from->device.device_type) + << "Can not copy across different dev types directly"; } - handler->GetDeviceAPI(ctx)->CopyDataFromTo(from, to, stream); + handler->GetDeviceAPI(dev)->CopyDataFromTo(from, to, stream); } void RPCEndpoint::EventHandler::HandleSyscall(RPCCode code) { @@ -993,27 +992,26 @@ class RPCClientSession : public RPCSession, public DeviceAPI { endpoint_->SysCallRemote(RPCCode::kFreeHandle, handle, type_code); } - void SetDevice(TVMContext ctx) final { endpoint_->SysCallRemote(RPCCode::kDevSetDevice, ctx); } + void SetDevice(Device dev) final { endpoint_->SysCallRemote(RPCCode::kDevSetDevice, dev); } - void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final { - if (ctx.device_type == kDLCPU && kind == kExist) { + void GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) final { + if (dev.device_type == kDLCPU && kind == kExist) { // cpu always exists. 
*rv = 1; } else { - *rv = endpoint_->SysCallRemote(RPCCode::kDevGetAttr, ctx, static_cast(kind)); + *rv = endpoint_->SysCallRemote(RPCCode::kDevGetAttr, dev, static_cast(kind)); } } - void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, - DLDataType type_hint) final { - return endpoint_->SysCallRemote(RPCCode::kDevAllocData, ctx, nbytes, alignment, type_hint); + void* AllocDataSpace(Device dev, size_t nbytes, size_t alignment, DLDataType type_hint) final { + return endpoint_->SysCallRemote(RPCCode::kDevAllocData, dev, nbytes, alignment, type_hint); } - void* AllocDataSpace(TVMContext ctx, int ndim, const int64_t* shape, DLDataType dtype, + void* AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype, Optional mem_scope) final { DLTensor temp; temp.data = nullptr; - temp.ctx = ctx; + temp.device = dev; temp.ndim = ndim; temp.dtype = dtype; temp.shape = const_cast(shape); @@ -1027,19 +1025,19 @@ class RPCClientSession : public RPCSession, public DeviceAPI { } } - void FreeDataSpace(TVMContext ctx, void* ptr) final { - endpoint_->SysCallRemote(RPCCode::kDevFreeData, ctx, ptr); + void FreeDataSpace(Device dev, void* ptr) final { + endpoint_->SysCallRemote(RPCCode::kDevFreeData, dev, ptr); } void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) final { endpoint_->SysCallRemote(RPCCode::kCopyAmongRemote, from, to, stream); } - void StreamSync(TVMContext ctx, TVMStreamHandle stream) final { - endpoint_->SysCallRemote(RPCCode::kDevStreamSync, ctx, stream); + void StreamSync(Device dev, TVMStreamHandle stream) final { + endpoint_->SysCallRemote(RPCCode::kDevStreamSync, dev, stream); } - DeviceAPI* GetDeviceAPI(TVMContext ctx, bool allow_missing) final { return this; } + DeviceAPI* GetDeviceAPI(Device dev, bool allow_missing) final { return this; } bool IsLocalSession() const final { return false; } diff --git a/src/runtime/rpc/rpc_endpoint.h b/src/runtime/rpc/rpc_endpoint.h index 8e08bfa75623a..cd3c9b2bec72f 100644 --- a/src/runtime/rpc/rpc_endpoint.h +++ b/src/runtime/rpc/rpc_endpoint.h @@ -132,7 +132,7 @@ class RPCEndpoint { * \param to The target array. * \param to_offset The byte offset in the to. * \param nbytes The size of the memory in bytes. - * \param ctx_to The target context. + * \param dev_to The target device. * \param type_hint Hint of content data type. */ void CopyToRemote(void* from_bytes, DLTensor* to, uint64_t nbytes); @@ -143,7 +143,7 @@ class RPCEndpoint { * \param to The target array. * \param to_offset The byte offset in the to. * \param nbytes The size of the memory in bytes. - * \param ctx_from The source context. + * \param dev_from The source device. * \param type_hint Hint of content data type. 
 */
  void CopyFromRemote(DLTensor* from, void* to_bytes, uint64_t nbytes);
diff --git a/src/runtime/rpc/rpc_local_session.cc b/src/runtime/rpc/rpc_local_session.cc
index 0650b55d0d7ce..4b1c1f7fe9988 100644
--- a/src/runtime/rpc/rpc_local_session.cc
+++ b/src/runtime/rpc/rpc_local_session.cc
@@ -91,35 +91,35 @@ void LocalSession::CopyToRemote(void* from_bytes, DLTensor* to, uint64_t nbytes)
   ICHECK_EQ(nbytes, GetDataSize(*to));
   DLTensor from;
   from.data = from_bytes;
-  from.ctx = {kDLCPU, 0};
+  from.device = {kDLCPU, 0};
   from.ndim = to->ndim;
   from.shape = to->shape;
   from.dtype = to->dtype;
   from.strides = nullptr;
   from.byte_offset = 0;
-  TVMContext ctx_to = to->ctx;
-  this->GetDeviceAPI(ctx_to)->CopyDataFromTo(&from, to, nullptr);
+  Device dev_to = to->device;
+  this->GetDeviceAPI(dev_to)->CopyDataFromTo(&from, to, nullptr);
   // Copy can happen asynchronously
   // synchronize to make sure that copy is completed
-  this->GetDeviceAPI(ctx_to)->StreamSync(ctx_to, nullptr);
+  this->GetDeviceAPI(dev_to)->StreamSync(dev_to, nullptr);
 }

 void LocalSession::CopyFromRemote(DLTensor* from, void* to_bytes, uint64_t nbytes) {
   ICHECK_EQ(nbytes, GetDataSize(*from));
   DLTensor to;
   to.data = to_bytes;
-  to.ctx = {kDLCPU, 0};
+  to.device = {kDLCPU, 0};
   to.ndim = from->ndim;
   to.shape = from->shape;
   to.dtype = from->dtype;
   to.strides = nullptr;
   to.byte_offset = 0;
-  TVMContext ctx_from = from->ctx;
-  this->GetDeviceAPI(ctx_from)->CopyDataFromTo(from, &to, nullptr);
+  Device dev_from = from->device;
+  this->GetDeviceAPI(dev_from)->CopyDataFromTo(from, &to, nullptr);
   // Copy can happen asynchronously
   // synchronize to make sure that copy is completed
-  this->GetDeviceAPI(ctx_from)->StreamSync(ctx_from, nullptr);
+  this->GetDeviceAPI(dev_from)->StreamSync(dev_from, nullptr);
 }

 void LocalSession::FreeHandle(void* handle, int type_code) {
@@ -129,8 +129,8 @@ void LocalSession::FreeHandle(void* handle, int type_code) {
   TVMRetValue rv = TVMRetValue::MoveFromCHost(value, type_code);
 }

-DeviceAPI* LocalSession::GetDeviceAPI(TVMContext ctx, bool allow_missing) {
-  return DeviceAPI::Get(ctx, allow_missing);
+DeviceAPI* LocalSession::GetDeviceAPI(Device dev, bool allow_missing) {
+  return DeviceAPI::Get(dev, allow_missing);
 }

 TVM_REGISTER_GLOBAL("rpc.LocalSession").set_body_typed([]() {
diff --git a/src/runtime/rpc/rpc_local_session.h b/src/runtime/rpc/rpc_local_session.h
index ea070e34bd355..d1b54d5be65bf 100644
--- a/src/runtime/rpc/rpc_local_session.h
+++ b/src/runtime/rpc/rpc_local_session.h
@@ -54,7 +54,7 @@ class LocalSession : public RPCSession {

   void FreeHandle(void* handle, int type_code) override;

-  DeviceAPI* GetDeviceAPI(TVMContext ctx, bool allow_missing = false) override;
+  DeviceAPI* GetDeviceAPI(Device dev, bool allow_missing = false) override;

   bool IsLocalSession() const override { return true; }

diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc
index 46e1be7945204..7db84862604fe 100644
--- a/src/runtime/rpc/rpc_module.cc
+++ b/src/runtime/rpc/rpc_module.cc
@@ -56,21 +56,21 @@ static void RemoteNDArrayDeleter(Object* obj) {
 * underlying DLTensor.
 * \param template_tensor An empty DLTensor whose shape and dtype fields are used to fill the newly
 * created array. Needed because it's difficult to pass a shape vector as a PackedFunc arg.
- * \param ctx Remote context used with this tensor. Must have non-zero RPCSessMask.
+ * \param dev Remote device used with this tensor. Must have non-zero RPCSessMask.
 * \param remote_ndarray_handle The handle returned by the RPC server to identify the NDArray.
*/ NDArray NDArrayFromRemoteOpaqueHandle(std::shared_ptr sess, void* handle, - DLTensor* template_tensor, TVMContext ctx, + DLTensor* template_tensor, Device dev, void* remote_ndarray_handle) { - ICHECK_EQ(sess->table_index(), GetRPCSessionIndex(ctx)) - << "The TVMContext given does not belong to the given session"; + ICHECK_EQ(sess->table_index(), GetRPCSessionIndex(dev)) + << "The Device given does not belong to the given session"; RemoteSpace* space = new RemoteSpace(); space->sess = sess; space->data = handle; std::vector shape_vec{template_tensor->shape, template_tensor->shape + template_tensor->ndim}; NDArray::Container* data = new NDArray::Container(static_cast(space), std::move(shape_vec), - template_tensor->dtype, ctx); + template_tensor->dtype, dev); data->manager_ctx = remote_ndarray_handle; data->SetDeleter(RemoteNDArrayDeleter); return NDArray(GetObjectPtr(data)); @@ -106,14 +106,14 @@ class RPCWrappedFunc : public Object { type_codes[i] = kTVMDLTensorHandle; // translate to a remote view of DLTensor auto dptr = std::make_unique(*static_cast(values[i].v_handle)); - dptr->ctx = RemoveSessMask(dptr->ctx); + dptr->device = RemoveSessMask(dptr->device); dptr->data = static_cast(dptr->data)->data; values[i].v_handle = dptr.get(); temp_dltensors.emplace_back(std::move(dptr)); break; } - case kTVMContext: { - values[i].v_ctx = RemoveSessMask(values[i].v_ctx); + case kDLDevice: { + values[i].v_device = RemoveSessMask(values[i].v_device); break; } case kTVMPackedFuncHandle: @@ -147,11 +147,11 @@ class RPCWrappedFunc : public Object { void WrapRemoteReturnToValue(TVMArgs args, TVMRetValue* rv) const; // remove a remote session mask - TVMContext RemoveSessMask(TVMContext ctx) const { - ICHECK(IsRPCSessionContext(ctx)) << "Can not pass in local context"; - ICHECK_EQ(GetRPCSessionIndex(ctx), sess_->table_index()) - << "Can not pass in context with a different remote session"; - return RemoveRPCSessionMask(ctx); + Device RemoveSessMask(Device dev) const { + ICHECK(IsRPCSessionDevice(dev)) << "Can not pass in local device"; + ICHECK_EQ(GetRPCSessionIndex(dev), sess_->table_index()) + << "Can not pass in device with a different remote session"; + return RemoveRPCSessionMask(dev); } }; @@ -188,21 +188,21 @@ class RPCModuleNode final : public ModuleNode { return ""; } - PackedFunc GetTimeEvaluator(const std::string& name, TVMContext ctx, int number, int repeat, + PackedFunc GetTimeEvaluator(const std::string& name, Device dev, int number, int repeat, int min_repeat_ms, const std::string& f_preproc_name) { InitRemoteFunc(&remote_get_time_evaluator_, "runtime.RPCTimeEvaluator"); - // Remove session mask because we pass ctx by parts. - ICHECK_EQ(GetRPCSessionIndex(ctx), sess_->table_index()) - << "ValueError: Need to pass the matched remote context to RPCModule.GetTimeEvaluator"; - ctx = RemoveRPCSessionMask(ctx); + // Remove session mask because we pass dev by parts. 
+ ICHECK_EQ(GetRPCSessionIndex(dev), sess_->table_index()) + << "ValueError: Need to pass the matched remote device to RPCModule.GetTimeEvaluator"; + dev = RemoveRPCSessionMask(dev); if (module_handle_ != nullptr) { return remote_get_time_evaluator_(GetRef(this), name, - static_cast(ctx.device_type), ctx.device_id, number, + static_cast(dev.device_type), dev.device_id, number, repeat, min_repeat_ms, f_preproc_name); } else { return remote_get_time_evaluator_(Optional(nullptr), name, - static_cast(ctx.device_type), ctx.device_id, number, + static_cast(dev.device_type), dev.device_id, number, repeat, min_repeat_ms, f_preproc_name); } } @@ -286,7 +286,7 @@ void RPCWrappedFunc::WrapRemoteReturnToValue(TVMArgs args, TVMRetValue* rv) cons DLTensor* tensor = args[1]; void* nd_handle = args[2]; *rv = NDArrayFromRemoteOpaqueHandle(sess_, tensor->data, tensor, - AddRPCSessionMask(tensor->ctx, sess_->table_index()), + AddRPCSessionMask(tensor->device, sess_->table_index()), nd_handle); } else { ICHECK_EQ(args.size(), 2); @@ -342,24 +342,24 @@ inline void CPUCacheFlush(int begin_index, const TVMArgs& args) { } } -PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repeat, - int min_repeat_ms, PackedFunc f_preproc) { +PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat, int min_repeat_ms, + PackedFunc f_preproc) { ICHECK(pf != nullptr); - if (static_cast(ctx.device_type) == static_cast(kDLMicroDev)) { + if (static_cast(dev.device_type) == static_cast(kDLMicroDev)) { auto get_micro_time_evaluator = runtime::Registry::Get("micro._GetMicroTimeEvaluator"); ICHECK(get_micro_time_evaluator != nullptr) << "micro backend not enabled"; - return (*get_micro_time_evaluator)(pf, ctx, number, repeat); + return (*get_micro_time_evaluator)(pf, dev, number, repeat); } - auto ftimer = [pf, ctx, number, repeat, min_repeat_ms, f_preproc](TVMArgs args, + auto ftimer = [pf, dev, number, repeat, min_repeat_ms, f_preproc](TVMArgs args, TVMRetValue* rv) mutable { TVMRetValue temp; std::ostringstream os; // skip first time call, to activate lazy compilation components. 
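The evaluator in the loop below keeps enlarging `number` until one repeat lasts at least min_repeat_ms, using golden-ratio growth. The same update rule is pulled out here as a hedged standalone sketch; MeasureRunsMs is a hypothetical helper, not part of this patch, and is assumed to return a positive duration:

    #include <algorithm>

    double MeasureRunsMs(int number);  // hypothetical: run `number` times, return elapsed ms

    // Grow `number` until a single repeat takes at least min_repeat_ms,
    // mirroring the 1.618 growth factor used in the timing loop below.
    int CalibrateNumber(int number, int min_repeat_ms) {
      double ms = MeasureRunsMs(number);
      while (ms < min_repeat_ms) {
        number = static_cast<int>(
            std::max(min_repeat_ms / (ms / number) + 1, number * 1.618));
        ms = MeasureRunsMs(number);
      }
      return number;
    }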
pf.CallPacked(args, &temp); - DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr); + DeviceAPI::Get(dev)->StreamSync(dev, nullptr); for (int i = 0; i < repeat; ++i) { if (f_preproc != nullptr) { @@ -373,7 +373,7 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repe number * 1.618)); // 1.618 is chosen by random } - Timer t = Timer::Start(ctx); + Timer t = Timer::Start(dev); // start timing for (int i = 0; i < number; ++i) { pf.CallPacked(args, &temp); @@ -400,15 +400,15 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repe TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator") .set_body_typed([](Optional opt_mod, std::string name, int device_type, int device_id, int number, int repeat, int min_repeat_ms, std::string f_preproc_name) { - TVMContext ctx; - ctx.device_type = static_cast(device_type); - ctx.device_id = device_id; + Device dev; + dev.device_type = static_cast(device_type); + dev.device_id = device_id; if (opt_mod.defined()) { Module m = opt_mod.value(); std::string tkey = m->type_key(); if (tkey == "rpc") { return static_cast(m.operator->()) - ->GetTimeEvaluator(name, ctx, number, repeat, min_repeat_ms, f_preproc_name); + ->GetTimeEvaluator(name, dev, number, repeat, min_repeat_ms, f_preproc_name); } else { PackedFunc f_preproc; if (!f_preproc_name.empty()) { @@ -417,7 +417,7 @@ TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator") << "Cannot find " << f_preproc_name << " in the global function"; f_preproc = *pf_preproc; } - return WrapTimeEvaluator(m.GetFunction(name, false), ctx, number, repeat, min_repeat_ms, + return WrapTimeEvaluator(m.GetFunction(name, false), dev, number, repeat, min_repeat_ms, f_preproc); } } else { @@ -430,7 +430,7 @@ TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator") << "Cannot find " << f_preproc_name << " in the global function"; f_preproc = *pf_preproc; } - return WrapTimeEvaluator(*pf, ctx, number, repeat, min_repeat_ms, f_preproc); + return WrapTimeEvaluator(*pf, dev, number, repeat, min_repeat_ms, f_preproc); } }); @@ -469,10 +469,10 @@ TVM_REGISTER_GLOBAL("rpc.SessTableIndex").set_body([](TVMArgs args, TVMRetValue* }); TVM_REGISTER_GLOBAL("tvm.rpc.NDArrayFromRemoteOpaqueHandle") - .set_body_typed([](Module mod, void* remote_array, DLTensor* template_tensor, TVMContext ctx, + .set_body_typed([](Module mod, void* remote_array, DLTensor* template_tensor, Device dev, void* ndarray_handle) -> NDArray { return NDArrayFromRemoteOpaqueHandle(RPCModuleGetSession(mod), remote_array, template_tensor, - ctx, ndarray_handle); + dev, ndarray_handle); }); } // namespace runtime diff --git a/src/runtime/rpc/rpc_session.cc b/src/runtime/rpc/rpc_session.cc index 2b75018099d5f..df4f1ce42998a 100644 --- a/src/runtime/rpc/rpc_session.cc +++ b/src/runtime/rpc/rpc_session.cc @@ -79,14 +79,14 @@ void RPCSession::AsyncCopyFromRemote(DLTensor* remote_from, void* local_to_bytes } } -void RPCSession::AsyncStreamWait(TVMContext ctx, TVMStreamHandle stream, +void RPCSession::AsyncStreamWait(Device dev, TVMStreamHandle stream, RPCSession::FAsyncCallback callback) { TVMValue value; int32_t tcode = kTVMNullptr; value.v_handle = nullptr; try { - this->GetDeviceAPI(ctx)->StreamSync(ctx, stream); + this->GetDeviceAPI(dev)->StreamSync(dev, stream); callback(RPCCode::kReturn, TVMArgs(&value, &tcode, 1)); } catch (const std::exception& e) { this->SendException(callback, e.what()); diff --git a/src/runtime/rpc/rpc_session.h b/src/runtime/rpc/rpc_session.h index 4b942f2230bac..8923103157d56 100644 --- a/src/runtime/rpc/rpc_session.h +++ 
b/src/runtime/rpc/rpc_session.h
@@ -157,12 +157,12 @@ class RPCSession {
   * The device API is guaranteed to be alive during the
   * lifetime of the Session.
   *
-  * \param ctx The remote context.
+  * \param dev The remote device.
   * \param allow_missing Whether we can return nullptr if it is not available.
   *
   * \return The device API.
   */
-  virtual DeviceAPI* GetDeviceAPI(TVMContext ctx, bool allow_missing = false) = 0;
+  virtual DeviceAPI* GetDeviceAPI(Device dev, bool allow_missing = false) = 0;

  /*!
   * \brief Whether the session is a local session and we can directly
@@ -234,12 +234,12 @@ class RPCSession {
  virtual void AsyncCopyFromRemote(DLTensor* remote_from, void* local_to_bytes, uint64_t nbytes,
                                   FAsyncCallback on_complete);
  /*!
-  * \brief Asynchrously wait for all events in ctx, stream compeletes.
-  * \param ctx The device context.
+  * \brief Asynchronously wait for all events in (dev, stream) to complete.
+  * \param dev The device.
   * \param stream The stream to wait on.
   * \param on_complete The callback to signal copy complete.
   */
-  virtual void AsyncStreamWait(TVMContext ctx, TVMStreamHandle stream, FAsyncCallback on_compelte);
+  virtual void AsyncStreamWait(Device dev, TVMStreamHandle stream, FAsyncCallback on_complete);

  /*!
   * \return The session table index of the session.
@@ -272,7 +272,7 @@ class RPCSession {
 /*!
 * \brief Remote space handle cell used by the RPC runtime API.
 *
- * When we allocate space using a rpc context, the data pointer
+ * When we allocate space using an RPC device, the data pointer
 * points to an allocated RemoteSpace.
 */
 struct RemoteSpace {
@@ -285,7 +285,7 @@ struct RemoteSpace {
 /*!
 * \brief Wrap a timer function to measure the time cost of a given packed function.
 * \param f The function argument.
- * \param ctx The context.
+ * \param dev The device.
 * \param number The number of times to run this function for taking average.
 *        We call these runs one `repeat` of measurement.
 * \param repeat The number of times to repeat the measurement.
@@ -302,8 +302,8 @@ struct RemoteSpace {
 * \param f_preproc The function to be executed before we execute the time evaluator.
 * \return f_timer A timer function.
 */
-PackedFunc WrapTimeEvaluator(PackedFunc f, TVMContext ctx, int number, int repeat,
-                             int min_repeat_ms, PackedFunc f_preproc = nullptr);
+PackedFunc WrapTimeEvaluator(PackedFunc f, Device dev, int number, int repeat, int min_repeat_ms,
+                             PackedFunc f_preproc = nullptr);

 /*!
 * \brief Create a Global RPC module that refers to the session.
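To make the contract documented above concrete, here is a hedged usage sketch of WrapTimeEvaluator. Note it lives in this internal header rather than the public API, and the sketch assumes a PackedFunc that takes no arguments:

    #include <tvm/runtime/packed_func.h>

    // Time `f` on the local CPU: 3 repeats of 10 runs, >= 100 ms per repeat.
    tvm::runtime::TVMRetValue TimeOnCpu(tvm::runtime::PackedFunc f) {
      DLDevice dev{kDLCPU, 0};  // tvm::Device is an alias of DLDevice after this series
      tvm::runtime::PackedFunc timed = tvm::runtime::WrapTimeEvaluator(
          f, dev, /*number=*/10, /*repeat=*/3, /*min_repeat_ms=*/100);
      // Per-repeat costs are serialized into the return value
      // (see the ostringstream loop in rpc_module.cc above).
      return timed();
    }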
diff --git a/src/runtime/stackvm/stackvm.cc b/src/runtime/stackvm/stackvm.cc
index 4a5211e9c8292..808dc4063c8b7 100644
--- a/src/runtime/stackvm/stackvm.cc
+++ b/src/runtime/stackvm/stackvm.cc
@@ -478,11 +478,11 @@ void StackVM::Run(State* s) const {
         break;
       }
       case StackVM::kArrDeviceId: {
-        stack[sp].v_int64 = arr[index].ctx.device_id;
+        stack[sp].v_int64 = arr[index].device.device_id;
         break;
       }
       case StackVM::kArrDeviceType: {
-        stack[sp].v_int64 = static_cast(arr[index].ctx.device_type);
+        stack[sp].v_int64 = static_cast(arr[index].device.device_type);
         break;
       }
       case StackVM::kArrAddr: {
@@ -537,11 +537,11 @@ void StackVM::Run(State* s) const {
         break;
       }
       case StackVM::kArrDeviceId: {
-        arr[index].ctx.device_id = static_cast(stack[sp].v_int64);
+        arr[index].device.device_id = static_cast(stack[sp].v_int64);
         break;
       }
       case StackVM::kArrDeviceType: {
-        arr[index].ctx.device_type = static_cast(stack[sp].v_int64);
+        arr[index].device.device_type = static_cast(stack[sp].v_int64);
         break;
       }
       case StackVM::kTVMValueContent: {
diff --git a/src/runtime/vm/memory_manager.cc b/src/runtime/vm/memory_manager.cc
index 960b2e20145a4..fa5edf18335c8 100644
--- a/src/runtime/vm/memory_manager.cc
+++ b/src/runtime/vm/memory_manager.cc
@@ -37,7 +37,7 @@ static void BufferDeleter(Object* obj) {
   auto* ptr = static_cast(obj);
   ICHECK(ptr->manager_ctx != nullptr);
   Buffer* buffer = reinterpret_cast(ptr->manager_ctx);
-  MemoryManager::GetAllocator(buffer->ctx)->Free(*(buffer));
+  MemoryManager::GetAllocator(buffer->device)->Free(*(buffer));
   delete buffer;
   delete ptr;
 }
@@ -80,7 +80,8 @@ NDArray StorageObj::AllocNDArray(size_t offset, std::vector shape, DLDa
   VerifyDataType(dtype);

   // critical zone: allocate header, cannot throw
-  NDArray::Container* container = new NDArray::Container(nullptr, shape, dtype, this->buffer.ctx);
+  NDArray::Container* container =
+      new NDArray::Container(nullptr, shape, dtype, this->buffer.device);

   container->SetDeleter(StorageObj::Deleter);
   size_t needed_size = GetDataSize(container->dl_tensor);
@@ -116,54 +117,54 @@ MemoryManager* MemoryManager::Global() {
   return inst;
 }

-Allocator* MemoryManager::GetOrCreateAllocator(TVMContext ctx, AllocatorType type) {
+Allocator* MemoryManager::GetOrCreateAllocator(Device dev, AllocatorType type) {
   MemoryManager* m = MemoryManager::Global();
   std::lock_guard lock(m->mu_);
-  if (m->allocators_.find(ctx) == m->allocators_.end()) {
+  if (m->allocators_.find(dev) == m->allocators_.end()) {
     std::unique_ptr alloc;
     switch (type) {
      case kNaive: {
-        DLOG(INFO) << "New naive allocator for " << DeviceName(ctx.device_type) << "("
-                   << ctx.device_id << ")";
-        alloc.reset(new NaiveAllocator(ctx));
+        DLOG(INFO) << "New naive allocator for " << DeviceName(dev.device_type) << "("
+                   << dev.device_id << ")";
+        alloc.reset(new NaiveAllocator(dev));
        break;
      }
      case kPooled: {
-        DLOG(INFO) << "New pooled allocator for " << DeviceName(ctx.device_type) << "("
-                   << ctx.device_id << ")";
-        alloc.reset(new PooledAllocator(ctx));
+        DLOG(INFO) << "New pooled allocator for " << DeviceName(dev.device_type) << "("
+                   << dev.device_id << ")";
+        alloc.reset(new PooledAllocator(dev));
        break;
      }
      default:
        LOG(FATAL) << "Unknown allocator type: " << type;
    }
    auto ret = alloc.get();
-    m->allocators_.emplace(ctx, std::move(alloc));
+    m->allocators_.emplace(dev, std::move(alloc));
    return ret;
  }
-  auto alloc = m->allocators_.at(ctx).get();
+  auto alloc = m->allocators_.at(dev).get();
  if (alloc->type() != type) {
-    LOG(WARNING) << "The type of existing allocator for " << DeviceName(ctx.device_type) <<
"(" - << ctx.device_id << ") is different from the request type (" << alloc->type() + LOG(WARNING) << "The type of existing allocator for " << DeviceName(dev.device_type) << "(" + << dev.device_id << ") is different from the request type (" << alloc->type() << " vs " << type << ")"; } return alloc; } -Allocator* MemoryManager::GetAllocator(TVMContext ctx) { +Allocator* MemoryManager::GetAllocator(Device dev) { MemoryManager* m = MemoryManager::Global(); std::lock_guard lock(m->mu_); - auto it = m->allocators_.find(ctx); + auto it = m->allocators_.find(dev); if (it == m->allocators_.end()) { - LOG(FATAL) << "Allocator for " << DeviceName(ctx.device_type) << "(" << ctx.device_id + LOG(FATAL) << "Allocator for " << DeviceName(dev.device_type) << "(" << dev.device_id << ") has not been created yet."; } return it->second.get(); } -NDArray Allocator::Empty(std::vector shape, DLDataType dtype, DLContext ctx) { +NDArray Allocator::Empty(std::vector shape, DLDataType dtype, DLDevice dev) { VerifyDataType(dtype); - NDArray::Container* container = new NDArray::Container(nullptr, shape, dtype, ctx); + NDArray::Container* container = new NDArray::Container(nullptr, shape, dtype, dev); container->SetDeleter(BufferDeleter); size_t size = GetDataSize(container->dl_tensor); size_t alignment = GetDataAlignment(container->dl_tensor); diff --git a/src/runtime/vm/naive_allocator.h b/src/runtime/vm/naive_allocator.h index 301acf81a9c99..9fce66f606694 100644 --- a/src/runtime/vm/naive_allocator.h +++ b/src/runtime/vm/naive_allocator.h @@ -34,20 +34,20 @@ namespace vm { class NaiveAllocator final : public Allocator { public: - explicit NaiveAllocator(TVMContext ctx) : Allocator(kNaive), used_memory_(0), ctx_(ctx) {} + explicit NaiveAllocator(Device dev) : Allocator(kNaive), used_memory_(0), device_(dev) {} Buffer Alloc(size_t nbytes, size_t alignment, DLDataType type_hint) override { Buffer buf; - buf.ctx = ctx_; + buf.device = device_; buf.size = nbytes; - buf.data = DeviceAPI::Get(ctx_)->AllocDataSpace(ctx_, nbytes, alignment, type_hint); + buf.data = DeviceAPI::Get(device_)->AllocDataSpace(device_, nbytes, alignment, type_hint); used_memory_.fetch_add(nbytes, std::memory_order_relaxed); DLOG(INFO) << "allocate " << nbytes << " B, used memory " << used_memory_ << " B"; return buf; } void Free(const Buffer& buffer) override { - DeviceAPI::Get(ctx_)->FreeDataSpace(buffer.ctx, buffer.data); + DeviceAPI::Get(device_)->FreeDataSpace(buffer.device, buffer.data); used_memory_.fetch_sub(buffer.size, std::memory_order_relaxed); DLOG(INFO) << "free " << buffer.size << " B, used memory " << used_memory_ << " B"; } @@ -56,7 +56,7 @@ class NaiveAllocator final : public Allocator { private: std::atomic used_memory_; - TVMContext ctx_; + Device device_; }; } // namespace vm diff --git a/src/runtime/vm/pooled_allocator.h b/src/runtime/vm/pooled_allocator.h index 4226ef74daa4b..bb088c5653f2a 100644 --- a/src/runtime/vm/pooled_allocator.h +++ b/src/runtime/vm/pooled_allocator.h @@ -39,8 +39,8 @@ class PooledAllocator final : public Allocator { public: static constexpr size_t kDefaultPageSize = 4096; - explicit PooledAllocator(TVMContext ctx, size_t page_size = kDefaultPageSize) - : Allocator(kPooled), page_size_(page_size), used_memory_(0), ctx_(ctx) {} + explicit PooledAllocator(Device dev, size_t page_size = kDefaultPageSize) + : Allocator(kPooled), page_size_(page_size), used_memory_(0), device_(dev) {} ~PooledAllocator() { ReleaseAll(); } @@ -55,9 +55,9 @@ class PooledAllocator final : public Allocator { return ret; } Buffer 
buf; - buf.ctx = ctx_; + buf.device = device_; buf.size = size; - buf.data = DeviceAPI::Get(ctx_)->AllocDataSpace(ctx_, size, alignment, type_hint); + buf.data = DeviceAPI::Get(device_)->AllocDataSpace(device_, size, alignment, type_hint); used_memory_.fetch_add(size, std::memory_order_relaxed); DLOG(INFO) << "allocate " << size << " B, used memory " << used_memory_ << " B"; return buf; @@ -80,7 +80,7 @@ class PooledAllocator final : public Allocator { for (auto const& it : memory_pool_) { auto const& pool = it.second; for (auto const& buf : pool) { - DeviceAPI::Get(buf.ctx)->FreeDataSpace(buf.ctx, buf.data); + DeviceAPI::Get(buf.device)->FreeDataSpace(buf.device, buf.data); } } memory_pool_.clear(); @@ -93,7 +93,7 @@ class PooledAllocator final : public Allocator { std::atomic used_memory_; std::unordered_map > memory_pool_; std::mutex mu_; - TVMContext ctx_; + Device device_; }; } // namespace vm diff --git a/src/runtime/vm/profiler/vm.cc b/src/runtime/vm/profiler/vm.cc index fc01a754ca50b..3f3dee437cb35 100644 --- a/src/runtime/vm/profiler/vm.cc +++ b/src/runtime/vm/profiler/vm.cc @@ -113,9 +113,8 @@ void VirtualMachineDebug::LoadExecutable(const Executable* exec) { void VirtualMachineDebug::InvokePacked(Index packed_index, const PackedFunc& func, Index arg_count, Index output_size, const std::vector& args) { ICHECK(exec_); - ICHECK(!ctxs_.empty()) << "Context has not been initialized yet."; - // The device context of any input of the operator is used for - // synchronization. + ICHECK(!devices_.empty()) << "Device has not been initialized yet."; + // The device of any input of the operator is used for synchronization. ICHECK_GT(arg_count, 0U); ObjectRef arg = args[0]; while (arg->IsInstance()) { @@ -124,9 +123,9 @@ void VirtualMachineDebug::InvokePacked(Index packed_index, const PackedFunc& fun } ICHECK(arg->IsInstance()); auto nd_array = Downcast(arg); - auto ctx = nd_array->ctx; + auto dev = nd_array->device; - Timer t = Timer::Start(ctx); + Timer t = Timer::Start(dev); VirtualMachine::InvokePacked(packed_index, func, arg_count, output_size, args); t->Stop(); diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc index 4683398b01d45..ee06da83bd920 100644 --- a/src/runtime/vm/vm.cc +++ b/src/runtime/vm/vm.cc @@ -64,11 +64,11 @@ std::ostream& operator<<(std::ostream& os, const VMFunction& vm_func) { return os; } -inline ObjectRef CopyTo(ObjectRef src, const DLContext& ctx) { +inline ObjectRef CopyTo(ObjectRef src, const DLDevice& dev) { if (src->IsInstance()) { auto nd_array = Downcast(src); - if (nd_array->ctx.device_type != ctx.device_type) { - return nd_array.CopyTo(ctx); + if (nd_array->device.device_type != dev.device_type) { + return nd_array.CopyTo(dev); } return src; } else { @@ -77,7 +77,7 @@ inline ObjectRef CopyTo(ObjectRef src, const DLContext& ctx) { std::vector ret; ADT adt = Downcast(src); for (size_t i = 0; i < adt.size(); i++) { - ret.push_back(CopyTo(adt[i], ctx)); + ret.push_back(CopyTo(adt[i], dev)); } return ADT(adt->tag, ret.begin(), ret.end()); } @@ -135,18 +135,18 @@ PackedFunc VirtualMachine::GetFunction(const std::string& name, } else if (name == "init") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { ICHECK_EQ(args.size() % 3, 0); - std::vector contexts; + std::vector devices; std::vector alloc_types; for (int i = 0; i < args.size() / 3; ++i) { - TVMContext ctx; + Device dev; int device_type = args[i * 3]; - ctx.device_type = DLDeviceType(device_type); - ctx.device_id = args[i * 3 + 1]; + dev.device_type = 
DLDeviceType(device_type); + dev.device_id = args[i * 3 + 1]; int type = args[i * 3 + 2]; - contexts.push_back(ctx); + devices.push_back(dev); alloc_types.push_back(AllocatorType(type)); } - this->Init(contexts, alloc_types); + this->Init(devices, alloc_types); }); } else if (name == "set_input") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { @@ -164,8 +164,8 @@ PackedFunc VirtualMachine::GetFunction(const std::string& name, std::vector func_args(param_names.size()); for (int i = 1; i < args.size(); ++i) { Index device_type = vm_func.params_device_type[i - 1]; - DLContext ctx = GetContext(device_type); - ObjectRef obj = CopyTo(args[i], ctx); + Device dev = GetDevice(device_type); + ObjectRef obj = CopyTo(args[i], dev); func_args[i - 1] = obj; } inputs_.erase(func_name); @@ -177,13 +177,13 @@ PackedFunc VirtualMachine::GetFunction(const std::string& name, } } -inline TVMContext VirtualMachine::GetContext(Index device_type) const { - ICHECK_GE(ctxs_.size(), device_type) << "ctxs_ list doesn't contain device:" << device_type; +inline Device VirtualMachine::GetDevice(Index device_type) const { + ICHECK_GE(devices_.size(), device_type) << "devices_ doesn't contain device:" << device_type; - auto ctx = ctxs_[device_type]; - ICHECK_EQ(static_cast(ctx.device_type), device_type) - << "device type " << device_type << " has not been initialized int the context list."; - return ctx; + auto dev = devices_[device_type]; + ICHECK_EQ(static_cast(dev.device_type), device_type) + << "device type " << device_type << " has not been initialized in the device list."; + return dev; } void VirtualMachine::PushFrame(Index arg_count, Index ret_pc, const VMFunction& vm_func) { @@ -301,18 +301,18 @@ void VirtualMachine::LoadExecutable(const Executable* exec) { } } -void VirtualMachine::Init(const std::vector& ctxs, +void VirtualMachine::Init(const std::vector& devs, const std::vector& alloc_types) { - ICHECK_EQ(ctxs.size(), alloc_types.size()); - // Cache the context - for (size_t i = 0; i < ctxs.size(); i++) { - auto dev_type = static_cast(ctxs[i].device_type); - auto alloc = MemoryManager::GetOrCreateAllocator(ctxs[i], alloc_types[i]); - if (ctxs_.size() <= dev_type) { - ctxs_.resize(dev_type + 1); + ICHECK_EQ(devs.size(), alloc_types.size()); + // Cache the device + for (size_t i = 0; i < devs.size(); i++) { + auto dev_type = static_cast(devs[i].device_type); + auto alloc = MemoryManager::GetOrCreateAllocator(devs[i], alloc_types[i]); + if (devices_.size() <= dev_type) { + devices_.resize(dev_type + 1); allocators_.resize(dev_type + 1); } - ctxs_[dev_type] = ctxs[i]; + devices_[dev_type] = devs[i]; allocators_[dev_type] = alloc; } } @@ -388,8 +388,8 @@ void VirtualMachine::RunLoop() { } if (!const_pool_[instr.const_index].defined()) { - TVMContext ctx = GetContext(exec_->const_device_type[instr.const_index]); - const_pool_[instr.const_index] = CopyTo(constant_obj, ctx); + Device dev = GetDevice(exec_->const_device_type[instr.const_index]); + const_pool_[instr.const_index] = CopyTo(constant_obj, dev); } WriteRegister(instr.dst, const_pool_[instr.const_index]); pc_++; @@ -497,9 +497,9 @@ void VirtualMachine::RunLoop() { goto main_loop; } case Opcode::AllocTensorReg: { - DLContext cpu_ctx = GetContext(static_cast(kDLCPU)); + Device cpu_dev = GetDevice(static_cast(kDLCPU)); auto shape_obj = ReadRegister(instr.alloc_tensor_reg.shape_register); - NDArray shape_tensor = Downcast(CopyTo(shape_obj, cpu_ctx)); + NDArray shape_tensor = Downcast(CopyTo(shape_obj, cpu_dev)); auto shape = 
ToShape(shape_tensor); auto storage_obj = ReadRegister(instr.alloc_tensor_reg.storage); auto storage = Downcast(storage_obj); @@ -542,7 +542,7 @@ void VirtualMachine::RunLoop() { ICHECK_LT(static_cast(dev_type), allocators_.size()) << "Memory allocator for device " << dev_type << " has not been initialized"; auto* alloc = allocators_[dev_type]; - ICHECK(alloc) << "Did you forget to init the VirtualMachine with contexts?"; + ICHECK(alloc) << "Did you forget to init the VirtualMachine with devices?"; storage_obj->buffer = alloc->Alloc(size, alignment, instr.alloc_storage.dtype_hint); Storage storage(storage_obj); WriteRegister(instr.dst, storage); @@ -577,12 +577,12 @@ void VirtualMachine::RunLoop() { } } case Opcode::ReshapeTensor: { - DLContext cpu_ctx = GetContext(static_cast(kDLCPU)); + Device cpu_dev = GetDevice(static_cast(kDLCPU)); auto tensor_obj = ReadRegister(instr.reshape_tensor.tensor); NDArray tensor_arr = Downcast(tensor_obj); // Read the shape from shape tensor auto shape_obj = ReadRegister(instr.reshape_tensor.newshape); - NDArray shape_tensor = Downcast(CopyTo(shape_obj, cpu_ctx)); + NDArray shape_tensor = Downcast(CopyTo(shape_obj, cpu_dev)); const DLTensor* dl_tensor = shape_tensor.operator->(); ICHECK_EQ(dl_tensor->dtype.code, 0u); ICHECK_EQ(dl_tensor->dtype.bits, 64); @@ -598,14 +598,14 @@ void VirtualMachine::RunLoop() { case Opcode::DeviceCopy: { auto tensor_src = ReadRegister(instr.src); NDArray src_data = Downcast(tensor_src); - DLContext src_ctx = src_data->ctx; - ICHECK_EQ(static_cast(src_ctx.device_type), instr.src_device_type); + Device src_dev = src_data->device; + ICHECK_EQ(static_cast(src_dev.device_type), instr.src_device_type); - DLContext dst_ctx; - dst_ctx.device_type = static_cast(instr.dst_device_type); - dst_ctx.device_id = 0; + Device dst_dev; + dst_dev.device_type = static_cast(instr.dst_device_type); + dst_dev.device_id = 0; - NDArray dst_data = src_data.CopyTo(dst_ctx); + NDArray dst_data = src_data.CopyTo(dst_dev); WriteRegister(instr.dst, dst_data); pc_++; goto main_loop; diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc index f56318aee94d7..5cd4812f41c4f 100644 --- a/src/runtime/vulkan/vulkan.cc +++ b/src/runtime/vulkan/vulkan.cc @@ -76,7 +76,7 @@ class VulkanThreadEntry { } } - TVMContext ctx; + Device device; std::unique_ptr pool; VulkanStream* Stream(size_t device_id); VulkanStagingBuffer* StagingBuffer(int device_id, size_t size); @@ -115,16 +115,15 @@ class VulkanDeviceAPI final : public DeviceAPI { vkDestroyInstance(instance_, nullptr); } } - void SetDevice(TVMContext ctx) final { VulkanThreadEntry::ThreadLocal()->ctx = ctx; } - void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final; + void SetDevice(Device dev) final { VulkanThreadEntry::ThreadLocal()->device = dev; } + void GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) final; std::vector GetComputeQueueFamilies(VkPhysicalDevice phy_dev); - void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, - DLDataType type_hint) final { + void* AllocDataSpace(Device dev, size_t nbytes, size_t alignment, DLDataType type_hint) final { if (nbytes == 0) { // Vulkan seems to have issues if we return nullptr on zero size alloc nbytes = 1; } - const auto& vctx = context(ctx.device_id); + const auto& vctx = context(dev.device_id); VkBufferCreateInfo info; info.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; info.pNext = nullptr; @@ -191,12 +190,12 @@ class VulkanDeviceAPI final : public DeviceAPI { return pbuf; } - void 
FreeDataSpace(TVMContext ctx, void* ptr) final { + void FreeDataSpace(Device dev, void* ptr) final { // Before releasing the vkBuffer, call sync to // finish all the vulkan commands that reference the buffer. - StreamSync(ctx, nullptr); + StreamSync(dev, nullptr); - const auto& vctx = context(ctx.device_id); + const auto& vctx = context(dev.device_id); auto* pbuf = static_cast(ptr); vkDestroyBuffer(vctx.device, pbuf->buffer, nullptr); vkFreeMemory(vctx.device, pbuf->memory, nullptr); @@ -205,19 +204,19 @@ class VulkanDeviceAPI final : public DeviceAPI { protected: void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, - TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, + Device dev_from, Device dev_to, DLDataType type_hint, TVMStreamHandle stream) final { ICHECK(stream == nullptr); - TVMContext ctx = ctx_from; - if (ctx_from.device_type == kDLCPU) { - ctx = ctx_to; + Device dev = dev_from; + if (dev_from.device_type == kDLCPU) { + dev = dev_to; } - int from_dev_type = static_cast(ctx_from.device_type); - int to_dev_type = static_cast(ctx_to.device_type); + int from_dev_type = static_cast(dev_from.device_type); + int to_dev_type = static_cast(dev_to.device_type); if (from_dev_type == kDLVulkan && to_dev_type == kDLVulkan) { VulkanThreadEntry::ThreadLocal() - ->Stream(ctx_from.device_id) + ->Stream(dev_from.device_id) ->Launch([=](VulkanStreamState* state) { // 1: copy const auto* from_buf = static_cast(from); @@ -228,7 +227,7 @@ class VulkanDeviceAPI final : public DeviceAPI { copy_info.size = size; vkCmdCopyBuffer(state->cmd_buffer_, from_buf->buffer, to_buf->buffer, 1, ©_info); // 2: barrier(transfer-> compute|transfer) - ICHECK_EQ(ctx_from.device_id, ctx_to.device_id) << "Vulkan disallow cross device copy."; + ICHECK_EQ(dev_from.device_id, dev_to.device_id) << "Vulkan disallows cross-device copy."; VkMemoryBarrier barrier_info; barrier_info.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER; barrier_info.pNext = nullptr; @@ -244,10 +243,10 @@ class VulkanDeviceAPI final : public DeviceAPI { } else if (from_dev_type == kDLVulkan && to_dev_type == kDLCPU) { const auto* from_buf = static_cast(from); - const auto& vctx = context(ctx_from.device_id); - auto* temp = VulkanThreadEntry::ThreadLocal()->StagingBuffer(ctx_from.device_id, size); + const auto& vctx = context(dev_from.device_id); + auto* temp = VulkanThreadEntry::ThreadLocal()->StagingBuffer(dev_from.device_id, size); VulkanThreadEntry::ThreadLocal() - ->Stream(ctx_from.device_id) + ->Stream(dev_from.device_id) ->Launch([&](VulkanStreamState* state) { VkBufferCopy copy_info; copy_info.srcOffset = from_offset; @@ -255,7 +254,7 @@ class VulkanDeviceAPI final : public DeviceAPI { copy_info.size = size; vkCmdCopyBuffer(state->cmd_buffer_, from_buf->buffer, temp->buffer, 1, ©_info); }); - VulkanThreadEntry::ThreadLocal()->Stream(ctx_from.device_id)->Synchronize(); + VulkanThreadEntry::ThreadLocal()->Stream(dev_from.device_id)->Synchronize(); if (!vctx.coherent_staging) { VkMappedMemoryRange mrange; mrange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; @@ -267,10 +266,10 @@ class VulkanDeviceAPI final : public DeviceAPI { } memcpy(static_cast(to) + to_offset, static_cast(temp->host_addr), size); } else if (from_dev_type == kDLCPU && to_dev_type == kDLVulkan) { - const auto& vctx = context(ctx_to.device_id); + const auto& vctx = context(dev_to.device_id); const auto* to_buf = static_cast(to); VulkanStagingBuffer* temp = - VulkanThreadEntry::ThreadLocal()->StagingBuffer(ctx_to.device_id, size); + 
VulkanThreadEntry::ThreadLocal()->StagingBuffer(dev_to.device_id, size); memcpy(temp->host_addr, static_cast(from) + from_offset, size); // host side flush if access is not coherent. // so writes from CPU are visible to GPU if (!vctx.coherent_staging) { VkMappedMemoryRange mrange; mrange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; @@ -285,7 +284,7 @@ class VulkanDeviceAPI final : public DeviceAPI { } VulkanThreadEntry::ThreadLocal() - ->Stream(ctx_from.device_id) + ->Stream(dev_from.device_id) ->Launch([&](VulkanStreamState* state) { // 0: barrier(host->transfer) VkMemoryBarrier barrier_info; @@ -305,7 +304,7 @@ class VulkanDeviceAPI final : public DeviceAPI { }); // TODO(tulloch): should we instead make the staging buffer a property of the // Stream? This would allow us to elide synchronizations here. - VulkanThreadEntry::ThreadLocal()->Stream(ctx_from.device_id)->Synchronize(); + VulkanThreadEntry::ThreadLocal()->Stream(dev_from.device_id)->Synchronize(); } else { LOG(FATAL) << "Expect copy from/to Vulkan or between Vulkan" << ", from=" << from_dev_type << ", to=" << to_dev_type; @@ -314,37 +313,37 @@ class VulkanDeviceAPI final : public DeviceAPI { public: // Always use the default stream - TVMStreamHandle CreateStream(TVMContext ctx) { + TVMStreamHandle CreateStream(Device dev) { LOG(FATAL) << "Not implemented"; return nullptr; } - void FreeStream(TVMContext ctx, TVMStreamHandle stream) { + void FreeStream(Device dev, TVMStreamHandle stream) { LOG(FATAL) << "Not implemented"; return; } - void SyncStreamFromTo(TVMContext ctx, TVMStreamHandle event_src, TVMStreamHandle event_dst) { + void SyncStreamFromTo(Device dev, TVMStreamHandle event_src, TVMStreamHandle event_dst) { LOG(FATAL) << "Not implemented"; return; } - void StreamSync(TVMContext ctx, TVMStreamHandle stream) final { + void StreamSync(Device dev, TVMStreamHandle stream) final { ICHECK(stream == nullptr); - VulkanThreadEntry::ThreadLocal()->Stream(ctx.device_id)->Synchronize(); + VulkanThreadEntry::ThreadLocal()->Stream(dev.device_id)->Synchronize(); } - void SetStream(TVMContext ctx, TVMStreamHandle stream) final { + void SetStream(Device dev, TVMStreamHandle stream) final { LOG(FATAL) << "Not implemented"; return; } - void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final { - return VulkanThreadEntry::ThreadLocal()->pool->AllocWorkspace(ctx, size); + void* AllocWorkspace(Device dev, size_t size, DLDataType type_hint) final { + return VulkanThreadEntry::ThreadLocal()->pool->AllocWorkspace(dev, size); } - void FreeWorkspace(TVMContext ctx, void* data) final { - VulkanThreadEntry::ThreadLocal()->pool->FreeWorkspace(ctx, data); + void FreeWorkspace(Device dev, void* data) final { + VulkanThreadEntry::ThreadLocal()->pool->FreeWorkspace(dev, data); } static VulkanDeviceAPI* Global() { @@ -363,8 +362,8 @@ class VulkanDeviceAPI final : public DeviceAPI { std::vector context_; }; -void VulkanDeviceAPI::GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) { - size_t index = static_cast(ctx.device_id); +void VulkanDeviceAPI::GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) { + size_t index = static_cast(dev.device_id); if (kind == kExist) { *rv = static_cast(index < context_.size()); return; } @@ -1045,8 +1044,8 @@ VulkanStagingBuffer* VulkanThreadEntry::StagingBuffer(int device_id, size_t size VulkanThreadEntry::VulkanThreadEntry() : pool(std::make_unique(static_cast(kDLVulkan), VulkanDeviceAPI::Global())) { - ctx.device_id = 0; - ctx.device_type = static_cast(kDLVulkan); + device.device_id = 0; + device.device_type = static_cast(kDLVulkan); } VulkanStream* VulkanThreadEntry::Stream(size_t device_id) { 
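// The hunks above and below swap TVMContext for the dlpack-style device
// struct. As a minimal sketch (assuming only dlpack's DLDevice definition,
// not code from this file), constructing the device that VulkanThreadEntry
// initializes above looks like:
#include <dlpack/dlpack.h>
static DLDevice MakeVulkanDevice(int device_id) {
  DLDevice dev;
  dev.device_type = kDLVulkan;  // which runtime API services this memory
  dev.device_id = device_id;    // which physical device of that type
  return dev;
}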
@@ -1059,7 +1058,7 @@ VulkanStream* VulkanThreadEntry::Stream(size_t device_id) { void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, const ArgUnion64* pack_args) const { - int device_id = VulkanThreadEntry::ThreadLocal()->ctx.device_id; + int device_id = VulkanThreadEntry::ThreadLocal()->device.device_id; ICHECK_LT(device_id, kVulkanMaxNumDevice); const auto& vctx = VulkanDeviceAPI::Global()->context(device_id); if (!scache_[device_id]) { diff --git a/src/runtime/workspace_pool.cc b/src/runtime/workspace_pool.cc index 2d347c32ac103..40d488df700af 100644 --- a/src/runtime/workspace_pool.cc +++ b/src/runtime/workspace_pool.cc @@ -43,7 +43,7 @@ class WorkspacePool::Pool { allocated_.push_back(e); } // allocate from pool - void* Alloc(TVMContext ctx, DeviceAPI* device, size_t nbytes) { + void* Alloc(Device dev, DeviceAPI* device, size_t nbytes) { // Allocate aligned to page size. nbytes = (nbytes + (kWorkspacePageSize - 1)) / kWorkspacePageSize * kWorkspacePageSize; if (nbytes == 0) nbytes = kWorkspacePageSize; @@ -57,12 +57,12 @@ class WorkspacePool::Pool { free_list_.pop_back(); if (e.size < nbytes) { // resize the page - device->FreeDataSpace(ctx, e.data); - e.data = device->AllocDataSpace(ctx, nbytes, kTempAllocaAlignment, type); + device->FreeDataSpace(dev, e.data); + e.data = device->AllocDataSpace(dev, nbytes, kTempAllocaAlignment, type); e.size = nbytes; } } else if (free_list_.size() == 1) { - e.data = device->AllocDataSpace(ctx, nbytes, kTempAllocaAlignment, type); + e.data = device->AllocDataSpace(dev, nbytes, kTempAllocaAlignment, type); e.size = nbytes; } else { if (free_list_.back().size >= nbytes) { @@ -76,8 +76,8 @@ class WorkspacePool::Pool { // resize the page e = free_list_.back(); free_list_.pop_back(); - device->FreeDataSpace(ctx, e.data); - e.data = device->AllocDataSpace(ctx, nbytes, kTempAllocaAlignment, type); + device->FreeDataSpace(dev, e.data); + e.data = device->AllocDataSpace(dev, nbytes, kTempAllocaAlignment, type); e.size = nbytes; } } @@ -114,10 +114,10 @@ class WorkspacePool::Pool { } } // Release all resources - void Release(TVMContext ctx, DeviceAPI* device) { + void Release(Device dev, DeviceAPI* device) { ICHECK_EQ(allocated_.size(), 1); for (size_t i = 1; i < free_list_.size(); ++i) { - device->FreeDataSpace(ctx, free_list_[i].data); + device->FreeDataSpace(dev, free_list_[i].data); } free_list_.clear(); } @@ -140,28 +140,28 @@ WorkspacePool::WorkspacePool(DLDeviceType device_type, DeviceAPI* device) WorkspacePool::~WorkspacePool() { for (size_t i = 0; i < array_.size(); ++i) { if (array_[i] != nullptr) { - TVMContext ctx; - ctx.device_type = device_type_; - ctx.device_id = static_cast(i); - array_[i]->Release(ctx, device_); + Device dev; + dev.device_type = device_type_; + dev.device_id = static_cast(i); + array_[i]->Release(dev, device_); delete array_[i]; } } } -void* WorkspacePool::AllocWorkspace(TVMContext ctx, size_t size) { - if (static_cast(ctx.device_id) >= array_.size()) { - array_.resize(ctx.device_id + 1, nullptr); +void* WorkspacePool::AllocWorkspace(Device dev, size_t size) { + if (static_cast(dev.device_id) >= array_.size()) { + array_.resize(dev.device_id + 1, nullptr); } - if (array_[ctx.device_id] == nullptr) { - array_[ctx.device_id] = new Pool(); + if (array_[dev.device_id] == nullptr) { + array_[dev.device_id] = new Pool(); } - return array_[ctx.device_id]->Alloc(ctx, device_, size); + return array_[dev.device_id]->Alloc(dev, device_, size); } -void WorkspacePool::FreeWorkspace(TVMContext ctx, void* ptr) { - 
ICHECK(static_cast(ctx.device_id) < array_.size() && array_[ctx.device_id] != nullptr); - array_[ctx.device_id]->Free(ptr); +void WorkspacePool::FreeWorkspace(Device dev, void* ptr) { + ICHECK(static_cast(dev.device_id) < array_.size() && array_[dev.device_id] != nullptr); + array_[dev.device_id]->Free(ptr); } } // namespace runtime diff --git a/src/runtime/workspace_pool.h b/src/runtime/workspace_pool.h index 887afc5cbb578..0db9758fac868 100644 --- a/src/runtime/workspace_pool.h +++ b/src/runtime/workspace_pool.h @@ -54,17 +54,17 @@ class TVM_DLL WorkspacePool { ~WorkspacePool(); /*! * \brief Allocate temporary workspace. - * \param ctx The context of allocation. + * \param dev The device of allocation. * \param size The size to be allocated. */ - void* AllocWorkspace(TVMContext ctx, size_t size); + void* AllocWorkspace(Device dev, size_t size); /*! * \brief Free temporary workspace in backend execution. * - * \param ctx The context of allocation. + * \param dev The device of allocation. * \param ptr The pointer to be freed. */ - void FreeWorkspace(TVMContext ctx, void* ptr); + void FreeWorkspace(Device dev, void* ptr); private: class Pool; diff --git a/src/support/ffi_testing.cc b/src/support/ffi_testing.cc index b06a8bb461beb..bac888a81aea7 100644 --- a/src/support/ffi_testing.cc +++ b/src/support/ffi_testing.cc @@ -72,13 +72,13 @@ TVM_REGISTER_GLOBAL("testing.test_check_eq_callback").set_body([](TVMArgs args, runtime::TypedPackedFunc([msg](int x, int y) { CHECK_EQ(x, y) << msg; }); }); -TVM_REGISTER_GLOBAL("testing.context_test").set_body([](TVMArgs args, TVMRetValue* ret) { - DLContext ctx = args[0]; +TVM_REGISTER_GLOBAL("testing.device_test").set_body([](TVMArgs args, TVMRetValue* ret) { + Device dev = args[0]; int dtype = args[1]; int did = args[2]; - CHECK_EQ(static_cast(ctx.device_type), dtype); - CHECK_EQ(static_cast(ctx.device_id), did); - *ret = ctx; + CHECK_EQ(static_cast(dev.device_type), dtype); + CHECK_EQ(static_cast(dev.device_id), did); + *ret = dev; }); // in src/api_test.cc diff --git a/src/target/llvm/codegen_amdgpu.cc b/src/target/llvm/codegen_amdgpu.cc index ca21892ccc5f0..c7c44c9b0f8f2 100644 --- a/src/target/llvm/codegen_amdgpu.cc +++ b/src/target/llvm/codegen_amdgpu.cc @@ -38,15 +38,15 @@ namespace { // calls the device api to get the max threads per block static inline int DetectROCMmaxThreadsPerBlock() { - TVMContext tvm_ctx; - tvm_ctx.device_type = kDLROCM; - tvm_ctx.device_id = 0; - tvm::runtime::DeviceAPI* api = tvm::runtime::DeviceAPI::Get(tvm_ctx, true); + Device tvm_dev; + tvm_dev.device_type = kDLROCM; + tvm_dev.device_id = 0; + tvm::runtime::DeviceAPI* api = tvm::runtime::DeviceAPI::Get(tvm_dev, true); if (api != nullptr) { TVMRetValue val; - api->GetAttr(tvm_ctx, tvm::runtime::kExist, &val); + api->GetAttr(tvm_dev, tvm::runtime::kExist, &val); if (val.operator int() == 1) { - tvm::runtime::DeviceAPI::Get(tvm_ctx)->GetAttr(tvm_ctx, tvm::runtime::kMaxThreadsPerBlock, + tvm::runtime::DeviceAPI::Get(tvm_dev)->GetAttr(tvm_dev, tvm::runtime::kMaxThreadsPerBlock, &val); return val.operator int(); } diff --git a/src/target/llvm/codegen_hexagon.cc b/src/target/llvm/codegen_hexagon.cc index c1af2a366a6bd..9d324d56887ff 100644 --- a/src/target/llvm/codegen_hexagon.cc +++ b/src/target/llvm/codegen_hexagon.cc @@ -564,8 +564,8 @@ llvm::Value* CodeGenHexagon::CreateStructRefPtr(DataType t, llvm::Value* buf, ll /* The following "kinds" are accessing the members of DLTensor: typedef struct { void* data; kArrData - DLContext ctx; kArrDeviceType (ctx.device_type) - 
kArrDeviceId (ctx.device_id) + DLDevice device; kArrDeviceType (device.device_type) + kArrDeviceId (device.device_id) int ndim; kArrNDim DLDataType dtype; kArrTypeCode (dtype.code) kArrTypeBits (dtype.bits) @@ -602,7 +602,7 @@ llvm::Value* CodeGenHexagon::CreateStructRefPtr(DataType t, llvm::Value* buf, ll void* v_handle; const char* v_str; TVMType v_type; - TVMContext v_ctx; + DLDevice v_device; } TVMValue; */ ICHECK_EQ(t.lanes(), 1); diff --git a/src/target/llvm/codegen_params.cc b/src/target/llvm/codegen_params.cc index 694be56216069..3b4cae9197b0d 100644 --- a/src/target/llvm/codegen_params.cc +++ b/src/target/llvm/codegen_params.cc @@ -69,7 +69,7 @@ llvm::ConstantArray* NDArrayToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime:: auto arr_type = arr.DataType(); CHECK(arr.IsContiguous()) << "CodegenParams: only support contiguous arrays"; - CHECK_EQ(arr->ctx.device_type, kDLCPU) << "CodegenParams: only support contiguous arrays"; + CHECK_EQ(arr->device.device_type, kDLCPU) << "CodegenParams: only support arrays on CPU"; CHECK_EQ(arr_type.lanes(), 1) << "CodegenParams: only support generating 1-lane parameters; saw " << arr_type.lanes(); diff --git a/src/target/source/codegen_c.cc b/src/target/source/codegen_c.cc index 55db59f8d8426..986ef8ed0749d 100644 --- a/src/target/source/codegen_c.cc +++ b/src/target/source/codegen_c.cc @@ -267,10 +267,10 @@ std::string CodeGenC::GetStructRef(DataType t, const PrimExpr& buffer, const Pri os << "dtype.lanes"; break; case builtin::kArrDeviceId: - os << "ctx.device_id"; + os << "device.device_id"; break; case builtin::kArrDeviceType: - os << "ctx.device_type"; + os << "device.device_type"; break; default: LOG(FATAL) << "unknown field code"; diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc index 863d99993f4a9..08842554257b8 100644 --- a/src/target/target_kind.cc +++ b/src/target/target_kind.cc @@ -103,7 +103,7 @@ static int ExtractIntWithPrefix(const std::string& str, const std::string& prefi * \param val The detected value * \return A boolean indicating if detection succeeds */ -static bool DetectDeviceFlag(TVMContext device, runtime::DeviceAttrKind flag, TVMRetValue* val) { +static bool DetectDeviceFlag(Device device, runtime::DeviceAttrKind flag, TVMRetValue* val) { using runtime::DeviceAPI; DeviceAPI* api = DeviceAPI::Get(device, true); // Check if compiled with the corresponding device api diff --git a/src/te/schedule/schedule_postproc_rewrite_for_tensor_core.cc b/src/te/schedule/schedule_postproc_rewrite_for_tensor_core.cc index 74d1a19d2cfe7..377ad5c7a40a3 100644 --- a/src/te/schedule/schedule_postproc_rewrite_for_tensor_core.cc +++ b/src/te/schedule/schedule_postproc_rewrite_for_tensor_core.cc @@ -1089,8 +1089,8 @@ Stmt SchedulePostProcRewriteForTensorCore(Stmt stmt, Schedule schedule, } // Check if current runtime supports GPU CUDA - TVMContext ctx{kDLGPU, 0}; - auto api = tvm::runtime::DeviceAPI::Get(ctx, true); + Device dev{kDLGPU, 0}; + auto api = tvm::runtime::DeviceAPI::Get(dev, true); if (api == nullptr) { return stmt; } diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index 1d12d57d10b42..a4488c8986e65 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -128,11 +128,11 @@ class BuiltinLower : public StmtExprMutator { } Stmt VisitStmt_(const AttrStmtNode* op) final { - if (op->attr_key == attr::device_context_id) { + if (op->attr_key == attr::device_id) { ICHECK(!device_id_.defined()); device_id_ = op->value; return 
this->VisitStmt(op->body); - } else if (op->attr_key == attr::device_context_type) { + } else if (op->attr_key == attr::device_type) { ICHECK(!device_type_.defined()); device_type_ = op->value; return this->VisitStmt(op->body); diff --git a/src/tir/transforms/make_packed_api.cc b/src/tir/transforms/make_packed_api.cc index 3842f3e9a8ee5..0cc0086897d8c 100644 --- a/src/tir/transforms/make_packed_api.cc +++ b/src/tir/transforms/make_packed_api.cc @@ -249,10 +249,10 @@ PrimFunc MakePackedAPI(PrimFunc&& func, int num_unpacked_args) { // Set device context if (vmap.count(device_id.get())) { PrimExpr node = StringImm("default"); - seq_check.push_back(AttrStmt(node, attr::device_context_id, device_id, nop)); - seq_check.push_back(AttrStmt(node, attr::device_context_type, device_type, nop)); + seq_check.push_back(AttrStmt(node, attr::device_id, device_id, nop)); + seq_check.push_back(AttrStmt(node, attr::device_type, device_type, nop)); - if (runtime::DeviceAPI::NeedSetDeviceContext(target_device_type)) { + if (runtime::DeviceAPI::NeedSetDevice(target_device_type)) { Stmt set_device = Evaluate(Call(DataType::Int(32), builtin::tvm_call_packed(), {StringImm(runtime::symbol::tvm_set_device), device_type, device_id})); diff --git a/tests/cpp/container_test.cc b/tests/cpp/container_test.cc index 35fd5b1c45b12..41632ff8d5614 100644 --- a/tests/cpp/container_test.cc +++ b/tests/cpp/container_test.cc @@ -676,7 +676,7 @@ TEST(Optional, PackedCall) { ICHECK_EQ(args[0].type_code(), tcode); }); String s = "xyz"; - auto nd = NDArray::Empty({0, 1}, DataType::Float(32), DLContext{kDLCPU, 0}); + auto nd = NDArray::Empty({0, 1}, DataType::Float(32), DLDevice{kDLCPU, 0}); test_ffi(Optional(nd), static_cast(kTVMNDArrayHandle)); test_ffi(Optional(s), static_cast(kTVMObjectRValueRefArg)); test_ffi(s, static_cast(kTVMObjectHandle)); diff --git a/tests/cpp/contrib/bnns.cc b/tests/cpp/contrib/bnns.cc index 1efd487caff9f..f7d40f176fb69 100644 --- a/tests/cpp/contrib/bnns.cc +++ b/tests/cpp/contrib/bnns.cc @@ -56,7 +56,7 @@ TEST(PackedFunc, Node) { TEST(PackedFunc, NDArray) { using namespace tvm; using namespace tvm::runtime; - auto x = NDArray::Empty({}, String2DLDataType("float32"), TVMContext{kDLCPU, 0}); + auto x = NDArray::Empty({}, String2DLDataType("float32"), Device{kDLCPU, 0}); reinterpret_cast(x->data)[0] = 10.0f; ICHECK(x.use_count() == 1); @@ -191,7 +191,7 @@ TEST(PackedFunc, ObjectConversion) { using namespace tvm::tir; using namespace tvm::runtime; TVMRetValue rv; - auto x = NDArray::Empty({}, String2DLDataType("float32"), TVMContext{kDLCPU, 0}); + auto x = NDArray::Empty({}, String2DLDataType("float32"), Device{kDLCPU, 0}); // assign null rv = ObjectRef(); ICHECK_EQ(rv.type_code(), kTVMNullptr); diff --git a/tests/cpp/packed_func_test.cc b/tests/cpp/packed_func_test.cc index 53a3f40388cbf..cf22577a791a6 100644 --- a/tests/cpp/packed_func_test.cc +++ b/tests/cpp/packed_func_test.cc @@ -64,7 +64,7 @@ TEST(PackedFunc, Node) { TEST(PackedFunc, NDArray) { using namespace tvm; using namespace tvm::runtime; - auto x = NDArray::Empty({}, String2DLDataType("float32"), TVMContext{kDLCPU, 0}); + auto x = NDArray::Empty({}, String2DLDataType("float32"), Device{kDLCPU, 0}); reinterpret_cast(x->data)[0] = 10.0f; ICHECK(x.use_count() == 1); @@ -199,7 +199,7 @@ TEST(PackedFunc, ObjectConversion) { using namespace tvm::tir; using namespace tvm::runtime; TVMRetValue rv; - auto x = NDArray::Empty({}, String2DLDataType("float32"), TVMContext{kDLCPU, 0}); + auto x = NDArray::Empty({}, String2DLDataType("float32"), Device{kDLCPU, 
0}); // assign null rv = ObjectRef(); ICHECK_EQ(rv.type_code(), kTVMNullptr); diff --git a/tests/cpp/profiling.cc b/tests/cpp/profiling.cc index 6ec2fc060f9fc..f770bfda8e5bb 100644 --- a/tests/cpp/profiling.cc +++ b/tests/cpp/profiling.cc @@ -27,11 +27,11 @@ namespace tvm { namespace runtime { TEST(DefaultTimer, Basic) { using namespace tvm::runtime; - DLContext ctx; - ctx.device_type = kDLCPU; - ctx.device_id = 0; + Device dev; + dev.device_type = kDLCPU; + dev.device_id = 0; - Timer t = Timer::Start(ctx); + Timer t = Timer::Start(dev); std::this_thread::sleep_for(std::chrono::milliseconds(10)); t->Stop(); int64_t elapsed = t->SyncAndGetElapsedNanos(); diff --git a/tests/cpp/relay_build_module_test.cc b/tests/cpp/relay_build_module_test.cc index a15cdcd3926b7..74e5289c5093b 100644 --- a/tests/cpp/relay_build_module_test.cc +++ b/tests/cpp/relay_build_module_test.cc @@ -123,10 +123,10 @@ TEST(Relay, BuildModule) { std::string json = json_f(); tvm::runtime::Module mod = mod_f(); // run - auto ctx = A->ctx; + auto dev = A->device; auto pfr = tvm::runtime::Registry::Get("tvm.graph_runtime.create"); ICHECK(mod.defined()) << "Module must be defined"; - tvm::runtime::Module run_mod = (*pfr)(json, mod, (int)ctx.device_type, (int)ctx.device_id); + tvm::runtime::Module run_mod = (*pfr)(json, mod, (int)dev.device_type, (int)dev.device_id); auto set_input_f = run_mod.GetFunction("set_input_zero_copy", false); auto run_f = run_mod.GetFunction("run", false); auto get_output_f = run_mod.GetFunction("get_output", false); diff --git a/tests/crt/memory_test.cc b/tests/crt/memory_test.cc index 101a5f0083949..d876e5c96da96 100644 --- a/tests/crt/memory_test.cc +++ b/tests/crt/memory_test.cc @@ -40,7 +40,7 @@ class MemoryManagerTest : public ::testing::Test { MemoryManagerCreate(&interface, memory_pool, kMemoryPoolSizeBytes, kPageSizeBytesLog); mgr = (MemoryManager*)interface; ASSERT_EQ(kNumUsablePages, mgr->ptable.max_pages); - ctx_ = {kDLCPU, 0}; + dev_ = {kDLCPU, 0}; } unsigned int AddressToPageNumber(void* a) { @@ -52,7 +52,7 @@ class MemoryManagerTest : public ::testing::Test { uint8_t* memory_pool; MemoryManagerInterface* interface; MemoryManager* mgr; - DLContext ctx_; + DLDevice dev_; }; #define EXPECT_PAGE(expected, actual) EXPECT_EQ(expected, AddressToPageNumber(actual)) @@ -64,7 +64,7 @@ TEST_F(MemoryManagerTest, AllocFreeFifo) { void* ptrs[kNumUsablePages]; for (size_t idx = 0; idx < kNumUsablePages; idx++) { void* a; - EXPECT_EQ(interface->Allocate(interface, 1, ctx_, &a), kTvmErrorNoError); + EXPECT_EQ(interface->Allocate(interface, 1, dev_, &a), kTvmErrorNoError); if (i == 0) { EXPECT_PAGE(idx, a); } else { @@ -75,7 +75,7 @@ TEST_F(MemoryManagerTest, AllocFreeFifo) { } for (int idx = kNumUsablePages - 1; idx >= 0; idx--) { - interface->Free(interface, ptrs[idx], ctx_); + interface->Free(interface, ptrs[idx], dev_); EXPECT_EQ(interface->vleak_size, idx); } } diff --git a/tests/micro/qemu/test_zephyr.py b/tests/micro/qemu/test_zephyr.py index 4c8bd5f5dae84..1e7eed7b906a5 100644 --- a/tests/micro/qemu/test_zephyr.py +++ b/tests/micro/qemu/test_zephyr.py @@ -131,11 +131,11 @@ def test_compile_runtime(platform, west_cmd): # NOTE: run test in a nested function so cPython will delete arrays before closing the session. 
def test_basic_add(sess): - A_data = tvm.nd.array(np.array([2, 3], dtype="int8"), ctx=sess.context) + A_data = tvm.nd.array(np.array([2, 3], dtype="int8"), device=sess.device) assert (A_data.asnumpy() == np.array([2, 3])).all() - B_data = tvm.nd.array(np.array([4], dtype="int8"), ctx=sess.context) + B_data = tvm.nd.array(np.array([4], dtype="int8"), device=sess.device) assert (B_data.asnumpy() == np.array([4])).all() - C_data = tvm.nd.array(np.array([0, 0], dtype="int8"), ctx=sess.context) + C_data = tvm.nd.array(np.array([0, 0], dtype="int8"), device=sess.device) assert (C_data.asnumpy() == np.array([0, 0])).all() system_lib = sess.get_system_lib() @@ -153,16 +153,16 @@ def test_platform_timer(platform, west_cmd): # NOTE: run test in a nested function so cPython will delete arrays before closing the session. def test_basic_add(sess): - A_data = tvm.nd.array(np.array([2, 3], dtype="int8"), ctx=sess.context) + A_data = tvm.nd.array(np.array([2, 3], dtype="int8"), device=sess.device) assert (A_data.asnumpy() == np.array([2, 3])).all() - B_data = tvm.nd.array(np.array([4], dtype="int8"), ctx=sess.context) + B_data = tvm.nd.array(np.array([4], dtype="int8"), device=sess.device) assert (B_data.asnumpy() == np.array([4])).all() - C_data = tvm.nd.array(np.array([0, 0], dtype="int8"), ctx=sess.context) + C_data = tvm.nd.array(np.array([0, 0], dtype="int8"), device=sess.device) assert (C_data.asnumpy() == np.array([0, 0])).all() system_lib = sess.get_system_lib() time_eval_f = system_lib.time_evaluator( - "add", sess.context, number=20, repeat=3, min_repeat_ms=40 + "add", sess.device, number=20, repeat=3, min_repeat_ms=40 ) result = time_eval_f(A_data, B_data, C_data) assert (C_data.asnumpy() == np.array([6, 7])).all() @@ -191,7 +191,7 @@ def test_relay(platform, west_cmd): with _make_session(model, target, zephyr_board, west_cmd, mod) as session: graph_mod = tvm.micro.create_local_graph_runtime( - graph, session.get_system_lib(), session.context + graph, session.get_system_lib(), session.device ) graph_mod.set_input(**params) x_in = np.random.randint(10, size=shape[0], dtype=dtype) @@ -264,7 +264,7 @@ def check_result(relay_mod, model, zephyr_board, west_cmd, map_inputs, out_shape with _make_session(model, target, zephyr_board, west_cmd, mod) as session: rt_mod = tvm.micro.create_local_graph_runtime( - graph, session.get_system_lib(), session.context + graph, session.get_system_lib(), session.device ) rt_mod.set_input(**params) for name, data in map_inputs.items(): @@ -276,7 +276,7 @@ def check_result(relay_mod, model, zephyr_board, west_cmd, map_inputs, out_shape results = result if isinstance(result, list) else [result] for idx, shape in enumerate(out_shapes): - out = tvm.nd.empty(shape, ctx=session.context) + out = tvm.nd.empty(shape, device=session.device) out = rt_mod.get_output(idx, out) tvm.testing.assert_allclose(out.asnumpy(), results[idx], rtol=TOL, atol=TOL) diff --git a/tests/micro/qemu/zephyr-runtime/src/main.c b/tests/micro/qemu/zephyr-runtime/src/main.c index e04fc20508b49..bcd82b41c1813 100644 --- a/tests/micro/qemu/zephyr-runtime/src/main.c +++ b/tests/micro/qemu/zephyr-runtime/src/main.c @@ -71,12 +71,12 @@ void TVMPlatformAbort(tvm_crt_error_t error) { K_MEM_POOL_DEFINE(tvm_memory_pool, 64, 1024, 120, 4); -tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLContext ctx, void** out_ptr) { +tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void** out_ptr) { *out_ptr = k_mem_pool_malloc(&tvm_memory_pool, num_bytes); return (*out_ptr == NULL) ? 
kTvmErrorPlatformNoMemory : kTvmErrorNoError; } -tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLContext ctx) { +tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLDevice dev) { k_free(ptr); return kTvmErrorNoError; } diff --git a/tests/python/all-platform-minimal-test/test_minimal_target_codegen_llvm.py b/tests/python/all-platform-minimal-test/test_minimal_target_codegen_llvm.py index 9861a1c39740e..c2097bb15b521 100644 --- a/tests/python/all-platform-minimal-test/test_minimal_target_codegen_llvm.py +++ b/tests/python/all-platform-minimal-test/test_minimal_target_codegen_llvm.py @@ -55,12 +55,12 @@ def check_llvm(): binds = {A: Ab} # BUILD and invoke the kernel. f = tvm.build(s, [A, B, C], "llvm", binds=binds) - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # launch the kernel. n = nn - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) f(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) @@ -96,10 +96,10 @@ def check_llvm(use_file): s[B].pragma(s[B].op.axis[0], "import_llvm", ll_code) # BUILD and invoke the kernel. f = tvm.build(s, [A, B], "llvm") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # launch the kernel. - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1.0) diff --git a/tests/python/all-platform-minimal-test/test_runtime_ndarray.py b/tests/python/all-platform-minimal-test/test_runtime_ndarray.py index bd9fb738ba7b7..00ca0b1af3cf9 100644 --- a/tests/python/all-platform-minimal-test/test_runtime_ndarray.py +++ b/tests/python/all-platform-minimal-test/test_runtime_ndarray.py @@ -24,19 +24,19 @@ @tvm.testing.uses_gpu def test_nd_create(): - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for dtype in ["uint8", "int8", "uint16", "int16", "uint32", "int32", "float32"]: x = np.random.randint(0, 10, size=(3, 4)) x = np.array(x, dtype=dtype) - y = tvm.nd.array(x, ctx=ctx) - z = y.copyto(ctx) + y = tvm.nd.array(x, device=dev) + z = y.copyto(dev) assert y.dtype == x.dtype assert y.shape == x.shape assert isinstance(y, tvm.nd.NDArray) np.testing.assert_equal(x, y.asnumpy()) np.testing.assert_equal(x, z.asnumpy()) # no need here, just to test usability - ctx.sync() + dev.sync() def test_fp16_conversion(): diff --git a/tests/python/all-platform-minimal-test/test_runtime_packed_func.py b/tests/python/all-platform-minimal-test/test_runtime_packed_func.py index c6efbb472c4a4..9318b0cc17832 100644 --- a/tests/python/all-platform-minimal-test/test_runtime_packed_func.py +++ b/tests/python/all-platform-minimal-test/test_runtime_packed_func.py @@ -99,15 +99,15 @@ def myfunc(ss): tvm.runtime.convert(myfunc)(x) -def test_ctx(): - def test_ctx_func(ctx): - assert tvm.gpu(7) == ctx +def test_device(): + def test_device_func(dev): + assert tvm.gpu(7) == dev return tvm.cpu(0) - x = test_ctx_func(tvm.gpu(7)) + x = test_device_func(tvm.gpu(7)) assert x == tvm.cpu(0) x = tvm.opencl(10) - x = tvm.testing.context_test(x, x.device_type, x.device_id) + x = 
tvm.testing.device_test(x, x.device_type, x.device_id) assert x == tvm.opencl(10) @@ -163,4 +163,4 @@ def check(arr): test_convert() test_return_func() test_byte_array() - test_ctx() + test_device() diff --git a/tests/python/contrib/test_bnns/test_onnx_topologies.py b/tests/python/contrib/test_bnns/test_onnx_topologies.py index 86f98eb6e8de0..7c8dfa6b5e9a8 100644 --- a/tests/python/contrib/test_bnns/test_onnx_topologies.py +++ b/tests/python/contrib/test_bnns/test_onnx_topologies.py @@ -115,10 +115,10 @@ def run(mod, target, simplify=True, with_bnns=False): path_dso = temp.relpath(lib_name) graph_module.export_library(path_dso) - ctx = tvm.cpu(0) + dev = tvm.cpu(0) loaded_lib = tvm.runtime.load_module(path_dso) - module = graph_runtime.GraphModule(loaded_lib["default"](ctx)) + module = graph_runtime.GraphModule(loaded_lib["default"](dev)) module.run() return module.get_output(0).asnumpy() diff --git a/tests/python/contrib/test_cblas.py b/tests/python/contrib/test_cblas.py index dd9f7775a7ac0..946d93385e6fb 100644 --- a/tests/python/contrib/test_cblas.py +++ b/tests/python/contrib/test_cblas.py @@ -49,11 +49,11 @@ def verify(target="llvm"): if not tvm.get_global_func(lib.__name__ + ".matmul", True): print("skip because extern function is not available") return - ctx = tvm.cpu(0) + dev = tvm.cpu(0) f = tvm.build(s, [A, B, D, bias], target) - a = tvm.nd.array(np.random.uniform(size=ashape).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=bshape).astype(B.dtype), ctx) - d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=ashape).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=bshape).astype(B.dtype), dev) + d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), dev) bb = 10.0 f(a, b, d, bb) tvm.testing.assert_allclose( @@ -119,11 +119,11 @@ def verify(target="llvm"): if not tvm.get_global_func("tvm.contrib.mkl.matmul_u8s8s32", True): print("skip because extern function is not available") return - ctx = tvm.cpu(0) + dev = tvm.cpu(0) f = tvm.build(s, [A, B, D, bias], target) - a = tvm.nd.array(np.random.randint(low=0, high=50, size=ashape).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.randint(low=0, high=50, size=bshape).astype(B.dtype), ctx) - d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx) + a = tvm.nd.array(np.random.randint(low=0, high=50, size=ashape).astype(A.dtype), dev) + b = tvm.nd.array(np.random.randint(low=0, high=50, size=bshape).astype(B.dtype), dev) + d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), dev) bb = 10 f(a, b, d, bb) tvm.testing.assert_allclose( @@ -171,11 +171,11 @@ def verify(target="llvm"): if not tvm.get_global_func(lib.__name__ + ".matmul", True): print("skip because extern function is not available") return - ctx = tvm.cpu(0) + dev = tvm.cpu(0) f = tvm.build(s, [A, B, D], target) - a = tvm.nd.array(np.random.uniform(size=ashape).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=bshape).astype(B.dtype), ctx) - d = tvm.nd.array(np.zeros((batch, n, m), dtype=D.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=ashape).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=bshape).astype(B.dtype), dev) + d = tvm.nd.array(np.zeros((batch, n, m), dtype=D.dtype), dev) f(a, b, d) tvm.testing.assert_allclose( d.asnumpy(), get_numpy(a.asnumpy(), b.asnumpy(), transa, transb), rtol=1e-5 diff --git a/tests/python/contrib/test_coreml_codegen.py b/tests/python/contrib/test_coreml_codegen.py index dd10b6d9fcbdd..f8baf9f3530d5 100644 --- 
a/tests/python/contrib/test_coreml_codegen.py +++ b/tests/python/contrib/test_coreml_codegen.py @@ -99,13 +99,13 @@ def test_annotate(): @pytest.mark.skipif(not _has_xcode(), reason="Xcode is not available") def test_compile_and_run(): - ctx = tvm.cpu() + dev = tvm.cpu() target = "llvm" tol = 1e-3 with relay.build_config(opt_level=3): lib = relay.build(_create_graph_annotated(), target=target) - m = tvm.contrib.graph_runtime.GraphModule(lib["default"](ctx)) + m = tvm.contrib.graph_runtime.GraphModule(lib["default"](dev)) shape = (10, 10) x_data = np.random.rand(*shape).astype("float32") @@ -114,7 +114,7 @@ def test_compile_and_run(): m.set_input("x", x_data) m.set_input("y", y_data) m.run() - out = tvm.nd.empty(shape, ctx=ctx) + out = tvm.nd.empty(shape, device=dev) out = m.get_output(0, out) expected = (y_data * y_data) - (x_data + x_data) diff --git a/tests/python/contrib/test_coreml_runtime.py b/tests/python/contrib/test_coreml_runtime.py index c0076d6eb12fd..447b412595c30 100644 --- a/tests/python/contrib/test_coreml_runtime.py +++ b/tests/python/contrib/test_coreml_runtime.py @@ -56,7 +56,7 @@ def create_coreml_model(): ) return coremltools.models.MLModel(builder.spec) - def verify(coreml_model, model_path, ctx): + def verify(coreml_model, model_path, dev): coreml_model = create_coreml_model() out_spec = coreml_model.output_description._fd_spec @@ -72,9 +72,9 @@ def verify(coreml_model, model_path, ctx): coreml_outputs = [coreml_model.predict(inputs)[name] for name in out_names] # inference via tvm coreml runtime - runtime = coreml_runtime.create("main", model_path, ctx) + runtime = coreml_runtime.create("main", model_path, dev) for name in inputs: - runtime.set_input(name, tvm.nd.array(inputs[name], ctx)) + runtime.set_input(name, tvm.nd.array(inputs[name], dev)) runtime.invoke() tvm_outputs = [runtime.get_output(i).asnumpy() for i in range(runtime.get_num_outputs())] @@ -89,14 +89,14 @@ def check_remote(coreml_model): ) compiled_model = os.path.basename(compiled_model) remote = rpc.connect(proxy_host, proxy_port, key=key) - ctx = remote.cpu(0) - verify(coreml_model, compiled_model, ctx) + dev = remote.cpu(0) + verify(coreml_model, compiled_model, dev) def check_local(coreml_model): temp = utils.tempdir() compiled_model = xcode.compile_coreml(coreml_model, out_dir=temp.temp_dir) - ctx = tvm.cpu(0) - verify(coreml_model, compiled_model, ctx) + dev = tvm.cpu(0) + verify(coreml_model, compiled_model, dev) coreml_model = create_coreml_model() check_remote(coreml_model) diff --git a/tests/python/contrib/test_cublas.py b/tests/python/contrib/test_cublas.py index 175a747bba428..c4e6f89deadc7 100644 --- a/tests/python/contrib/test_cublas.py +++ b/tests/python/contrib/test_cublas.py @@ -35,11 +35,11 @@ def verify(target="cuda"): if not tvm.get_global_func("tvm.contrib.cublas.matmul", True): print("skip because extern function is not available") return - ctx = tvm.gpu(0) + dev = tvm.gpu(0) f = tvm.build(s, [A, B, C], target) - a = tvm.nd.array(np.random.uniform(0, 128, size=(n, l)).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(0, 128, size=(l, m)).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx) + a = tvm.nd.array(np.random.uniform(0, 128, size=(n, l)).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(0, 128, size=(l, m)).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), dev) f(a, b, c) tvm.testing.assert_allclose( c.asnumpy(), np.dot(a.asnumpy().astype(C.dtype), b.asnumpy().astype(C.dtype)), rtol=rtol @@ -70,7 +70,7 
@@ def verify(target="cuda"): if not tvm.get_global_func("tvm.contrib.cublaslt.matmul", True): print("skip because extern function is not available") return - ctx = tvm.gpu(0) + dev = tvm.gpu(0) f = tvm.build(s, [A, B, C], target) a_old = np.random.uniform(0, 128, size=(n, l)) b_old = np.random.uniform(0, 128, size=(l, m)) @@ -95,9 +95,9 @@ def verify(target="cuda"): ) b_new = b_new.reshape([m, L]) - a = tvm.nd.array(a_new.astype(A.dtype), ctx) - b = tvm.nd.array(b_new.astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros((m, N_out), dtype=C.dtype), ctx) + a = tvm.nd.array(a_new.astype(A.dtype), dev) + b = tvm.nd.array(b_new.astype(B.dtype), dev) + c = tvm.nd.array(np.zeros((m, N_out), dtype=C.dtype), dev) f(a, b, c) # Transform output c from layout CUBLASLT_ORDER_COL32 to row major layout c_out = c.asnumpy() @@ -126,11 +126,11 @@ def verify(target="cuda"): if not tvm.get_global_func("tvm.contrib.cublas.matmul", True): print("skip because extern function is not available") return - ctx = tvm.gpu(0) + dev = tvm.gpu(0) f = tvm.build(s, [A, B, C], target) - a = tvm.nd.array(np.random.uniform(size=(j, n, l)).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(j, l, m)).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros((j, n, m), dtype=C.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=(j, n, l)).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(j, l, m)).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros((j, n, m), dtype=C.dtype), dev) f(a, b, c) tvm.testing.assert_allclose( c.asnumpy(), diff --git a/tests/python/contrib/test_cudnn.py b/tests/python/contrib/test_cudnn.py index 514f529b46923..390179a98293c 100644 --- a/tests/python/contrib/test_cudnn.py +++ b/tests/python/contrib/test_cudnn.py @@ -71,14 +71,14 @@ def verify_conv2d(data_dtype, conv_dtype, tensor_format=0, groups=1): s = te.create_schedule(Y.op) # validation - ctx = tvm.gpu(0) + dev = tvm.gpu(0) f = tvm.build(s, [X, W, Y], "cuda", target_host="llvm", name="conv2d") x_np = np.random.uniform(-1, 1, xshape).astype(data_dtype) w_np = np.random.uniform(-1, 1, wshape).astype(data_dtype) y_np = np.zeros(yshape).astype(data_dtype) - x = tvm.nd.array(x_np, ctx) - w = tvm.nd.array(w_np, ctx) - y = tvm.nd.array(y_np, ctx) + x = tvm.nd.array(x_np, dev) + w = tvm.nd.array(w_np, dev) + y = tvm.nd.array(y_np, dev) if tensor_format == 0: c_np = tvm.topi.testing.conv2d_nchw_python(x_np, w_np, 1, 1, groups=groups) elif tensor_format == 1: @@ -149,14 +149,14 @@ def verify_conv3d(data_dtype, conv_dtype, tensor_format=0, groups=1): s = te.create_schedule(Y.op) # validation - ctx = tvm.gpu(0) + dev = tvm.gpu(0) f = tvm.build(s, [X, W, Y], "cuda", target_host="llvm", name="conv3d") x_np = np.random.uniform(-1, 1, xshape).astype(data_dtype) w_np = np.random.uniform(-1, 1, wshape).astype(data_dtype) y_np = np.zeros(yshape).astype(data_dtype) - x = tvm.nd.array(x_np, ctx) - w = tvm.nd.array(w_np, ctx) - y = tvm.nd.array(y_np, ctx) + x = tvm.nd.array(x_np, dev) + w = tvm.nd.array(w_np, dev) + y = tvm.nd.array(y_np, dev) if tensor_format == 0: c_np = tvm.topi.testing.conv3d_ncdhw_python(x_np, w_np, 1, 1, groups) else: @@ -177,11 +177,11 @@ def verify_softmax(shape, axis, dtype="float32"): B = cudnn.softmax(A, axis) s = te.create_schedule([B.op]) - ctx = tvm.gpu(0) + dev = tvm.gpu(0) a_np = np.random.uniform(size=shape).astype(dtype) b_np = tvm.topi.testing.softmax_python(a_np) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) f = tvm.build(s, [A, B], "cuda", 
target_host="llvm", name="softmax") f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3) @@ -192,13 +192,13 @@ def verify_softmax_4d(shape, dtype="float32"): B = cudnn.softmax(A, axis=1) s = te.create_schedule([B.op]) - ctx = tvm.gpu(0) + dev = tvm.gpu(0) n, c, h, w = shape a_np = np.random.uniform(size=shape).astype(dtype) b_np = tvm.topi.testing.softmax_python(a_np.transpose(0, 2, 3, 1).reshape(h * w, c)) b_np = b_np.reshape(n, h, w, c).transpose(0, 3, 1, 2) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) f = tvm.build(s, [A, B], "cuda", target_host="llvm", name="softmax") f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3) diff --git a/tests/python/contrib/test_edgetpu_runtime.py b/tests/python/contrib/test_edgetpu_runtime.py index 8c6113cee3d47..a3a01604bed14 100644 --- a/tests/python/contrib/test_edgetpu_runtime.py +++ b/tests/python/contrib/test_edgetpu_runtime.py @@ -67,11 +67,11 @@ def check_remote(target_edgetpu=False): # inference via remote tvm tflite runtime server = rpc.Server("localhost") remote = rpc.connect(server.host, server.port) - ctx = remote.cpu(0) + dev = remote.cpu(0) with open(tflite_model_path, "rb") as model_fin: - runtime = tflite_runtime.create(model_fin.read(), ctx) - runtime.set_input(0, tvm.nd.array(tflite_input, ctx)) + runtime = tflite_runtime.create(model_fin.read(), dev) + runtime.set_input(0, tvm.nd.array(tflite_input, dev)) runtime.invoke() out = runtime.get_output(0) np.testing.assert_equal(out.asnumpy(), tflite_output) diff --git a/tests/python/contrib/test_ethosn/infrastructure.py b/tests/python/contrib/test_ethosn/infrastructure.py index cd9e9e91292d6..791ef4a2a1059 100644 --- a/tests/python/contrib/test_ethosn/infrastructure.py +++ b/tests/python/contrib/test_ethosn/infrastructure.py @@ -221,7 +221,7 @@ def run(lib, inputs, outputs, npu=True): def build_and_run( - mod, inputs, outputs, params, ctx=tvm.cpu(), npu=True, expected_host_ops=0, npu_partitions=1 + mod, inputs, outputs, params, device=tvm.cpu(), npu=True, expected_host_ops=0, npu_partitions=1 ): lib = build(mod, params, npu, expected_host_ops, npu_partitions) return run(lib, inputs, outputs, npu) diff --git a/tests/python/contrib/test_gemm_acc16.py b/tests/python/contrib/test_gemm_acc16.py index 5475978cb3cc6..4d2ed795d6e09 100644 --- a/tests/python/contrib/test_gemm_acc16.py +++ b/tests/python/contrib/test_gemm_acc16.py @@ -38,7 +38,7 @@ def verify(target="llvm -mcpu=skylake-avx512"): print("skip because %s is not enabled..." 
% target) return - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) X = te.placeholder((m, k), name="X", dtype="uint8") W = te.placeholder((n, k), name="W", dtype="int8") pc = dot_16x1x16_uint8_int8_int16() @@ -69,7 +69,7 @@ def verify(target="llvm -mcpu=skylake-avx512"): t_sch[t_fc].tensorize(a_yi, pc) # print(tvm.lower(t_sch, [X, packedW, t_fc], simple_mode=True)) t_func = tvm.build(t_sch, [X, packedW, t_fc], target, name="intrinsic") - t_evaluator = t_func.time_evaluator(t_func.entry_name, ctx, number=10) + t_evaluator = t_func.time_evaluator(t_func.entry_name, dev, number=10) # generate the plain data a_ = np.random.uniform(1, 10, size=(m, k)).astype("uint8") @@ -84,9 +84,9 @@ def verify(target="llvm -mcpu=skylake-avx512"): s_idx // 128 * 2 + t_idx ] - x = tvm.nd.array(a_, ctx) - w = tvm.nd.array(packW, ctx) - y = tvm.nd.array(np.zeros((m, n), dtype="int16"), ctx) + x = tvm.nd.array(a_, dev) + w = tvm.nd.array(packW, dev) + y = tvm.nd.array(np.zeros((m, n), dtype="int16"), dev) result = t_evaluator(x, w, y) gops_per_sec = gops_per_mm / result.mean / 1e9 diff --git a/tests/python/contrib/test_gemm_acc32_vnni.py b/tests/python/contrib/test_gemm_acc32_vnni.py index 3e0d5db953799..02538e88c39ec 100644 --- a/tests/python/contrib/test_gemm_acc32_vnni.py +++ b/tests/python/contrib/test_gemm_acc32_vnni.py @@ -48,7 +48,7 @@ def verify(target="llvm -mcpu=cascadelake"): print("skip because %s is not enabled..." % target) return - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) pc = dot_16x1x16_uint8_int8_int32_cascadelake() ak = te.reduce_axis((0, k), name="k") packedW = te.placeholder((n // 16, 16 * (k // 4), 4), name="packedW", dtype="int8") @@ -76,7 +76,7 @@ def verify(target="llvm -mcpu=cascadelake"): t_sch[t_fc].tensorize(a_yi, pc) t_func = tvm.build(t_sch, [X, packedW, t_fc], target, name="intrinsic") - t_evaluator = t_func.time_evaluator(t_func.entry_name, ctx, number=10) + t_evaluator = t_func.time_evaluator(t_func.entry_name, dev, number=10) # generate the plain data a_ = np.random.uniform(1, 10, size=(m, k)).astype("uint8") @@ -91,9 +91,9 @@ def verify(target="llvm -mcpu=cascadelake"): (s_idx // 16) * 4 + t_idx ] - x = tvm.nd.array(a_, ctx) - w = tvm.nd.array(packW, ctx) - y = tvm.nd.array(np.zeros((m, n), dtype="int32"), ctx) + x = tvm.nd.array(a_, dev) + w = tvm.nd.array(packW, dev) + y = tvm.nd.array(np.zeros((m, n), dtype="int32"), dev) result = t_evaluator(x, w, y) gops_per_sec = gops_per_mm / result.mean / 1e9 diff --git a/tests/python/contrib/test_miopen.py b/tests/python/contrib/test_miopen.py index 317736725d086..4847c0e1b7bc9 100644 --- a/tests/python/contrib/test_miopen.py +++ b/tests/python/contrib/test_miopen.py @@ -52,11 +52,11 @@ def test_conv2d(): s = te.create_schedule(Y.op) def verify(): - ctx = tvm.rocm(0) + dev = tvm.rocm(0) f = tvm.build(s, [X, W, Y], "rocm", target_host="llvm", name="conv2d") - x = tvm.nd.array(np.random.uniform(-1, 1, xshape).astype(np.float32), ctx) - w = tvm.nd.array(np.random.uniform(-1, 1, wshape).astype(np.float32), ctx) - y = tvm.nd.array(np.random.uniform(-1, 1, yshape).astype(np.float32), ctx) + x = tvm.nd.array(np.random.uniform(-1, 1, xshape).astype(np.float32), dev) + w = tvm.nd.array(np.random.uniform(-1, 1, wshape).astype(np.float32), dev) + y = tvm.nd.array(np.random.uniform(-1, 1, yshape).astype(np.float32), dev) f(x, w, y) Y_ref = topi.nn.conv2d_nchw( @@ -64,7 +64,7 @@ def verify(): ) s_ref = te.create_schedule(Y_ref.op) f_ref = tvm.build(s_ref, [X, W, Y_ref], "rocm", target_host="llvm") - y_ref = 
tvm.nd.array(np.random.uniform(-1, 1, yshape).astype(np.float32), ctx) + y_ref = tvm.nd.array(np.random.uniform(-1, 1, yshape).astype(np.float32), dev) f_ref(x, w, y_ref) print("Max abs diff:", np.max(np.abs(y.asnumpy() - y_ref.asnumpy()))) tvm.testing.assert_allclose(y.asnumpy(), y_ref.asnumpy(), atol=1e-3) diff --git a/tests/python/contrib/test_mps.py b/tests/python/contrib/test_mps.py index 75a76f02ab40e..597e87778866d 100644 --- a/tests/python/contrib/test_mps.py +++ b/tests/python/contrib/test_mps.py @@ -47,11 +47,11 @@ def verify(A, B, D, s, target="metal"): if not tvm.get_global_func("tvm.contrib.mps.matmul", True): print("skip because extern function is not available") return - ctx = tvm.metal(0) + dev = tvm.metal(0) f = tvm.build(s, [A, B, D], "metal") - a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), dev) f(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()) + 1, rtol=1e-5) @@ -77,11 +77,11 @@ def verify(A, B, C, target="llvm"): if not tvm.get_global_func("tvm.contrib.mps.conv2d", True): print("skip because extern function is not available") return - ctx = tvm.metal(0) + dev = tvm.metal(0) f = tvm.build(s1, [A, B, C], "metal") - a = tvm.nd.array(np.random.uniform(size=(n, h, w, ci)).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(co, kh, kw, ci)).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros((n, h // stride, w // stride, co), dtype=C.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=(n, h, w, ci)).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(co, kh, kw, ci)).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros((n, h // stride, w // stride, co), dtype=C.dtype), dev) f(a, b, c) # print(c.asnumpy()) # print(c.shape) diff --git a/tests/python/contrib/test_mxnet_bridge.py b/tests/python/contrib/test_mxnet_bridge.py index afe739c47dc5b..308bd82988efa 100644 --- a/tests/python/contrib/test_mxnet_bridge.py +++ b/tests/python/contrib/test_mxnet_bridge.py @@ -47,10 +47,10 @@ def mxnet_check(): # get a mxnet version mxf = to_mxnet_func(f, const_loc=[0, 1]) - ctx = mx.gpu(0) - xx = mx.nd.uniform(shape=shape, ctx=ctx) - yy = mx.nd.uniform(shape=shape, ctx=ctx) - zz = mx.nd.empty(shape=shape, ctx=ctx) + dev = mx.gpu(0) + xx = mx.nd.uniform(shape=shape, ctx=dev) + yy = mx.nd.uniform(shape=shape, ctx=dev) + zz = mx.nd.empty(shape=shape, ctx=dev) # invoke myf: this runs in mxnet engine mxf(xx, yy, zz, 10.0) diff --git a/tests/python/contrib/test_nnpack.py b/tests/python/contrib/test_nnpack.py index bcb4358596b26..0208d72119605 100644 --- a/tests/python/contrib/test_nnpack.py +++ b/tests/python/contrib/test_nnpack.py @@ -42,11 +42,11 @@ def verify(target="llvm"): if not nnpack.is_available(): pytest.skip("nnpack is not available") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) f = tvm.build(s, [A, B, D, bias], target) - a = tvm.nd.array(np.random.uniform(size=(l)).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(m, l)).astype(B.dtype), ctx) - d = tvm.nd.array(np.zeros((m,),
dtype=D.dtype), dev) bb = 10.0 f(a, b, d, bb) tvm.testing.assert_allclose(d.asnumpy(), np.dot(a.asnumpy(), b.asnumpy().T) + bb, rtol=1e-5) @@ -111,7 +111,7 @@ def verify(target="llvm", algorithm=nnpack.ConvolutionAlgorithm.AUTO, with_bias= if not nnpack.is_available(): pytest.skip("nnpack is not available") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) output = nnpack.convolution_inference( data, kernel, @@ -127,10 +127,10 @@ def verify(target="llvm", algorithm=nnpack.ConvolutionAlgorithm.AUTO, with_bias= na = np.random.uniform(size=dshape).astype(data.dtype) nb = np.random.uniform(size=kshape).astype(kernel.dtype) nc = np.zeros(bshape, dtype=bias.dtype) - ta = tvm.nd.array(na, ctx) - tb = tvm.nd.array(nb, ctx) - tc = tvm.nd.array(nc, ctx) - td = tvm.nd.array(np.zeros(oshape, dtype=output.dtype), ctx) + ta = tvm.nd.array(na, dev) + tb = tvm.nd.array(nb, dev) + tc = tvm.nd.array(nc, dev) + td = tvm.nd.array(np.zeros(oshape, dtype=output.dtype), dev) f(ta, tb, tc, td) nd = np_conv(np.reshape(na, (BATCH, IC, IH, IW)), nb, PAD, STRIDE) + nc.reshape( 1, bshape[0], 1, 1 @@ -177,7 +177,7 @@ def verify(target="llvm", algorithm=nnpack.ConvolutionAlgorithm.AUTO, with_bias= if not nnpack.is_available(): pytest.skip("nnpack is not available") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) transformed_kernel = nnpack.convolution_inference_weight_transform( kernel, algorithm=algorithm ) @@ -201,10 +201,10 @@ def verify(target="llvm", algorithm=nnpack.ConvolutionAlgorithm.AUTO, with_bias= if with_bias else np.zeros(bshape, dtype=bias.dtype) ) - ta = tvm.nd.array(na, ctx) - tb = tvm.nd.array(nb, ctx) - tc = tvm.nd.array(nc, ctx) - td = tvm.nd.array(np.zeros(oshape, dtype=output.dtype), ctx) + ta = tvm.nd.array(na, dev) + tb = tvm.nd.array(nb, dev) + tc = tvm.nd.array(nc, dev) + td = tvm.nd.array(np.zeros(oshape, dtype=output.dtype), dev) f(ta, tb, tc, td) nd = np_conv(np.reshape(na, (BATCH, IC, IH, IW)), nb, PAD, STRIDE) + nc.reshape( 1, bshape[0], 1, 1 diff --git a/tests/python/contrib/test_onnx.py b/tests/python/contrib/test_onnx.py index 6e9cf3afd5ba7..9b29b33caaf61 100644 --- a/tests/python/contrib/test_onnx.py +++ b/tests/python/contrib/test_onnx.py @@ -48,8 +48,8 @@ def run_onnx(onnx_model, input_data): def run_relay(func, data_tuple): target = "llvm" - ctx = tvm.context("llvm", 0) - intrp = relay.create_executor("graph", ctx=ctx, target=target) + dev = tvm.device("llvm", 0) + intrp = relay.create_executor("graph", device=dev, target=target) relay_res = intrp.evaluate(func)(*data_tuple) result = [] diff --git a/tests/python/contrib/test_onnx_model.py b/tests/python/contrib/test_onnx_model.py index a3f3717e3872f..addb137325509 100644 --- a/tests/python/contrib/test_onnx_model.py +++ b/tests/python/contrib/test_onnx_model.py @@ -58,8 +58,8 @@ def get_data(in_data_shapes, dtype="float32"): def run_relay(mod, params, in_data): target = "llvm" - ctx = tvm.context("llvm", 0) - intrp = relay.create_executor("graph", mod, ctx=ctx, target=target) + dev = tvm.device("llvm", 0) + intrp = relay.create_executor("graph", mod, device=dev, target=target) in_data = [tvm.nd.array(value) for value in in_data.values()] return intrp.evaluate()(*in_data, **params).asnumpy() diff --git a/tests/python/contrib/test_random.py b/tests/python/contrib/test_random.py index fd87a065fffaf..0740521b5fa54 100644 --- a/tests/python/contrib/test_random.py +++ b/tests/python/contrib/test_random.py @@ -35,9 +35,9 @@ def verify(target="llvm"): if not tvm.get_global_func("tvm.contrib.random.randint", True): print("skip because extern function is not 
available") return - ctx = tvm.cpu(0) + dev = tvm.cpu(0) f = tvm.build(s, [A], target) - a = tvm.nd.array(np.zeros((m, n), dtype=A.dtype), ctx) + a = tvm.nd.array(np.zeros((m, n), dtype=A.dtype), dev) f(a) na = a.asnumpy() assert abs(np.mean(na)) < 0.3 @@ -60,9 +60,9 @@ def verify(target="llvm"): if not tvm.get_global_func("tvm.contrib.random.uniform", True): print("skip because extern function is not available") return - ctx = tvm.cpu(0) + dev = tvm.cpu(0) f = tvm.build(s, [A], target) - a = tvm.nd.array(np.zeros((m, n), dtype=A.dtype), ctx) + a = tvm.nd.array(np.zeros((m, n), dtype=A.dtype), dev) f(a) na = a.asnumpy() assert abs(np.mean(na) - 0.5) < 1e-1 @@ -85,9 +85,9 @@ def verify(target="llvm"): if not tvm.get_global_func("tvm.contrib.random.normal", True): print("skip because extern function is not available") return - ctx = tvm.cpu(0) + dev = tvm.cpu(0) f = tvm.build(s, [A], target) - a = tvm.nd.array(np.zeros((m, n), dtype=A.dtype), ctx) + a = tvm.nd.array(np.zeros((m, n), dtype=A.dtype), dev) f(a) na = a.asnumpy() assert abs(np.mean(na) - 3) < 1e-1 @@ -98,12 +98,12 @@ def verify(target="llvm"): @tvm.testing.uses_gpu def test_random_fill(): - def test_local(ctx, dtype): + def test_local(dev, dtype): if not tvm.get_global_func("tvm.contrib.random.random_fill", True): print("skip because extern function is not available") return np_ones = np.ones((512, 512), dtype=dtype) - value = tvm.nd.empty(np_ones.shape, np_ones.dtype, ctx) + value = tvm.nd.empty(np_ones.shape, np_ones.dtype, dev) random_fill = tvm.get_global_func("tvm.contrib.random.random_fill") random_fill(value) @@ -146,8 +146,8 @@ def test_rpc(dtype): "float32", "float64", ]: - for _, ctx in tvm.testing.enabled_targets(): - test_local(ctx, dtype) + for _, dev in tvm.testing.enabled_targets(): + test_local(dev, dtype) test_rpc(dtype) diff --git a/tests/python/contrib/test_rocblas.py b/tests/python/contrib/test_rocblas.py index 6f1783daa74ca..bcbec60265d3b 100644 --- a/tests/python/contrib/test_rocblas.py +++ b/tests/python/contrib/test_rocblas.py @@ -37,11 +37,11 @@ def verify(target="rocm"): if not tvm.get_global_func("tvm.contrib.rocblas.matmul", True): print("skip because extern function is not available") return - ctx = tvm.rocm(0) + dev = tvm.rocm(0) f = tvm.build(s, [A, B, C], target) - a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), dev) f(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-5) @@ -70,11 +70,11 @@ def verify(target="rocm"): if not tvm.get_global_func(lib.__name__ + ".batch_matmul", True): print("skip because extern function is not available") return - ctx = tvm.rocm(0) + dev = tvm.rocm(0) f = tvm.build(s, [A, B, C], target) - a = tvm.nd.array(np.random.uniform(size=ashape).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=bshape).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros((batch, m, n), dtype=C.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=ashape).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=bshape).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros((batch, m, n), dtype=C.dtype), dev) f(a, b, c) tvm.testing.assert_allclose( c.asnumpy(), get_numpy(a.asnumpy(), b.asnumpy(), 
transa, transb), rtol=1e-5 diff --git a/tests/python/contrib/test_sort.py b/tests/python/contrib/test_sort.py index a049602ac265e..cdb3a00dc4921 100644 --- a/tests/python/contrib/test_sort.py +++ b/tests/python/contrib/test_sort.py @@ -48,13 +48,13 @@ def test_sort(): [[3, 4, 4], [2, 3, 3], [1, 2, 2], [0, 1, 1], [4, 0, 0]], ] - ctx = tvm.cpu(0) + dev = tvm.cpu(0) target = "llvm" s = te.create_schedule(out.op) f = tvm.build(s, [data, sort_num, out], target) - a = tvm.nd.array(np.array(input).astype(data.dtype), ctx) - b = tvm.nd.array(np.array(sort_num_input).astype(sort_num.dtype), ctx) - c = tvm.nd.array(np.zeros(a.shape, dtype=out.dtype), ctx) + a = tvm.nd.array(np.array(input).astype(data.dtype), dev) + b = tvm.nd.array(np.array(sort_num_input).astype(sort_num.dtype), dev) + c = tvm.nd.array(np.zeros(a.shape, dtype=out.dtype), dev) f(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), np.array(sorted_index).astype(out.dtype), rtol=1e-5) @@ -76,7 +76,7 @@ def test_sort_np(): name="sort_tensor", ) - ctx = tvm.cpu(0) + dev = tvm.cpu(0) target = "llvm" s = te.create_schedule(out.op) f = tvm.build(s, [data, sort_num, out], target) @@ -84,9 +84,9 @@ def test_sort_np(): np_data = np.random.uniform(size=dshape) np_out = np.argsort(np_data, axis=axis) sort_num_input = np.full(reduced_shape, dshape[axis]) - a = tvm.nd.array(np.array(np_data).astype(data.dtype), ctx) - b = tvm.nd.array(np.array(sort_num_input).astype(sort_num.dtype), ctx) - c = tvm.nd.array(np.zeros(a.shape, dtype=out.dtype), ctx) + a = tvm.nd.array(np.array(np_data).astype(data.dtype), dev) + b = tvm.nd.array(np.array(sort_num_input).astype(sort_num.dtype), dev) + c = tvm.nd.array(np.zeros(a.shape, dtype=out.dtype), dev) f(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), np_out, rtol=1e-5) @@ -103,7 +103,7 @@ def test_sort_by_key_gpu(): with tvm.target.Target(target): keys_out, values_out = sort_by_key(keys, values) - ctx = tvm.context(target) + dev = tvm.device(target) s = te.create_schedule([keys_out.op, values_out.op]) f = tvm.build(s, [keys, values, keys_out, values_out], target) @@ -111,10 +111,10 @@ def test_sort_by_key_gpu(): values_np = np.random.randint(0, 10, size=(size,)).astype(np.int32) keys_np_out = np.zeros(keys_np.shape, np.int32) values_np_out = np.zeros(values_np.shape, np.int32) - keys_in = tvm.nd.array(keys_np, ctx) - values_in = tvm.nd.array(values_np, ctx) - keys_out = tvm.nd.array(keys_np_out, ctx) - values_out = tvm.nd.array(values_np_out, ctx) + keys_in = tvm.nd.array(keys_np, dev) + values_in = tvm.nd.array(values_np, dev) + keys_out = tvm.nd.array(keys_np_out, dev) + values_out = tvm.nd.array(values_np_out, dev) f(keys_in, values_in, keys_out, values_out) ref_keys_out = np.sort(keys_np) diff --git a/tests/python/contrib/test_sparse.py b/tests/python/contrib/test_sparse.py index 39cff18a52122..d9618391ce40c 100644 --- a/tests/python/contrib/test_sparse.py +++ b/tests/python/contrib/test_sparse.py @@ -27,21 +27,21 @@ def test_static_tensor(): dtype = "float32" stype = "csr" target = "llvm" - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) m = te.size_var("m") n = te.size_var("n") A = tvmsp.placeholder(shape=(m, n), name="A", dtype=dtype) assert A.stype == "csr" n = 3 a = np.maximum(np.random.uniform(size=(n, n)).astype(dtype) - 0.6, 0.0) - a = tvmsp.array(a, ctx) + a = tvmsp.array(a, dev) A.data = te.placeholder(a.data.shape, dtype, name="A_data") Ab = tvm.tir.decl_buffer(a.data.shape, dtype, name="A_data") binds = {A.data: Ab} C = te.compute(A.data.shape, lambda i: A.data[i] * 2.0, 
tag="cs_scatter") s = te.create_schedule(C.op) f = tvm.build(s, [A.data, C], target, binds=binds) - c = tvmsp.array(np.zeros((n, n), dtype), ctx) + c = tvmsp.array(np.zeros((n, n), dtype), dev) c.data = tvm.nd.empty(a.data.shape, dtype) c.indices = a.indices c.indptr = a.indptr @@ -53,7 +53,7 @@ def test_dynamic_tensor(): dtype = "float32" stype = "csr" target = "llvm" - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) nr, nc, n = te.size_var("nr"), te.size_var("nc"), te.size_var("n") A = tvmsp.placeholder(shape=(nr, nc), nonzeros=n, name="A", dtype=dtype) assert A.stype == "csr" @@ -61,14 +61,14 @@ def test_dynamic_tensor(): s = te.create_schedule(C.op) _nr, _nc = 3, 5 a = np.maximum(np.random.uniform(size=(_nr, _nc)).astype(dtype) - 0.6, 0.0) - a = tvmsp.array(a, ctx) + a = tvmsp.array(a, dev) assert a.data.dtype == a.dtype Ab = namedtuple("CSRBuffer", ["data", "indices", "indptr"]) Ab.data = tvm.tir.decl_buffer(a.data.shape, a.data.dtype, name="A_data") Ab.indices = tvm.tir.decl_buffer(a.data.shape, a.data.dtype, name="A_indices") binds = {A.data: Ab.data, A.indices: Ab.indices} f = tvm.build(s, [nr, A.data, C], target, binds=binds) - c = tvmsp.array(np.zeros((_nr, _nc), dtype), ctx) + c = tvmsp.array(np.zeros((_nr, _nc), dtype), dev) c.data = tvm.nd.empty(a.data.shape, dtype) c.indices = a.indices c.indptr = a.indptr @@ -80,7 +80,7 @@ def test_sparse_array_tuple(): dtype, itype = "float32", "int32" stype = "csr" target = "llvm" - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) nr, nc, n = te.size_var("nr"), te.size_var("nc"), te.size_var("n") A = tvmsp.placeholder(shape=(nr, nc), nonzeros=n, name="A", dtype=dtype) assert A.stype == "csr" @@ -92,22 +92,22 @@ def test_sparse_array_tuple(): source_array = a ridx, cidx = np.nonzero(source_array) data = source_array[ridx, cidx] - a_data = _nd.array(data, ctx) + a_data = _nd.array(data, dev) indices = np.nonzero(source_array)[1].astype(itype) - a_indices = _nd.array(indices, ctx) + a_indices = _nd.array(indices, dev) indptr = [0] + np.apply_along_axis(np.count_nonzero, axis=1, arr=source_array).tolist() indptr = np.cumsum(np.array(indptr, itype)).astype(itype) - a_indptr = _nd.array(indptr, ctx) + a_indptr = _nd.array(indptr, dev) a_init = (a_data, a_indices, a_indptr) # construct tvm sparse array with tuple - a = tvmsp.array(a_init, shape=source_array.shape, ctx=ctx) + a = tvmsp.array(a_init, shape=source_array.shape, device=dev) assert a.data.dtype == a.dtype Ab = namedtuple("CSRBuffer", ["data", "indices", "indptr"]) Ab.data = tvm.tir.decl_buffer(a.data.shape, a.data.dtype, name="A_data") Ab.indices = tvm.tir.decl_buffer(a.data.shape, a.data.dtype, name="A_indices") binds = {A.data: Ab.data, A.indices: Ab.indices} f = tvm.build(s, [nr, A.data, C], target, binds=binds) - c = tvmsp.array(np.zeros((_nr, _nc), dtype), ctx) + c = tvmsp.array(np.zeros((_nr, _nc), dtype), dev) c.data = tvm.nd.empty(a.data.shape, dtype) c.indices = a.indices c.indptr = a.indptr diff --git a/tests/python/contrib/test_tensorrt.py b/tests/python/contrib/test_tensorrt.py index ae8214d6463c4..0e25ca24b2a60 100644 --- a/tests/python/contrib/test_tensorrt.py +++ b/tests/python/contrib/test_tensorrt.py @@ -97,7 +97,7 @@ def run_and_verify_func(config, target="cuda"): for k, v in input_shapes.items() if k not in is_param } - ctx = tvm.context(target) + dev = tvm.device(target) result_dict = dict() for mode in ["graph", "vm"]: @@ -110,10 +110,10 @@ def run_and_verify_func(config, target="cuda"): with tvm.transform.PassContext( opt_level=3, 
config={"relay.ext.tensorrt.options": config} ): - exec = relay.create_executor(mode, mod=mod, ctx=ctx, target=target) + exec = relay.create_executor(mode, mod=mod, device=dev, target=target) else: with tvm.transform.PassContext(opt_level=3): - exec = relay.create_executor(mode, mod=mod, ctx=ctx, target=target) + exec = relay.create_executor(mode, mod=mod, device=dev, target=target) if not skip_runtime_test(): result_dict[result_key] = exec.evaluate()(**input_dict, **params) @@ -143,10 +143,10 @@ def compile_and_run(mod, params, i_data, mode="vm", use_trt=True): with tvm.transform.PassContext( opt_level=3, config={"relay.ext.tensorrt.options": config} ): - exec = relay.create_executor(mode, mod=mod, ctx=tvm.gpu(0), target="cuda") + exec = relay.create_executor(mode, mod=mod, device=tvm.gpu(0), target="cuda") else: with tvm.transform.PassContext(opt_level=3): - exec = relay.create_executor(mode, mod=mod, ctx=tvm.gpu(0), target="cuda") + exec = relay.create_executor(mode, mod=mod, device=tvm.gpu(0), target="cuda") res = exec.evaluate()(i_data, **params) if not skip_runtime_test() else None return res @@ -198,10 +198,14 @@ def test_tensorrt_simple(): with tvm.transform.PassContext( opt_level=3, config={"relay.ext.tensorrt.options": config} ): - relay_exec = relay.create_executor(mode, mod=mod, ctx=tvm.gpu(0), target="cuda") + relay_exec = relay.create_executor( + mode, mod=mod, device=tvm.gpu(0), target="cuda" + ) else: with tvm.transform.PassContext(opt_level=3): - relay_exec = relay.create_executor(mode, mod=mod, ctx=tvm.gpu(0), target="cuda") + relay_exec = relay.create_executor( + mode, mod=mod, device=tvm.gpu(0), target="cuda" + ) if not skip_runtime_test(): result_dict[result_key] = relay_exec.evaluate()(x_data, y_data, z_data) @@ -243,7 +247,7 @@ def test_tensorrt_not_compatible(): mod, config = tensorrt.partition_for_tensorrt(mod) for mode in ["graph", "vm"]: with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}): - exec = relay.create_executor(mode, mod=mod, ctx=tvm.gpu(0), target="cuda") + exec = relay.create_executor(mode, mod=mod, device=tvm.gpu(0), target="cuda") if not skip_runtime_test(): results = exec.evaluate()(x_data) @@ -269,7 +273,7 @@ def compile_graph(mod, params): return graph, lib, params def run_graph(graph, lib, params): - mod_ = graph_runtime.create(graph, lib, ctx=tvm.gpu(0)) + mod_ = graph_runtime.create(graph, lib, device=tvm.gpu(0)) mod_.load_params(params) mod_.run(data=i_data) res = mod_.get_output(0) @@ -702,7 +706,9 @@ def test_run(x_data_list, x_shape, new_shape, should_offload_to_trt): assert are_ops_on_trt(mod, op_list=["reshape"]) == should_offload_to_trt if not skip_runtime_test(): with relay.build_config(opt_level=3): - relay_exec = relay.create_executor("vm", mod=mod, ctx=tvm.cpu(0), target="llvm") + relay_exec = relay.create_executor( + "vm", mod=mod, device=tvm.cpu(0), target="llvm" + ) for i, x_data in enumerate(x_data_list): result_arr[i][use_trt] = relay_exec.evaluate()(x_data) @@ -1172,7 +1178,7 @@ def test_tensorrt_dynamic_batch(): if not skip_runtime_test(): with relay.build_config(opt_level=3): - relay_exec = relay.create_executor("vm", mod=mod, ctx=tvm.cpu(0), target="llvm") + relay_exec = relay.create_executor("vm", mod=mod, device=tvm.cpu(0), target="llvm") for i, batch_size in enumerate(batches_to_test): result_arr[i][use_trt] = relay_exec.evaluate()(x_data[:batch_size, ...]) @@ -1203,7 +1209,7 @@ def test_tensorrt_dynamic_batch_conv(): if not skip_runtime_test(): with relay.build_config(opt_level=3): 
- relay_exec = relay.create_executor("vm", mod=mod, ctx=tvm.cpu(0), target="llvm") + relay_exec = relay.create_executor("vm", mod=mod, device=tvm.cpu(0), target="llvm") for i, batch_size in enumerate(batches_to_test): result_arr[i][use_trt] = relay_exec.evaluate()(x_data[:batch_size, ...], **params) @@ -1301,8 +1307,8 @@ def get_maskrcnn_input(in_size: int) -> np.ndarray: if skip_runtime_test(): return - ctx = tvm.cpu() - vm = tvm.runtime.vm.VirtualMachine(vm_trt_exec, ctx) + dev = tvm.cpu() + vm = tvm.runtime.vm.VirtualMachine(vm_trt_exec, dev) vm.set_input("main", **{"input0": np_sample_input}) tvm_res = vm.run() @@ -1354,7 +1360,7 @@ def test_empty_subgraph(): x_data = np.random.uniform(-1, 1, x_shape).astype("float32") for mode in ["graph", "vm"]: with tvm.transform.PassContext(opt_level=3): - exec = relay.create_executor(mode, mod=mod, ctx=tvm.gpu(0), target="cuda") + exec = relay.create_executor(mode, mod=mod, device=tvm.gpu(0), target="cuda") if not skip_runtime_test(): results = exec.evaluate()(x_data) diff --git a/tests/python/contrib/test_tflite_runtime.py b/tests/python/contrib/test_tflite_runtime.py index 39d8881f40409..222dcd4694017 100644 --- a/tests/python/contrib/test_tflite_runtime.py +++ b/tests/python/contrib/test_tflite_runtime.py @@ -130,7 +130,6 @@ def test_remote(): # inference via remote tvm tflite runtime server = rpc.Server("localhost") remote = rpc.connect(server.host, server.port) - ctx = remote.cpu(0) a = remote.upload(tflite_model_path) with open(tflite_model_path, "rb") as model_fin: diff --git a/tests/python/contrib/test_thrust.py b/tests/python/contrib/test_thrust.py index 4edce0d6a6424..7b4b3a3840aee 100644 --- a/tests/python/contrib/test_thrust.py +++ b/tests/python/contrib/test_thrust.py @@ -43,7 +43,7 @@ def test_stable_sort_by_key(): print("skip because thrust is not enabled...") return - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) s = te.create_schedule([keys_out.op, values_out.op]) f = tvm.build(s, [keys, values, keys_out, values_out], target) @@ -51,10 +51,10 @@ def test_stable_sort_by_key(): values_np = np.random.randint(0, 10, size=(size,)).astype(np.int32) keys_np_out = np.zeros(keys_np.shape, np.int32) values_np_out = np.zeros(values_np.shape, np.int32) - keys_in = tvm.nd.array(keys_np, ctx) - values_in = tvm.nd.array(values_np, ctx) - keys_out = tvm.nd.array(keys_np_out, ctx) - values_out = tvm.nd.array(values_np_out, ctx) + keys_in = tvm.nd.array(keys_np, dev) + values_in = tvm.nd.array(values_np, dev) + keys_out = tvm.nd.array(keys_np_out, dev) + values_out = tvm.nd.array(values_np_out, dev) f(keys_in, values_in, keys_out, values_out) ref_keys_out = np.sort(keys_np) @@ -80,7 +80,7 @@ def test_exclusive_scan(): scan, reduction = exclusive_scan(values, return_reduction=True) s = schedule_scan([scan, reduction]) - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) f = tvm.build(s, [values, scan, reduction], target) values_np = np.random.randint(0, 10, size=ishape).astype(np.int32) @@ -93,9 +93,9 @@ def test_exclusive_scan(): reduction_np_out = np.zeros(reduction_shape, np.int32) - values_in = tvm.nd.array(values_np, ctx) - values_out = tvm.nd.array(values_np_out, ctx) - reduction_out = tvm.nd.array(reduction_np_out, ctx) + values_in = tvm.nd.array(values_np, dev) + values_out = tvm.nd.array(values_np_out, dev) + reduction_out = tvm.nd.array(reduction_np_out, dev) f(values_in, values_out, reduction_out) ref_values_out = np.cumsum(values_np, axis=-1, dtype="int32") - values_np @@ -123,13 +123,13 @@ def test_inclusive_scan(): 
scan = scan_thrust(values, out_dtype, exclusive=False) s = tvm.te.create_schedule([scan.op]) - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) f = tvm.build(s, [values, scan], target) values_np = np.random.randint(0, 10, size=ishape).astype(np.int32) values_np_out = np.zeros(values_np.shape, out_dtype) - values_in = tvm.nd.array(values_np, ctx) - values_out = tvm.nd.array(values_np_out, ctx) + values_in = tvm.nd.array(values_np, dev) + values_out = tvm.nd.array(values_np_out, dev) f(values_in, values_out) ref_values_out = np.cumsum(values_np, axis=-1, dtype=out_dtype) diff --git a/tests/python/contrib/test_verilator/infrastructure.py b/tests/python/contrib/test_verilator/infrastructure.py index 7e4c297853d50..cf9f8bd4c6bcc 100644 --- a/tests/python/contrib/test_verilator/infrastructure.py +++ b/tests/python/contrib/test_verilator/infrastructure.py @@ -113,6 +113,6 @@ def compile_module(mod): def run_module(exe, inputs): """Run Relay module""" - ctx = tvm.cpu() - vm = runtime.vm.VirtualMachine(exe, ctx) + dev = tvm.cpu() + vm = runtime.vm.VirtualMachine(exe, dev) return vm.run(**inputs) diff --git a/tests/python/contrib/test_vitis_ai/infrastructure.py b/tests/python/contrib/test_vitis_ai/infrastructure.py index df7836a376473..acc538822c3cd 100644 --- a/tests/python/contrib/test_vitis_ai/infrastructure.py +++ b/tests/python/contrib/test_vitis_ai/infrastructure.py @@ -145,7 +145,7 @@ def verify_result( result, tol=1e-5, target="llvm", - ctx=tvm.cpu(), + device=tvm.cpu(), params=None, dpu_target="DPUCADX8G", tvm_ops=0, @@ -154,7 +154,6 @@ def verify_result( lib = build_module(mod, target, params=params, dpu_target=dpu_target, tvm_ops=tvm_ops) lib = update_lib(lib) - ctx = tvm.cpu() rt_mod = graph_runtime.GraphModule(lib["default"](tvm.cpu())) for name, data in map_inputs.items(): @@ -166,6 +165,6 @@ def verify_result( results = result if isinstance(result, list) else [result] for idx, shape in enumerate(out_shapes): - out = tvm.nd.empty(shape, ctx=ctx) + out = tvm.nd.empty(shape, device=device) out = rt_mod.get_output(idx, out) tvm.testing.assert_allclose(out.asnumpy(), results[idx], rtol=tol, atol=tol) diff --git a/tests/python/contrib/test_vitis_ai/test_vitis_ai_runtime_cpu_part.py b/tests/python/contrib/test_vitis_ai/test_vitis_ai_runtime_cpu_part.py index 030dda372cfee..64071325ef52d 100644 --- a/tests/python/contrib/test_vitis_ai/test_vitis_ai_runtime_cpu_part.py +++ b/tests/python/contrib/test_vitis_ai/test_vitis_ai_runtime_cpu_part.py @@ -59,7 +59,7 @@ def test_extern_vitis_ai_resnet18(): mod, params = relay.testing.resnet.get_workload(num_layers=18, batch_size=1) ref_mod, params = relay.testing.resnet.get_workload(num_layers=18, batch_size=1) - ref_ex = relay.create_executor("graph", mod=ref_mod, ctx=tvm.cpu(0)) + ref_ex = relay.create_executor("graph", mod=ref_mod, device=tvm.cpu(0)) i_data = np.random.uniform(0, 1, ishape).astype(dtype) ref_res = ref_ex.evaluate()(i_data, **params) diff --git a/tests/python/frontend/caffe/test_forward.py b/tests/python/frontend/caffe/test_forward.py index d75ecd83a2859..2e8807564239b 100644 --- a/tests/python/frontend/caffe/test_forward.py +++ b/tests/python/frontend/caffe/test_forward.py @@ -201,11 +201,11 @@ def _run_tvm(data, proto_file, blob_file): target = "llvm" target_host = "llvm" - ctx = tvm.cpu(0) + dev = tvm.cpu(0) with tvm.transform.PassContext(opt_level=3): lib = relay.build(mod, target=target, target_host=target_host, params=params) dtype = "float32" - m = graph_runtime.GraphModule(lib["default"](ctx)) + m = 
graph_runtime.GraphModule(lib["default"](dev)) if isinstance(data, (tuple, list)): for idx, d in enumerate(data): m.set_input("data" + str(idx), tvm.nd.array(d.astype(dtype))) diff --git a/tests/python/frontend/caffe2/test_forward.py b/tests/python/frontend/caffe2/test_forward.py index a45b86ca903fa..879613f6bd08b 100644 --- a/tests/python/frontend/caffe2/test_forward.py +++ b/tests/python/frontend/caffe2/test_forward.py @@ -26,7 +26,7 @@ import tvm.testing -def get_tvm_output(model, input_data, target, ctx, output_shape, output_dtype="float32"): +def get_tvm_output(model, input_data, target, device, output_shape, output_dtype="float32"): """ Generic function to execute and get tvm output""" # supporting multiple inputs in caffe2 in a bit tricky, # because the input names can appear at the beginning or end of model.predict_net.external_input @@ -42,7 +42,7 @@ def get_tvm_output(model, input_data, target, ctx, output_shape, output_dtype="f with tvm.transform.PassContext(opt_level=3): lib = relay.build(mod, target, params=params) - m = graph_runtime.GraphModule(lib["default"](ctx)) + m = graph_runtime.GraphModule(lib["default"](device)) # set inputs m.set_input(input_names, tvm.nd.array(input_data.astype(input_data.dtype))) @@ -78,8 +78,8 @@ def verify_caffe2_forward_impl(model, data_shape, out_shape): dtype = "float32" data = np.random.uniform(size=data_shape).astype(dtype) c2_out = get_caffe2_output(model, data, dtype) - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output(model, data, target, ctx, out_shape, dtype) + for target, dev in tvm.testing.enabled_targets(): + tvm_out = get_tvm_output(model, data, target, dev, out_shape, dtype) tvm.testing.assert_allclose(c2_out, tvm_out, rtol=1e-5, atol=1e-5) diff --git a/tests/python/frontend/coreml/test_forward.py b/tests/python/frontend/coreml/test_forward.py index 1d3f6c90a48eb..2c1295b28381b 100644 --- a/tests/python/frontend/coreml/test_forward.py +++ b/tests/python/frontend/coreml/test_forward.py @@ -33,11 +33,11 @@ def get_tvm_output( - func, x, params, target, ctx, out_shape=(1, 1000), input_name="image", dtype="float32" + func, x, params, target, device, out_shape=(1, 1000), input_name="image", dtype="float32" ): with tvm.transform.PassContext(opt_level=3): lib = relay.build(func, target, params=params) - m = graph_runtime.GraphModule(lib["default"](ctx)) + m = graph_runtime.GraphModule(lib["default"](device)) # set inputs m.set_input(input_name, tvm.nd.array(x.astype(dtype))) m.run() @@ -52,10 +52,10 @@ def run_model_checkonly(model_file, model_name="", input_name="image"): shape_dict = {input_name: x.shape} # Some Relay passes change operators on the fly. Ensuring that we generate # new graph for each target. 
- for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): mod, params = relay.frontend.from_coreml(model, shape_dict) - tvm_output = get_tvm_output(mod["main"], x, params, target, ctx) - print(target, ctx, model_name, "prediction id: ", np.argmax(tvm_output.flat)) + tvm_output = get_tvm_output(mod["main"], x, params, target, dev) + print(target, dev, model_name, "prediction id: ", np.argmax(tvm_output.flat)) @tvm.testing.uses_gpu @@ -71,7 +71,7 @@ def test_resnet50_checkonly(): def run_tvm_graph( - coreml_model, target, ctx, input_data, input_name, output_shape, output_dtype="float32" + coreml_model, target, device, input_data, input_name, output_shape, output_dtype="float32" ): """ Generic function to compile on relay and execute on tvm """ if isinstance(input_data, list): @@ -90,7 +90,7 @@ def run_tvm_graph( from tvm.contrib import graph_runtime - m = graph_runtime.GraphModule(lib["default"](ctx)) + m = graph_runtime.GraphModule(lib["default"](device)) # set inputs if isinstance(input_data, list): for i, e in enumerate(input_name): @@ -129,9 +129,9 @@ def verify_AddLayerParams(input_dim, alpha=2): name="Add", alpha=alpha, input_names=["input1", "input2"], output_name="output", mode="ADD" ) model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): out = run_tvm_graph( - model, target, ctx, [a_np1, a_np2], ["input1", "input2"], b_np.shape, dtype + model, target, dev, [a_np1, a_np2], ["input1", "input2"], b_np.shape, dtype ) tvm.testing.assert_allclose(out, b_np, rtol=1e-5) @@ -161,9 +161,9 @@ def verify_MultiplyLayerParams(input_dim, alpha): mode="MULTIPLY", ) model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): out = run_tvm_graph( - model, target, ctx, [a_np1, a_np2], ["input1", "input2"], b_np.shape, dtype + model, target, dev, [a_np1, a_np2], ["input1", "input2"], b_np.shape, dtype ) tvm.testing.assert_allclose(out, b_np, rtol=1e-5) @@ -189,9 +189,9 @@ def verify_ConcatLayerParams(input1_dim, input2_dim): name="Concate", input_names=["input1", "input2"], output_name="output", mode="CONCAT" ) model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): out = run_tvm_graph( - model, target, ctx, [a_np1, a_np2], ["input1", "input2"], b_np.shape, dtype + model, target, dev, [a_np1, a_np2], ["input1", "input2"], b_np.shape, dtype ) tvm.testing.assert_allclose(out, b_np, rtol=1e-5) @@ -226,8 +226,8 @@ def verify_UpsampleLayerParams(input_dim, scale, mode): ) model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): - out = run_tvm_graph(model, target, ctx, a_np, "input", b_np.shape, dtype) + for target, dev in tvm.testing.enabled_targets(): + out = run_tvm_graph(model, target, dev, a_np, "input", b_np.shape, dtype) tvm.testing.assert_allclose(out, b_np, rtol=1e-5) @@ -249,8 +249,8 @@ def verify_l2_normalize(input_dim, eps): builder.add_l2_normalize(name="L2", epsilon=eps, input_name="input", output_name="output") model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): - out = run_tvm_graph(model, target, ctx, a_np, "input", b_np.shape, dtype) + for target, dev in tvm.testing.enabled_targets(): + out = run_tvm_graph(model, target, dev, a_np, "input", b_np.shape, dtype) tvm.testing.assert_allclose(out, b_np, rtol=1e-5) @@ 
-279,8 +279,8 @@ def verify_lrn(input_dim, size, bias, alpha, beta): ) model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): - out = run_tvm_graph(model, target, ctx, a_np, "input", b_np.shape, dtype) + for target, dev in tvm.testing.enabled_targets(): + out = run_tvm_graph(model, target, dev, a_np, "input", b_np.shape, dtype) tvm.testing.assert_allclose(out, b_np, rtol=1e-5) @@ -304,9 +304,9 @@ def verify_average(input_dim1, input_dim2, axis=0): name="MEAN", input_names=["input1", "input2"], output_name="output", mode="AVE" ) model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): out = run_tvm_graph( - model, target, ctx, [a_np1, a_np2], ["input1", "input2"], b_np.shape, dtype + model, target, dev, [a_np1, a_np2], ["input1", "input2"], b_np.shape, dtype ) tvm.testing.assert_allclose(out, b_np, rtol=1e-5) @@ -338,11 +338,11 @@ def verify_max(input_dim): name="Max", input_names=["input1", "input2", "input3"], output_name="output", mode="MAX" ) model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): out = run_tvm_graph( model, target, - ctx, + dev, [a_np1, a_np2, a_np3], ["input1", "input2", "input3"], b_np.shape, @@ -377,11 +377,11 @@ def verify_min(input_dim): name="Min", input_names=["input1", "input2", "input3"], output_name="output", mode="MIN" ) model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): out = run_tvm_graph( model, target, - ctx, + dev, [a_np1, a_np2, a_np3], ["input1", "input2", "input3"], b_np.shape, @@ -408,8 +408,8 @@ def verify_unary_sqrt(input_dim): builder.add_unary(name="sqrt", input_name="input", output_name="output", mode="sqrt") model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): - out = run_tvm_graph(model, target, ctx, [a_np], ["input"], ref_val.shape, dtype) + for target, dev in tvm.testing.enabled_targets(): + out = run_tvm_graph(model, target, dev, [a_np], ["input"], ref_val.shape, dtype) tvm.testing.assert_allclose(out, ref_val, rtol=1e-5) @@ -427,8 +427,8 @@ def verify_unary_rsqrt(input_dim, epsilon=0): ) model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): - out = run_tvm_graph(model, target, ctx, [a_np], ["input"], ref_val.shape, dtype) + for target, dev in tvm.testing.enabled_targets(): + out = run_tvm_graph(model, target, dev, [a_np], ["input"], ref_val.shape, dtype) tvm.testing.assert_allclose(out, ref_val, rtol=1e-5) @@ -446,8 +446,8 @@ def verify_unary_inverse(input_dim, epsilon=0): ) model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): - out = run_tvm_graph(model, target, ctx, [a_np], ["input"], ref_val.shape, dtype) + for target, dev in tvm.testing.enabled_targets(): + out = run_tvm_graph(model, target, dev, [a_np], ["input"], ref_val.shape, dtype) tvm.testing.assert_allclose(out, ref_val, rtol=1e-5) @@ -465,8 +465,8 @@ def verify_unary_power(input_dim, alpha): ) model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): - out = run_tvm_graph(model, target, ctx, [a_np], ["input"], ref_val.shape, dtype) + for target, dev in tvm.testing.enabled_targets(): + out = run_tvm_graph(model, target, dev, [a_np], ["input"], ref_val.shape, dtype) tvm.testing.assert_allclose(out, ref_val, rtol=1e-5) @@ -482,8 
+482,8 @@ def verify_unary_exp(input_dim): builder.add_unary(name="exp", input_name="input", output_name="output", mode="exp") model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): - out = run_tvm_graph(model, target, ctx, [a_np], ["input"], ref_val.shape, dtype) + for target, dev in tvm.testing.enabled_targets(): + out = run_tvm_graph(model, target, dev, [a_np], ["input"], ref_val.shape, dtype) tvm.testing.assert_allclose(out, ref_val, rtol=1e-5) @@ -499,8 +499,8 @@ def verify_unary_log(input_dim): builder.add_unary(name="log", input_name="input", output_name="output", mode="log") model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): - out = run_tvm_graph(model, target, ctx, [a_np], ["input"], ref_val.shape, dtype) + for target, dev in tvm.testing.enabled_targets(): + out = run_tvm_graph(model, target, dev, [a_np], ["input"], ref_val.shape, dtype) tvm.testing.assert_allclose(out, ref_val, rtol=1e-5) @@ -516,8 +516,8 @@ def verify_unary_abs(input_dim): builder.add_unary(name="abs", input_name="input", output_name="output", mode="abs") model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): - out = run_tvm_graph(model, target, ctx, [a_np], ["input"], ref_val.shape, dtype) + for target, dev in tvm.testing.enabled_targets(): + out = run_tvm_graph(model, target, dev, [a_np], ["input"], ref_val.shape, dtype) tvm.testing.assert_allclose(out, ref_val, rtol=1e-5) @@ -535,8 +535,8 @@ def verify_unary_threshold(input_dim, alpha): ) model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): - out = run_tvm_graph(model, target, ctx, [a_np], ["input"], ref_val.shape, dtype) + for target, dev in tvm.testing.enabled_targets(): + out = run_tvm_graph(model, target, dev, [a_np], ["input"], ref_val.shape, dtype) tvm.testing.assert_allclose(out, ref_val, rtol=1e-5) @@ -596,8 +596,8 @@ def _verify_reduce(input_dim, mode, axis, ref_func, dtype="float32"): ) model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): - out = run_tvm_graph(model, target, ctx, [a_np], ["input"], ref_val.shape, dtype) + for target, dev in tvm.testing.enabled_targets(): + out = run_tvm_graph(model, target, dev, [a_np], ["input"], ref_val.shape, dtype) tvm.testing.assert_allclose(out, ref_val, rtol=1e-5, atol=1e-5) dshapes = [[10, 10], [1, 10, 10], [1, 3, 10, 10]] @@ -634,8 +634,8 @@ def verify_reshape(input_dim, target_shape, mode): ) model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): - out = run_tvm_graph(model, target, ctx, [a_np], ["input"], ref_val.shape, dtype) + for target, dev in tvm.testing.enabled_targets(): + out = run_tvm_graph(model, target, dev, [a_np], ["input"], ref_val.shape, dtype) tvm.testing.assert_allclose(out, ref_val, rtol=1e-5) @@ -666,9 +666,9 @@ def verify_split(input_dim, nOutputs): builder.add_split(name="split", input_name="input", output_names=output_names) model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): out = run_tvm_graph( - model, target, ctx, [a_np], ["input"], output_shapes, [dtype] * len(output_shapes) + model, target, dev, [a_np], ["input"], output_shapes, [dtype] * len(output_shapes) ) tvm.testing.assert_allclose(out, ref_val, rtol=1e-5) @@ -721,9 +721,9 @@ def verify_image_scaler(input_dim, blue_bias=0.0, green_bias=0.0, red_bias=0.0, name="add", input_names=["input1", "input2"], 
output_name="output", alpha=0, mode="ADD" ) model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): out = run_tvm_graph( - model, target, ctx, [a_np, a_np], ["input1", "input2"], b_np.shape, dtype + model, target, dev, [a_np, a_np], ["input1", "input2"], b_np.shape, dtype ) tvm.testing.assert_allclose(out, b_np, rtol=1e-5) @@ -769,8 +769,8 @@ def verify_convolution(input_dim, filter, padding): output_name="output", ) model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): - out = run_tvm_graph(model, target, ctx, [a_np], ["input1"], output_shape=None) + for target, dev in tvm.testing.enabled_targets(): + out = run_tvm_graph(model, target, dev, [a_np], ["input1"], output_shape=None) tvm.testing.assert_allclose(out, b_np, rtol=1e-5) diff --git a/tests/python/frontend/darknet/test_forward.py b/tests/python/frontend/darknet/test_forward.py index 1535c3a1b88f1..72457bf5cd469 100644 --- a/tests/python/frontend/darknet/test_forward.py +++ b/tests/python/frontend/darknet/test_forward.py @@ -79,8 +79,8 @@ def _get_tvm_output(net, data, build_dtype="float32", states=None): lib = relay.build(mod, target, params=params) # Execute on TVM - ctx = tvm.cpu(0) - m = graph_runtime.GraphModule(lib["default"](ctx)) + dev = tvm.cpu(0) + m = graph_runtime.GraphModule(lib["default"](dev)) # set inputs m.set_input("data", tvm.nd.array(data.astype(dtype))) if states: diff --git a/tests/python/frontend/keras/test_forward.py b/tests/python/frontend/keras/test_forward.py index 561e444f077fc..bdd7d7b73d53e 100644 --- a/tests/python/frontend/keras/test_forward.py +++ b/tests/python/frontend/keras/test_forward.py @@ -84,12 +84,12 @@ def verify_keras_frontend(keras_model, need_transpose=True, layout="NCHW"): def get_keras_output(xs, dtype="float32"): return keras_model.predict(xs) - def get_tvm_output(xs, target, ctx, dtype="float32"): + def get_tvm_output(xs, target, dev, dtype="float32"): shape_dict = {name: x.shape for (name, x) in zip(keras_model.input_names, xs)} mod, params = relay.frontend.from_keras(keras_model, shape_dict, layout=layout) with tvm.transform.PassContext(opt_level=2): lib = relay.build(mod, target, params=params) - m = graph_runtime.GraphModule(lib["default"](ctx)) + m = graph_runtime.GraphModule(lib["default"](dev)) for name, x in zip(keras_model.input_names, xs): m.set_input(name, tvm.nd.array(x.astype(dtype))) m.run() @@ -104,9 +104,9 @@ def to_channels_last(arr): xs = [np.random.uniform(size=shape, low=-1.0, high=1.0) for shape in in_shapes] keras_out = get_keras_output(xs) keras_out = keras_out if isinstance(keras_out, list) else [keras_out] - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): inputs = [to_channels_first(x) for x in xs] if need_transpose else xs - tvm_out = get_tvm_output(inputs, target, ctx) + tvm_out = get_tvm_output(inputs, target, dev) for kout, tout in zip(keras_out, tvm_out): if need_transpose: tout = to_channels_last(tout) diff --git a/tests/python/frontend/mxnet/test_forward.py b/tests/python/frontend/mxnet/test_forward.py index 3a299618fbb84..19b31d6fd3ed4 100644 --- a/tests/python/frontend/mxnet/test_forward.py +++ b/tests/python/frontend/mxnet/test_forward.py @@ -68,7 +68,7 @@ def get_mxnet_output(symbol, x, dtype="float32"): args, auxs = mod.get_params() return out, args, auxs - def get_tvm_output(symbol, x, args, auxs, target, ctx, dtype="float32"): + def get_tvm_output(symbol, x, args, auxs, 
target, dev, dtype="float32"): shape_dict = {"data": x.shape} if gluon_impl: mod, params = relay.frontend.from_mxnet(symbol, shape_dict) @@ -78,7 +78,7 @@ def get_tvm_output(symbol, x, args, auxs, target, ctx, dtype="float32"): ) with tvm.transform.PassContext(opt_level=3): lib = relay.build(mod, target, params=params) - m = graph_runtime.GraphModule(lib["default"](ctx)) + m = graph_runtime.GraphModule(lib["default"](dev)) # set inputs m.set_input("data", tvm.nd.array(x.astype(dtype))) m.run() @@ -90,14 +90,14 @@ def get_tvm_output(symbol, x, args, auxs, target, ctx, dtype="float32"): x = np.random.uniform(size=data_shape) if gluon_impl: gluon_out, gluon_sym = get_gluon_output(name, x) - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output(gluon_sym, x, None, None, target, ctx, dtype) + for target, dev in tvm.testing.enabled_targets(): + tvm_out = get_tvm_output(gluon_sym, x, None, None, target, dev, dtype) tvm.testing.assert_allclose(gluon_out, tvm_out, rtol=1e-5, atol=1e-5) else: mx_out, args, auxs = get_mxnet_output(mx_symbol, x, dtype) assert "data" not in args - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output(mx_symbol, x, args, auxs, target, ctx, dtype) + for target, dev in tvm.testing.enabled_targets(): + tvm_out = get_tvm_output(mx_symbol, x, args, auxs, target, dev, dtype) tvm.testing.assert_allclose(mx_out, tvm_out, rtol=1e-5, atol=1e-5) @@ -333,9 +333,9 @@ def test_forward_where(): mx_out = mx.nd.where(mx_cond, mx_x, mx_y).asnumpy() mod, _ = relay.frontend.from_mxnet(mx_sym, shapes, args, auxs) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(np_cond, np_x, np_y) tvm.testing.assert_allclose(op_res.asnumpy(), mx_out) @@ -357,9 +357,9 @@ def verify(start, stop, step): ref_res = _mx_symbol(mx.nd, start, stop, step).asnumpy() mx_sym = _mx_symbol(mx.sym, start, stop, step) mod, _ = relay.frontend.from_mxnet(mx_sym, {}) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()() tvm.testing.assert_allclose(op_res.asnumpy(), ref_res) @@ -416,9 +416,9 @@ def test_forward_broadcast_ops(): ref_res = _mx_symbol(mx.nd, op, [mx.nd.array(a_np), mx.nd.array(b_np)]) shapes = {"a": a_shape, "b": b_shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shapes, dtype) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(a_np, b_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -451,9 +451,9 @@ def test_forward_elemwise_ops(): ref_res = op(mx.nd.array(a_np), mx.nd.array(b_np)) shapes = {"a": shape, "b": shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shapes, dtype) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = 
relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(a_np, b_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -500,9 +500,9 @@ def test_forward_unary_ops(): ref_res = _mx_symbol(mx.nd, op, [mx.nd.array(a_np)]) shapes = {"a": shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shapes, dtype) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(a_np) tvm.testing.assert_allclose( op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5, atol=1e-5 @@ -532,9 +532,9 @@ def test_forward_scalar_ops(): ref_res = op(mx.nd.array(a_np), b_scalar) shapes = {"a": a_shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shapes, dtype) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(a_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) for op in ["maximum", "minimum"]: @@ -546,9 +546,9 @@ def test_forward_scalar_ops(): ref_res = _mx_symbol(mx.nd, op, [mx.nd.array(a_np), b_scalar]) shapes = {"a": a_shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shapes, dtype) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(a_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -560,9 +560,9 @@ def verify(shape, axis, begin, end): ref_res = mx.nd.slice_axis(mx.nd.array(data_np), axis, begin, end) mx_sym = mx.sym.slice_axis(mx.sym.var("data"), axis, begin, end) mod, _ = relay.frontend.from_mxnet(mx_sym, {"data": shape}) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(data_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -585,9 +585,9 @@ def verify(x_shape, y_shape, axes): ref_res = mx.nd.slice_like(mx.nd.array(x_np), mx.nd.array(y_np), axes=axes) mx_sym = mx.sym.slice_like(mx.sym.var("x"), mx.sym.var("y"), axes=axes) mod, _ = relay.frontend.from_mxnet(mx_sym, {"x": x_shape, "y": y_shape}) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x_np, y_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -619,9 +619,9 @@ def verify(shape, seq_lengths, use_seq_lengths, seq_axis): mx_sym = mx.sym.SequenceReverse(*mx_sym_args) mod, _ = relay.frontend.from_mxnet(mx_sym, *from_mxnet_args) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = 
relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(*in_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -655,9 +655,9 @@ def test_forward_logistic_regression_output(): ref_res = mx.nd.LogisticRegressionOutput(mx.nd.array(data_np), mx.nd.array(label_np)) shapes = {"data": data_shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shapes, dtype) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(data_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -672,9 +672,9 @@ def verify(a_shape, b_shape, transpose_b=False): ref_res = mx.nd.dot(mx.nd.array(a_np), mx.nd.array(b_np), transpose_b=transpose_b) shapes = {"a": a_shape, "b": b_shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shapes, dtype) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(a_np, b_np) tvm.testing.assert_allclose( op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-05, atol=1e-05 @@ -691,9 +691,9 @@ def verify(shape): ref_res = mx.nd.shape_array(mx.nd.array(x_np)) mx_sym = mx.sym.shape_array(mx.sym.var("x")) mod, _ = relay.frontend.from_mxnet(mx_sym, {"x": shape}) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -713,9 +713,9 @@ def verify(shape, axis): ref_res = mx.nd.squeeze(mx.nd.array(x_np), axis=axis) mx_sym = mx.sym.squeeze(mx.sym.var("x"), axis=axis) mod, _ = relay.frontend.from_mxnet(mx_sym, {"x": shape}) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -733,9 +733,9 @@ def verify(shape, axis, size): mx_sym = _mx_symbol(mx.sym, op, [mx.sym.var("x"), axis, size]) ref_res = _mx_symbol(mx.nd, op, [mx.nd.array(x_np), axis, size]) mod, _ = relay.frontend.from_mxnet(mx_sym, {"x": shape}) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -750,9 +750,9 @@ def verify(input_shape, shape): ref_res = mx.nd.broadcast_to(mx.nd.array(x_np), shape=shape) mx_sym = mx.sym.broadcast_to(mx.sym.var("x"), shape=shape) mod, _ = relay.frontend.from_mxnet(mx_sym, {"x": input_shape}) - for target, ctx in 
tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -768,9 +768,9 @@ def verify(input_shape, like_shape): ref_res = mx.nd.broadcast_like(mx.nd.array(x_np), mx.nd.array(y_np)) mx_sym = mx.sym.broadcast_like(mx.sym.var("x"), mx.sym.var("y")) mod, _ = relay.frontend.from_mxnet(mx_sym, {"x": input_shape, "y": like_shape}) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x_np, y_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -787,9 +787,9 @@ def test_forward_logical_not(): ref_res = mx.nd.logical_not(mx.nd.array(a_np)) shapes = {"a": a_shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shapes, dtype) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(a_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -797,15 +797,15 @@ def test_forward_logical_not(): @tvm.testing.uses_gpu def test_forward_full(): def verify(val, shape, dtype): - ctx = mx.cpu() + dev = mx.cpu() ref_res = mx.nd.full(shape, val, dtype=dtype) mx_sym = mx.sym.full(shape, val, dtype=dtype) mod, _ = relay.frontend.from_mxnet(mx_sym, {}) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): # Skip testing graph runtime because this op will be optimized out # by constant folding. 
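# ---------------------------------------------------------------------------
# Illustrative sketch (assumes the post-rename API; not lifted from the test
# suite, and the helper name `run_identity` is hypothetical): the MXNet hunks
# above all follow one pattern -- `tvm.testing.enabled_targets()` yields
# (target, device) pairs, and `relay.create_executor` now takes the device
# via `device=` where it previously took `ctx=`.
import numpy as np
import tvm
import tvm.testing
from tvm import relay

def run_identity(x_np):
    x = relay.var("x", shape=x_np.shape, dtype=str(x_np.dtype))
    mod = tvm.IRModule.from_expr(relay.Function([x], x))
    for target, dev in tvm.testing.enabled_targets():
        # device= is the renamed keyword; ctx= is the pre-rename spelling.
        intrp = relay.create_executor("debug", mod=mod, device=dev, target=target)
        op_res = intrp.evaluate()(x_np)
        tvm.testing.assert_allclose(op_res.asnumpy(), x_np)
# ---------------------------------------------------------------------------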
for kind in ["debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()() tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -827,9 +827,9 @@ def verify(data_shape, weight_shape): mx.sym.var("x"), mx.sym.var("w"), input_dim=in_dim, output_dim=out_dim ) mod, _ = relay.frontend.from_mxnet(mx_sym, {"x": data_shape, "w": weight_shape}) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x=x_np, w=w_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -854,9 +854,9 @@ def verify(shape, indices_src, axis, mode="clip"): ref_res = mx.nd.take(mx.nd.array(x_np), mx.nd.array(indices_np), axis, mode) mx_sym = mx.sym.take(mx.sym.var("x"), mx.sym.var("y"), axis, mode) mod, _ = relay.frontend.from_mxnet(mx_sym, {"x": shape, "y": indices_np.shape}) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x_np, indices_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -878,9 +878,9 @@ def verify(xshape, yshape, y_data, error=False): mod, _ = relay.frontend.from_mxnet( mx_sym, {"x_data": xshape, "y_data": yshape}, {"x_data": "float32", "y_data": "int32"} ) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x_data, y_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -907,9 +907,9 @@ def verify(shape, transform_type, target_shape): mx_sym = mx.sym.GridGenerator(mx.sym.var("x"), transform_type, target_shape) shape_dict = {"x": x.shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x) tvm.testing.assert_allclose( op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5, atol=1e-5 @@ -929,9 +929,9 @@ def verify(data_shape, grid_shape): mx_sym = mx.sym.BilinearSampler(mx.sym.var("data"), mx.sym.var("grid")) shape_dict = {"data": data.shape, "grid": grid.shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(data, grid) tvm.testing.assert_allclose( op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5, atol=1e-5 @@ -993,10 +993,10 @@ def verify( mx_params[name] = param._reduce() mod, params = relay.frontend.from_mxnet(mx_sym, shape=shape_dict, arg_params=mx_params) - 
for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): # only test graph runtime because debug runtime is too slow for kind in ["graph"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(**inputs, **params) if init_states: assert len(op_res) == len(mx_res) @@ -1028,9 +1028,9 @@ def verify(xshape, yshape, offset=None): mx_sym = mx.sym.Crop(mx.sym.var("x"), mx.sym.var("y"), offset=offset) ref_res = mx.nd.Crop(mx.nd.array(x_data), mx.nd.array(y_data), offset=offset) mod, _ = relay.frontend.from_mxnet(mx_sym, {"x": xshape, "y": yshape}) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) if offset is None or offset == (0, 0): op_res = intrp.evaluate()(x_data, y_data) else: @@ -1051,9 +1051,9 @@ def verify(shape, axis, is_ascend, dtype="float32"): ref_res = mx.nd.argsort(mx.nd.array(x_np), axis=axis, is_ascend=is_ascend, dtype=dtype) mx_sym = mx.sym.argsort(mx.sym.var("x"), axis=axis, is_ascend=is_ascend, dtype=dtype) mod, _ = relay.frontend.from_mxnet(mx_sym, {"x": shape}) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -1082,9 +1082,9 @@ def verify(shape, k, axis, ret_type, is_ascend=None, dtype="float32"): mx.sym.var("x"), k=k, axis=axis, ret_typ=ret_type, is_ascend=is_ascend, dtype=dtype ) mod, _ = relay.frontend.from_mxnet(mx_sym, {"x": shape}) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x_np) if isinstance(ref_res, list): assert len(op_res) == len(ref_res) @@ -1136,12 +1136,12 @@ def verify(shape, use_sequence_length, value, axis, dtype, itype): mx.sym.var("data"), use_sequence_length=use_sequence_length, value=value, axis=axis ) mod, _ = relay.frontend.from_mxnet(mx_sym, {"data": shape}, dtype={"data": dtype}) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: if use_sequence_length is False and kind == "graph": # Disable the test for 'graph' when it's identity. 
continue - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) if use_sequence_length: op_res = intrp.evaluate()(data_np, valid_length_np) else: @@ -1161,9 +1161,9 @@ def verify(shape): ref_res = mx.nd.contrib.div_sqrt_dim(mx.nd.array(x_np)) mx_sym = mx.sym.contrib.div_sqrt_dim(mx.sym.var("x")) mod, _ = relay.frontend.from_mxnet(mx_sym, {"x": shape}) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -1209,9 +1209,9 @@ def verify(shape, axis=1, fix_gamma=False): } mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) # print(mod) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x, gamma, beta, moving_mean, moving_var) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-3) @@ -1233,9 +1233,9 @@ def verify(shape, axis=1, epsilon=1e-5): ) shape_dict = {"x": x.shape, "gamma": gamma.shape, "beta": beta.shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x, gamma, beta) tvm.testing.assert_allclose( op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5, atol=1e-5 @@ -1259,9 +1259,9 @@ def verify(shape, axis=-1): ) shape_dict = {"x": x.shape, "gamma": gamma.shape, "beta": beta.shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x, gamma, beta) tvm.testing.assert_allclose( op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-3, atol=1e-5 @@ -1289,9 +1289,9 @@ def verify(shape, num_groups=1): ) shape_dict = {"x": x.shape, "gamma": gamma.shape, "beta": beta.shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x, gamma, beta) tvm.testing.assert_allclose( op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-3, atol=1e-5 @@ -1312,9 +1312,9 @@ def verify(indices_shape, depth, on_value, off_value, dtype): mx_sym = mx.sym.one_hot(mx.sym.var("x"), depth, on_value, off_value, dtype) shape_dict = {"x": x.shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = 
relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x.astype("float32")) tvm.testing.assert_allclose( op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-3, atol=1e-5 @@ -1440,9 +1440,9 @@ def verify(data_shape, kernel_size, stride, pad, num_filter, is_depthwise=False) ) shape_dict = {"x": x.shape, "weight": weight.shape, "bias": bias.shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x, weight, bias) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-3) @@ -1521,9 +1521,9 @@ def verify(data_shape, kernel_size, stride, pad, num_filter): ) shape_dict = {"x": x.shape, "weight": weight.shape, "bias": bias.shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x, weight, bias) tvm.testing.assert_allclose( op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-3, atol=1e-5 @@ -1556,9 +1556,9 @@ def verify(a_np, b_np): shape_dict = {"a": a_np.shape, "b": b_np.shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["debug", "vm"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(a_np, b_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-3) @@ -1575,9 +1575,9 @@ def verify(from_dtype, to_dtype): shape_dict = {"x": (1, 3, 18)} dtype_dict = {"x": from_dtype} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict, dtype_dict) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "vm", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(from_np) assert op_res.dtype == to_dtype, op_res.dtype tvm.testing.assert_allclose(op_res.asnumpy(), from_np.astype(to_dtype)) @@ -1598,9 +1598,9 @@ def verify(dtypes, cast_narrow, expected_dtype): shape_dict[str(i)] = (1, 3, 18) dtype_dict[str(i)] = dtype mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict, dtype_dict) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "vm", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(*x_nps) for i, res in enumerate(op_res): assert res.dtype == expected_dtype, res.dtype @@ -1623,9 +1623,9 @@ def verify(x, shape, dtype): shapes = {"a": a_np.shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shapes, dtype) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", 
"vm", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(a_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -1664,9 +1664,9 @@ def verify(shape, blocksize=2): "x": x.shape, } mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x) tvm.testing.assert_allclose( op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-3, atol=1e-5 @@ -1685,9 +1685,9 @@ def verify(shape, blocksize=2): "x": x.shape, } mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x) tvm.testing.assert_allclose( op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-3, atol=1e-5 @@ -1723,9 +1723,9 @@ def verify(data_shape, kernel_size, max_displacement, stride1, stride2, pad_size ) shape_dict = {"data1": data1.shape, "data2": data2.shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(data1, data2) tvm.testing.assert_allclose( op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-3, atol=1e-5 @@ -1830,9 +1830,9 @@ def verify(data_shape, start=None, step=None, axis=None): mx_sym = mx.sym.contrib.arange_like(data, **attrs) mod, _ = relay.frontend.from_mxnet(mx_sym, {"data": data_shape}) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()() tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -1852,9 +1852,9 @@ def verify(batch, seq_length, num_heads, head_dim): mx_sym = mx.sym.contrib.interleaved_matmul_selfatt_qk(data, heads=num_heads) mod, _ = relay.frontend.from_mxnet(mx_sym, {"data": data_shape}) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(data_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) @@ -1877,9 +1877,9 @@ def verify(batch, seq_length, num_heads, head_dim): mx_sym = mx.sym.contrib.interleaved_matmul_selfatt_valatt(data, weight, heads=num_heads) mod, _ = relay.frontend.from_mxnet(mx_sym, {"data": data_shape, "weight": weight_shape}) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = 
relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(data=data_np, weight=weight_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) @@ -1930,13 +1930,13 @@ def verify( ) shape_dict = {"data": data_shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): if tvm.contrib.thrust.can_use_thrust( tvm.target.Target(target + " -libs=thrust"), "tvm.contrib.thrust.sort" ): target += " -libs=thrust" for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(data) tvm.testing.assert_allclose( op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-3, atol=1e-5 @@ -1975,9 +1975,9 @@ def verify(data_shape, anchor_shape, stds=[1, 1, 1, 1], clip=-1, in_format="corn ) shape_dict = {"data": data_shape, "anchors": anchor_shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(data, anchors) tvm.testing.assert_allclose( op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-3, atol=1e-5 @@ -2017,9 +2017,9 @@ def verify(data_shape, axis, use_length, length): shape_dict = {"data": data_shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) if use_length: op_res = intrp.evaluate()(x, length) else: @@ -2049,7 +2049,7 @@ def verify(data_shape, axis, use_length, length): @pytest.mark.parametrize("constant_value", [0.0, 3.0]) @tvm.testing.parametrize_targets @pytest.mark.parametrize("kind", ["graph", "vm", "debug"]) -def test_forward_npi_pad(data_shape, pad_width, mode, dtype, constant_value, target, ctx, kind): +def test_forward_npi_pad(data_shape, pad_width, mode, dtype, constant_value, target, dev, kind): data_np = np.random.uniform(size=data_shape).astype(dtype) data = mx.sym.var("data") if mode == "constant": @@ -2061,7 +2061,7 @@ def test_forward_npi_pad(data_shape, pad_width, mode, dtype, constant_value, tar ref_res = np.pad(data_np, mode=mode, pad_width=pad_width) mx_sym = mx.sym.np.pad(data.as_np_ndarray(), mode=mode, pad_width=pad_width) mod, _ = relay.frontend.from_mxnet(mx_sym, {"data": data_shape}, dtype=dtype) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(data_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) @@ -2074,13 +2074,13 @@ def test_forward_npi_pad(data_shape, pad_width, mode, dtype, constant_value, tar @pytest.mark.parametrize("axes", [(1, 0, 2), None]) @tvm.testing.parametrize_targets @pytest.mark.parametrize("kind", ["graph", "vm", "debug"]) -def test_forward_npi_transpose(data_shape, axes, dtype, target, ctx, kind): +def test_forward_npi_transpose(data_shape, axes, dtype, target, dev, kind): data_np = np.random.uniform(size=data_shape).astype(dtype) data = 
mx.sym.var("data") ref_res = mx.np.transpose(mx.np.array(data_np), axes=axes) mx_sym = mx.sym.np.transpose(data.as_np_ndarray(), axes=axes) mod, _ = relay.frontend.from_mxnet(mx_sym, {"data": data_shape}, dtype=dtype) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(data_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) @@ -2098,7 +2098,7 @@ def test_forward_npi_transpose(data_shape, axes, dtype, target, ctx, kind): @pytest.mark.parametrize("dtype", ["float64", "float32", "int64", "int32"]) @tvm.testing.parametrize_targets @pytest.mark.parametrize("kind", ["graph", "vm", "debug"]) -def test_forward_npi_concatenate(data_shape1, data_shape2, axis, dtype, target, ctx, kind): +def test_forward_npi_concatenate(data_shape1, data_shape2, axis, dtype, target, dev, kind): data_np1 = np.random.uniform(size=data_shape1).astype(dtype) data_np2 = np.random.uniform(size=data_shape2).astype(dtype) data1 = mx.sym.var("data1") @@ -2108,7 +2108,7 @@ def test_forward_npi_concatenate(data_shape1, data_shape2, axis, dtype, target, mod, _ = relay.frontend.from_mxnet( mx_sym, shape={"data1": data_shape1, "data2": data_shape2}, dtype=dtype ) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(data_np1, data_np2) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) @@ -2126,7 +2126,7 @@ def test_forward_npi_concatenate(data_shape1, data_shape2, axis, dtype, target, @pytest.mark.parametrize("dtype", ["float64", "float32", "int64", "int32"]) @tvm.testing.parametrize_targets @pytest.mark.parametrize("kind", ["graph", "vm", "debug"]) -def test_forward_npi_stack(data_shape1, data_shape2, axis, dtype, target, ctx, kind): +def test_forward_npi_stack(data_shape1, data_shape2, axis, dtype, target, dev, kind): data_np1 = np.random.uniform(size=data_shape1).astype(dtype) data_np2 = np.random.uniform(size=data_shape2).astype(dtype) data1 = mx.sym.var("data1") @@ -2136,7 +2136,7 @@ def test_forward_npi_stack(data_shape1, data_shape2, axis, dtype, target, ctx, k mod, _ = relay.frontend.from_mxnet( mx_sym, shape={"data1": data_shape1, "data2": data_shape2}, dtype=dtype ) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(data_np1, data_np2) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) @@ -2145,13 +2145,13 @@ def test_forward_npi_stack(data_shape1, data_shape2, axis, dtype, target, ctx, k @pytest.mark.parametrize("dtype", ["float64", "float32", "int64", "int32", "bool"]) @tvm.testing.parametrize_targets @pytest.mark.parametrize("kind", ["graph", "vm", "debug"]) -def test_forward_np_copy(data_shape, dtype, target, ctx, kind): +def test_forward_np_copy(data_shape, dtype, target, dev, kind): data_np = np.random.uniform(size=data_shape).astype(dtype) data = mx.sym.var("data") ref_res = mx.np.copy(mx.np.array(data_np)) mx_sym = mx.sym.np.copy(data.as_np_ndarray()) mod, _ = relay.frontend.from_mxnet(mx_sym, {"data": data_shape}, dtype=dtype) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(data_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) @@ 
-2173,13 +2173,13 @@ def test_forward_np_copy(data_shape, dtype, target, ctx, kind): ((2, 4, 1, 8), (-4, -3, -1, 2, -6), True), ], ) -def test_forward_npx_reshape(data_shape, out_shape, dtype, target, reverse, ctx, kind): +def test_forward_npx_reshape(data_shape, out_shape, dtype, target, reverse, dev, kind): data_np = np.random.uniform(size=data_shape).astype(dtype) data = mx.sym.var("data") ref_res = mx.npx.reshape(mx.np.array(data_np), newshape=out_shape, reverse=reverse) mx_sym = mx.sym.npx.reshape(data.as_np_ndarray(), newshape=out_shape, reverse=reverse) mod, _ = relay.frontend.from_mxnet(mx_sym, {"data": data_shape}, dtype=dtype) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(data_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) @@ -2190,7 +2190,7 @@ def test_forward_npx_reshape(data_shape, out_shape, dtype, target, reverse, ctx, @pytest.mark.parametrize("dtype", ["float64", "float32", "int64", "int32"]) @tvm.testing.parametrize_targets @pytest.mark.parametrize("kind", ["graph", "vm", "debug"]) -def test_forward_npi_binary(data_shape, dtype, target, ctx, kind): +def test_forward_npi_binary(data_shape, dtype, target, dev, kind): ref_ops = [mx.np.power, mx.np.multiply, mx.np.add, mx.np.subtract, mx.np.less] mx_ops = [ mx.sym.np.power, @@ -2214,7 +2214,7 @@ def test_forward_npi_binary(data_shape, dtype, target, ctx, kind): mod, _ = relay.frontend.from_mxnet( mx_sym, shape={"lhs": data_shape, "rhs": data_shape}, dtype=dtype ) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(data_np1, data_np2) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) @@ -2226,7 +2226,7 @@ def test_forward_npi_binary(data_shape, dtype, target, ctx, kind): @tvm.testing.parametrize_targets @pytest.mark.parametrize("scalar", [1.0, 2.0, 3.0, 4.0]) @pytest.mark.parametrize("kind", ["graph", "vm", "debug"]) -def test_forward_npi_binary_scalar(data_shape, dtype, scalar, target, ctx, kind): +def test_forward_npi_binary_scalar(data_shape, dtype, scalar, target, dev, kind): ref_ops = [mx.np.power, mx.np.multiply, mx.np.add, mx.np.subtract, mx.np.true_divide] mx_ops = [ mx.sym.np.power, @@ -2246,7 +2246,7 @@ def test_forward_npi_binary_scalar(data_shape, dtype, scalar, target, ctx, kind) ref_res = ref_op(mx.np.array(data_np1), scalar) mx_sym = mx_op(data1.as_np_ndarray(), scalar) mod, _ = relay.frontend.from_mxnet(mx_sym, shape={"lhs": data_shape}, dtype=dtype) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(data_np1) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) @@ -2257,13 +2257,13 @@ def test_forward_npi_binary_scalar(data_shape, dtype, scalar, target, ctx, kind) @pytest.mark.parametrize("dtype", ["float64", "float32"]) @tvm.testing.parametrize_targets @pytest.mark.parametrize("kind", ["graph", "vm", "debug"]) -def test_forward_npi_tanh(data_shape, dtype, target, ctx, kind): +def test_forward_npi_tanh(data_shape, dtype, target, dev, kind): data_np1 = np.random.uniform(size=data_shape).astype(dtype) data1 = mx.sym.var("data") ref_res = mx.np.tanh(mx.np.array(data_np1)) mx_sym = mx.sym.np.tanh(data1.as_np_ndarray()) mod, _ = relay.frontend.from_mxnet(mx_sym, shape={"data": 
data_shape}, dtype=dtype) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(data_np1) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) @@ -2279,7 +2279,7 @@ def test_forward_npi_tanh(data_shape, dtype, target, ctx, kind): @tvm.testing.parametrize_targets @pytest.mark.parametrize("kind", ["graph", "vm", "debug"]) def test_forward_npi_where_rscalar( - data_shape, cond_shape, data_dtype, cond_dtype, scalar, target, ctx, kind + data_shape, cond_shape, data_dtype, cond_dtype, scalar, target, dev, kind ): if data_dtype == "bool": scalar = scalar == 0.0 @@ -2295,7 +2295,7 @@ def test_forward_npi_where_rscalar( mod, _ = relay.frontend.from_mxnet( mx_sym, shape={"condition": cond_shape, "x": data_shape}, dtype=dtypeDic ) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(cond_np, data_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) @@ -2313,7 +2313,7 @@ def test_forward_npi_where_rscalar( ], ) def test_forward_split_v2( - data_shape, axis, dtype, indices_or_sections, squeeze_axis, target, ctx, kind + data_shape, axis, dtype, indices_or_sections, squeeze_axis, target, dev, kind ): data_np = np.random.uniform(size=data_shape).astype(dtype) data = mx.sym.var("data") @@ -2324,7 +2324,7 @@ def test_forward_split_v2( data.as_nd_ndarray(), indices_or_sections, axis=axis, squeeze_axis=squeeze_axis ) mod, _ = relay.frontend.from_mxnet(mx_sym, {"data": data_shape}, dtype=dtype) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(data_np) op_res_ = [] for arr in op_res: diff --git a/tests/python/frontend/mxnet/test_qnn_ops_utils.py b/tests/python/frontend/mxnet/test_qnn_ops_utils.py index c2e242579f159..fa5c5d9715ccb 100644 --- a/tests/python/frontend/mxnet/test_qnn_ops_utils.py +++ b/tests/python/frontend/mxnet/test_qnn_ops_utils.py @@ -41,7 +41,7 @@ def dequantize_test_driver(in_dtype, quant_args, in_data, verify_output_data): mod = tvm.IRModule.from_expr(mod) with tvm.transform.PassContext(opt_level=3): graph, lib, params = relay.build(mod, "llvm", params=None) - rt_mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0)) + rt_mod = graph_runtime.create(graph, lib, device=tvm.cpu(0)) rt_mod.set_input(input_data=in_data) rt_mod.set_input(**params) rt_mod.run() @@ -120,7 +120,7 @@ def quantize_test_driver(out_dtype, quant_args, in_data, verify_output_data): mod = tvm.IRModule.from_expr(mod) with tvm.transform.PassContext(opt_level=3): graph, lib, params = relay.build(mod, "llvm", params=None) - rt_mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0)) + rt_mod = graph_runtime.create(graph, lib, device=tvm.cpu(0)) rt_mod.set_input(input_data=in_data) rt_mod.set_input(**params) rt_mod.run() diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index ec89a3d844d10..9116b09764091 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -43,7 +43,7 @@ def get_input_data_shape_dict(graph_def, input_data): def get_tvm_output_with_vm( - graph_def, input_data, target, ctx, opset=None, freeze_params=False, convert_to_static=False + graph_def, input_data, target, device, opset=None, freeze_params=False, 
convert_to_static=False ): """ Generic function to execute and get tvm output with vm executor""" if not isinstance(input_data, list): @@ -57,7 +57,7 @@ def get_tvm_output_with_vm( if convert_to_static: mod = relay.transform.DynamicToStatic()(mod) - ex = relay.create_executor("vm", mod=mod, ctx=ctx, target=target) + ex = relay.create_executor("vm", mod=mod, device=device, target=target) result = ex.evaluate()(*input_data, **params) if isinstance(result, tvm.runtime.NDArray): return result.asnumpy() @@ -65,10 +65,12 @@ def get_tvm_output_with_vm( def get_tvm_output( - graph_def, input_data, target, ctx, output_shape=None, output_dtype="float32", opset=None + graph_def, input_data, target, device, output_shape=None, output_dtype="float32", opset=None ): """ Generic function to execute and get tvm output""" + # TODO: Resolve the issues and remove the following lines target = "llvm" + device = tvm.cpu(0) input_names, shape_dict = get_input_data_shape_dict(graph_def, input_data) @@ -76,8 +78,7 @@ def get_tvm_output( with tvm.transform.PassContext(opt_level=1): graph, lib, params = relay.build(mod, target, params=params) - ctx = tvm.cpu(0) - m = graph_runtime.create(graph, lib, ctx) + m = graph_runtime.create(graph, lib, device) # set inputs if isinstance(input_data, list): for i, e in enumerate(input_names): @@ -142,19 +143,19 @@ def verify_with_ort_with_inputs( targets = [tgt for (tgt, _) in tvm.testing.enabled_targets()] for target in targets: - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) if use_vm: tvm_out = get_tvm_output_with_vm( model, inputs, target, - ctx, + dev, opset=opset, freeze_params=freeze_params, convert_to_static=convert_to_static, ) else: - tvm_out = get_tvm_output(model, inputs, target, ctx, out_shape, dtype, opset=opset) + tvm_out = get_tvm_output(model, inputs, target, dev, out_shape, dtype, opset=opset) if not isinstance(tvm_out, list): tvm_out = [tvm_out] if not isinstance(ort_out, list): @@ -233,9 +234,9 @@ def test_reshape(): model = helper.make_model(graph, producer_name="reshape_test") - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): x = np.random.uniform(size=in_shape).astype("int32") - tvm_out = get_tvm_output(model, x, target, ctx, ref_shape, "float32") + tvm_out = get_tvm_output(model, x, target, dev, ref_shape, "float32") tvm.testing.assert_allclose(ref_shape, tvm_out.shape) @@ -269,9 +270,9 @@ def test_double_reshape(): model = helper.make_model(graph, producer_name="reshape_test") - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): x = np.random.uniform(size=in_shape).astype("int32") - tvm_out = get_tvm_output(model, x, target, ctx, ref_shape, "float32") + tvm_out = get_tvm_output(model, x, target, dev, ref_shape, "float32") tvm.testing.assert_allclose(ref_shape, tvm_out.shape) @@ -317,8 +318,8 @@ def _test_expand(name, data, shape, ref_data, dtype="int32"): model = helper.make_model(graph, producer_name=name) - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output_with_vm(model, data, target, ctx, freeze_params=True) + for target, dev in tvm.testing.enabled_targets(): + tvm_out = get_tvm_output_with_vm(model, data, target, dev, freeze_params=True) tvm.testing.assert_allclose(ref_data, tvm_out) in_shape = (3, 1) @@ -409,9 +410,9 @@ def test_shape(): model = helper.make_model(graph, producer_name="shape_test") - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): x = 
np.random.uniform(size=in_shape).astype("int32") - tvm_out = get_tvm_output(model, x, target, ctx, ref_shape, "int32") + tvm_out = get_tvm_output(model, x, target, dev, ref_shape, "int32") tvm.testing.assert_allclose(ref_shape, tvm_out) @@ -438,8 +439,8 @@ def _test_power_iteration(x_shape, y_shape): model = helper.make_model(graph, producer_name="power_test") - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output(model, [x, y], target, ctx, np_res.shape) + for target, dev in tvm.testing.enabled_targets(): + tvm_out = get_tvm_output(model, [x, y], target, dev, np_res.shape) tvm.testing.assert_allclose(np_res, tvm_out, rtol=1e-5, atol=1e-5) @@ -1002,9 +1003,9 @@ def test_onehot(): model = helper.make_model(graph, producer_name="onehot_test") # TODO(jwfromm): Replace test against np with test against onnxrt once we update versions. - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): tvm_out = get_tvm_output_with_vm( - model, [indices_array, np.array([depth]).astype("int32"), values], target, ctx + model, [indices_array, np.array([depth]).astype("int32"), values], target, dev ) tvm.testing.assert_allclose(out_np, tvm_out, rtol=1e-5, atol=1e-5) @@ -1070,7 +1071,7 @@ def test_matmul(): verify_with_ort_with_inputs(model, [a_array, b_array]) -def verify_batch_matmul(a_shape, b_shape, out_shape, target, ctx): +def verify_batch_matmul(a_shape, b_shape, out_shape, target, dev): a_array = np.random.uniform(size=a_shape).astype("float32") b_array = np.random.uniform(size=b_shape).astype("float32") @@ -1092,17 +1093,17 @@ def verify_batch_matmul(a_shape, b_shape, out_shape, target, ctx): # TODO(mbrookhart): enable cuda once VM supports heterogenous execution @tvm.testing.parametrize_targets("llvm") -def test_batch_matmul(target, ctx): - verify_batch_matmul((2, 3, 4, 3), (2, 3, 3, 4), (2, 3, 4, 4), target, ctx) - verify_batch_matmul((2, 4, 3), (3, 4), (2, 4, 4), target, ctx) - verify_batch_matmul((2, 3, 4, 3), (3, 4), (2, 3, 4, 4), target, ctx) +def test_batch_matmul(target, dev): + verify_batch_matmul((2, 3, 4, 3), (2, 3, 3, 4), (2, 3, 4, 4), target, dev) + verify_batch_matmul((2, 4, 3), (3, 4), (2, 4, 4), target, dev) + verify_batch_matmul((2, 3, 4, 3), (3, 4), (2, 3, 4, 4), target, dev) # Test implicit broadcasting. 
- verify_batch_matmul((4, 3), (2, 3, 4), (2, 4, 4), target, ctx) - verify_batch_matmul((2, 4, 3), (1, 3, 4), (2, 4, 4), target, ctx) - verify_batch_matmul((1, 4, 3), (2, 3, 4), (2, 4, 4), target, ctx) + verify_batch_matmul((4, 3), (2, 3, 4), (2, 4, 4), target, dev) + verify_batch_matmul((2, 4, 3), (1, 3, 4), (2, 4, 4), target, dev) + verify_batch_matmul((1, 4, 3), (2, 3, 4), (2, 4, 4), target, dev) -def verify_simple_dynamic_model(a_shape, b_shape, target, ctx): +def verify_simple_dynamic_model(a_shape, b_shape, target, dev): def verify_model(ex, a_shape, b_shape): a_array = np.random.uniform(size=a_shape).astype("float32") b_array = np.random.uniform(size=b_shape).astype("float32") @@ -1139,7 +1140,7 @@ def verify_model(ex, a_shape, b_shape): mod, params = relay.frontend.from_onnx(model, {"a": a_anys, "b": b_anys}) - ex = relay.create_executor("vm", mod=mod, ctx=ctx, target=target) + ex = relay.create_executor("vm", mod=mod, device=dev, target=target) verify_model(ex, a_shape, b_shape) verify_model(ex, [a * 2 for a in a_shape], [b * 2 for b in b_shape]) verify_model(ex, [a * 3 for a in a_shape], [b * 3 for b in b_shape]) @@ -1147,10 +1148,10 @@ def verify_model(ex, a_shape, b_shape): # TODO(mbrookhart): enable cuda once VM supports heterogenous execution @tvm.testing.parametrize_targets("llvm") -def test_batch_matmul_dynamic_model(target, ctx): - verify_simple_dynamic_model((2, 3, 4, 3), (2, 3, 3, 4), target, ctx) - verify_simple_dynamic_model((2, 4, 3), (3, 4), target, ctx) - verify_simple_dynamic_model((2, 3, 4, 3), (3, 4), target, ctx) +def test_batch_matmul_dynamic_model(target, dev): + verify_simple_dynamic_model((2, 3, 4, 3), (2, 3, 3, 4), target, dev) + verify_simple_dynamic_model((2, 4, 3), (3, 4), target, dev) + verify_simple_dynamic_model((2, 3, 4, 3), (3, 4), target, dev) def verify_lrn(shape, nsize, dtype, alpha=None, beta=None, bias=None): @@ -1313,8 +1314,8 @@ def verify_upsample3d_trilinear(): model = helper.make_model(graph, producer_name="upsample_trilinear_test") # TODO(jwfromm): Trilinear upsampling not supported in 1.0.0 onnxruntime. # Replace topi comparison with verify_with_ort once we update. - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output(model, in_array, target, ctx, out_shape, "float32") + for target, dev in tvm.testing.enabled_targets(): + tvm_out = get_tvm_output(model, in_array, target, dev, out_shape, "float32") tvm.testing.assert_allclose(out_array, tvm_out, rtol=1e-5, atol=1e-5) @@ -3878,8 +3879,8 @@ def verify_if(cond_array): # TODO(jwfromm): Onnxruntime 1.0.0 is buggy with If statements. Replace this with # verify_with_ort once we update versions. 
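# ---------------------------------------------------------------------------
# Illustrative sketch (assumes the post-rename API; `vm_add` is a hypothetical
# helper, not part of this test file): the dynamic-model ONNX tests above
# build a "vm" executor the same way, obtaining the device handle with
# `tvm.device(target, 0)` where they previously called `tvm.context(target, 0)`.
import numpy as np
import tvm
from tvm import relay

def vm_add(a_np, b_np, target="llvm"):
    a = relay.var("a", shape=a_np.shape)  # defaults to float32
    b = relay.var("b", shape=b_np.shape)
    mod = tvm.IRModule.from_expr(relay.Function([a, b], a + b))
    dev = tvm.device(target, 0)  # formerly: tvm.context(target, 0)
    ex = relay.create_executor("vm", mod=mod, device=dev, target=target)
    return ex.evaluate()(a_np, b_np).asnumpy()

# e.g. vm_add(np.ones((2, 3), "float32"), np.ones((2, 3), "float32"))
# ---------------------------------------------------------------------------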
- for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output_with_vm(if_model, [cond], target, ctx, freeze_params=True) + for target, dev in tvm.testing.enabled_targets(): + tvm_out = get_tvm_output_with_vm(if_model, [cond], target, dev, freeze_params=True) for i in range(len(tvm_out)): tvm.testing.assert_allclose(correct_out[i], tvm_out[i], rtol=1e-05, atol=1e-05) diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py index 572aa472c5400..6416043068da3 100644 --- a/tests/python/frontend/pytorch/test_forward.py +++ b/tests/python/frontend/pytorch/test_forward.py @@ -206,9 +206,9 @@ def verify_model(model_name, input_data=[], custom_convert_map={}, rtol=1e-5, at compiled_input = dict(zip(input_names, [inp.clone().cpu().numpy() for inp in baseline_input])) with tvm.transform.PassContext(opt_level=3): - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): relay_graph, relay_lib, relay_params = relay.build(mod, target=target, params=params) - relay_model = graph_runtime.create(relay_graph, relay_lib, ctx) + relay_model = graph_runtime.create(relay_graph, relay_lib, dev) relay_model.set_input(**relay_params) for name, inp in compiled_input.items(): relay_model.set_input(name, inp) @@ -2151,9 +2151,9 @@ def verify_model_vm(input_model, ishapes, idtype=None, idata=None, targets=["llv for tgt in targets: print("Running on target", tgt) - ctx = tvm.context(tgt, 0) + dev = tvm.device(tgt, 0) - executor = relay.create_executor("vm", mod=mod, ctx=ctx, target=tgt) + executor = relay.create_executor("vm", mod=mod, device=dev, target=tgt) evaluator = executor.evaluate() # Inference @@ -3589,8 +3589,8 @@ def test_forward_pretrained_bert_base_uncased(): # Execute on TVM # -------------- - ctx = tvm.context(target, 0) - relay_model = graph_runtime.create(relay_graph, relay_lib, ctx) + dev = tvm.device(target, 0) + relay_model = graph_runtime.create(relay_graph, relay_lib, dev) relay_model.set_input(**relay_params) relay_model.set_input(input_1, tokens_tensor) relay_model.set_input(input_2, segments_tensors) diff --git a/tests/python/frontend/pytorch/test_lstm.py b/tests/python/frontend/pytorch/test_lstm.py index 1197990f54bab..9089a83239e46 100644 --- a/tests/python/frontend/pytorch/test_lstm.py +++ b/tests/python/frontend/pytorch/test_lstm.py @@ -222,8 +222,8 @@ def assert_equal(tvm_result, torch_result): ) -def run_and_compare(mod, params, pt_result, target, ctx): - executor = relay.create_executor("vm", mod=mod, ctx=ctx, target=target) +def run_and_compare(mod, params, pt_result, target, device): + executor = relay.create_executor("vm", mod=mod, device=device, target=target) evaluator = executor.evaluate() exec_res = evaluator(**params) @@ -249,7 +249,7 @@ def flatten(nested): def convert_list_to_vmobj(py_lst): def wrap_nd_array(arr): - return tvm.nd.array(arr, ctx=tvm.cpu(0)) + return tvm.nd.array(arr, device=tvm.cpu(0)) mod = tvm.IRModule() prelude = Prelude(mod) @@ -365,6 +365,6 @@ def test_custom_lstm(): else: params[states_name] = states_np - for tgt, ctx in tvm.testing.enabled_targets(): + for tgt, dev in tvm.testing.enabled_targets(): print("Running %s on target %s" % (name, tgt)) - run_and_compare(mod, params, pt_result, target=tgt, ctx=ctx) + run_and_compare(mod, params, pt_result, target=tgt, device=dev) diff --git a/tests/python/frontend/pytorch/test_object_detection.py b/tests/python/frontend/pytorch/test_object_detection.py index a404a88393bcc..3d51f0e586555 100644 --- 
a/tests/python/frontend/pytorch/test_object_detection.py +++ b/tests/python/frontend/pytorch/test_object_detection.py @@ -117,8 +117,8 @@ def compile_and_run_vm(mod, params, data_np, target): with tvm.transform.PassContext(opt_level=3): vm_exec = relay.vm.compile(mod, target=target, params=params) - ctx = tvm.context(target, 0) - vm = VirtualMachine(vm_exec, ctx) + dev = tvm.device(target, 0) + vm = VirtualMachine(vm_exec, dev) vm.set_input("main", **{input_name: data_np}) return vm.run() diff --git a/tests/python/frontend/tensorflow/test_bn_dynamic.py b/tests/python/frontend/tensorflow/test_bn_dynamic.py index ac02ef1469687..81a2db4704e62 100644 --- a/tests/python/frontend/tensorflow/test_bn_dynamic.py +++ b/tests/python/frontend/tensorflow/test_bn_dynamic.py @@ -59,7 +59,7 @@ def verify_fused_batch_norm(shape): constant_graph = graph_util.convert_variables_to_constants(sess, sess.graph_def, ["output"]) for device in ["llvm"]: - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) continue @@ -68,7 +68,7 @@ def verify_fused_batch_norm(shape): graph, lib, params = relay.build(mod, target=device, params=params) from tvm.contrib import graph_runtime - m = graph_runtime.create(graph, lib, ctx) + m = graph_runtime.create(graph, lib, dev) m.set_input(**params) m.set_input("input", data) m.run() diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index 22afe8f88f663..4accd16a25672 100644 --- a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -132,9 +132,9 @@ def run_tvm_graph( mod, params = relay.frontend.from_tensorflow( graph_def, layout=layout, shape=shape_dict, outputs=out_names ) - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) if mode == "debug": - ex = relay.create_executor(mode, mod=mod, ctx=tvm.cpu(), target="llvm") + ex = relay.create_executor(mode, mod=mod, device=tvm.cpu(), target="llvm") inputs = [] for param in mod["main"].params: found = False @@ -167,7 +167,7 @@ def run_tvm_graph( graph, lib, params = relay.build(mod, target, target_host, params) from tvm.contrib import graph_runtime - m = graph_runtime.create(graph, lib, ctx) + m = graph_runtime.create(graph, lib, dev) # set inputs for e, i in zip(input_node, input_data): if e != "": @@ -237,7 +237,7 @@ def name_without_num(name): devices = targets if targets else ["llvm", "cuda"] for device in devices: - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) continue @@ -1689,7 +1689,7 @@ def test_forward_variable(): @tvm.testing.parametrize_targets("llvm", "cuda") -def test_read_variable_op(target, ctx): +def test_read_variable_op(target, dev): """ Read Variable op test """ tf.reset_default_graph() @@ -3717,7 +3717,7 @@ def test_forward_resnetv2(): with tf.Session() as sess: tf_output = run_tf_graph(sess, data, "input_tensor:0", out_node + ":0") for device in ["llvm", "cuda"]: - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) continue @@ -3754,7 +3754,7 @@ def _test_ssd_impl(): ) # TODO(kevinthesun): enable gpu test when VM heterogeneous execution is ready. 
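# ---------------------------------------------------------------------------
# Illustrative sketch (assumes the post-rename API; `run_graph` and its
# argument names are hypothetical): the TensorFlow and PyTorch hunks above
# share this graph-runtime flow, with `graph_runtime.create(graph, lib, dev)`
# now receiving a Device where it previously received a TVMContext.
import tvm
from tvm import relay
from tvm.contrib import graph_runtime

def run_graph(mod, params, input_name, data_np, target="llvm"):
    with tvm.transform.PassContext(opt_level=3):
        graph, lib, built_params = relay.build(mod, target=target, params=params)
    dev = tvm.device(target, 0)  # formerly: tvm.context(target, 0)
    m = graph_runtime.create(graph, lib, dev)
    m.set_input(**built_params)
    m.set_input(input_name, data_np)
    m.run()
    return m.get_output(0).asnumpy()
# ---------------------------------------------------------------------------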
for device in ["llvm"]: - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) continue @@ -3858,8 +3858,8 @@ def _get_tvm_graph_module(graph_def): graph, lib, params = relay.build(mod, target, params=params) from tvm.contrib import graph_runtime - ctx = tvm.cpu(0) - return params, graph_runtime.create(graph, lib, ctx) + dev = tvm.cpu(0) + return params, graph_runtime.create(graph, lib, dev) def _do_tvm_sample(model, data, in_states, params, num_samples): """Sampled from the model""" @@ -5271,7 +5271,7 @@ def test_forward_dynamic_input_shape(): tf_output = run_tf_graph(sess, np_data, "data:0", ["{}:0".format(out_name)]) # TODO(kevinthesun): enable gpu test when VM heterogeneous execution is ready. for device in ["llvm"]: - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) continue diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py index 7c12cd3365cab..05e0f076e0793 100644 --- a/tests/python/frontend/tflite/test_forward.py +++ b/tests/python/frontend/tflite/test_forward.py @@ -189,7 +189,7 @@ def run_tvm_graph( ) if mode in ["debug", "vm"]: - ex = relay.create_executor(mode, mod=mod, ctx=tvm.cpu(), target="llvm") + ex = relay.create_executor(mode, mod=mod, device=tvm.cpu(), target="llvm") inputs = [] for param in mod["main"].params: found = False @@ -207,10 +207,10 @@ def run_tvm_graph( with tvm.transform.PassContext(opt_level=3): lib = relay.build(mod, target, params=params) - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) from tvm.contrib import graph_runtime - m = graph_runtime.GraphModule(lib["default"](ctx)) + m = graph_runtime.GraphModule(lib["default"](dev)) # set inputs for i, e in enumerate(input_node): m.set_input(e, tvm.nd.array(input_data[i].astype(input_data[i].dtype))) @@ -303,7 +303,7 @@ def compare_tflite_with_tvm( tflite_output = run_tflite_graph(tflite_model_buffer, in_data) for device in ["llvm"]: - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) continue diff --git a/tests/python/integration/test_dot.py b/tests/python/integration/test_dot.py index 609b6dedfb3aa..7d4eca4d33d91 100644 --- a/tests/python/integration/test_dot.py +++ b/tests/python/integration/test_dot.py @@ -33,10 +33,10 @@ def test_dot(): def verify(target): f = tvm.driver.build(s, [A, B, C], target) # verify - ctx = tvm.cpu(0) - a = tvm.nd.array(np.random.uniform(size=(nn,)).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(nn,)).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros((), dtype=C.dtype), ctx) + dev = tvm.cpu(0) + a = tvm.nd.array(np.random.uniform(size=(nn,)).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(nn,)).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros((), dtype=C.dtype), dev) f(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-4) diff --git a/tests/python/integration/test_ewise.py b/tests/python/integration/test_ewise.py index dda494d014841..034f89cd3fe2e 100644 --- a/tests/python/integration/test_ewise.py +++ b/tests/python/integration/test_ewise.py @@ -39,13 +39,13 @@ def test_exp(): def check_device(device, host="stackvm"): if not tvm.testing.device_enabled(host): return - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) fexp = 
tvm.build(s, [A, B], device, host, name="myexp") - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) # launch the kernel. n = 1024 - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev) fexp(a, b) tvm.testing.assert_allclose(b.asnumpy(), np.exp(a.asnumpy()), rtol=1e-5) @@ -68,7 +68,7 @@ def run(dtype): bx, tx = s[C].split(C.op.axis[0], factor=num_thread) def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return @@ -87,10 +87,10 @@ def check_device(device): b_np += (b_np < 2.0) * 2 a_np[np.abs(np.fmod(a_np, b_np)) < 1] += 1 - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) - ftimer = fmod.time_evaluator(fmod.entry_name, ctx, number=1) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) + ftimer = fmod.time_evaluator(fmod.entry_name, dev, number=1) tcost = ftimer(a, b, c).mean # fmod(a, b, c) np.testing.assert_allclose(c.asnumpy(), np.mod(a.asnumpy(), b.asnumpy()), rtol=1e-5) @@ -123,16 +123,16 @@ def test_multiple_cache_write(): def check_device(device, host="stackvm"): if not tvm.testing.device_enabled(host): return - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): return func = tvm.build(s, [A0, A1, C], device, host, name="multiple_cache_write") - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) # launch the kernel. n = 1024 - a0 = tvm.nd.array(np.random.uniform(size=n).astype(A0.dtype), ctx) - a1 = tvm.nd.array(np.random.uniform(size=n).astype(A1.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + a0 = tvm.nd.array(np.random.uniform(size=n).astype(A0.dtype), dev) + a1 = tvm.nd.array(np.random.uniform(size=n).astype(A1.dtype), dev) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) func(a0, a1, c) tvm.testing.assert_allclose( c.asnumpy(), a0.asnumpy() + a1.asnumpy() + (a0.asnumpy() * a1.asnumpy()), rtol=1e-5 @@ -156,13 +156,13 @@ def test_log_pow_llvm(): return flog = tvm.build(s, [A, B], "llvm", name="mylog") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # launch the kernel. n = 1028 - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev) repeat = 10 - ftimer = flog.time_evaluator(flog.entry_name, ctx, number=1, repeat=repeat) + ftimer = flog.time_evaluator(flog.entry_name, dev, number=1, repeat=repeat) res = ftimer(a, b) assert len(res.results) == repeat tvm.testing.assert_allclose(b.asnumpy(), np.power(np.log(a.asnumpy()), 2.0), rtol=1e-5) @@ -181,7 +181,7 @@ def run(dtype): bx, tx = s[B].split(B.op.axis[0], factor=num_thread) def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return @@ -192,8 +192,8 @@ def check_device(device): func = tvm.build(s, [A, B], device) # launch the kernel. 
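# ---------------------------------------------------------------------------
# Illustrative sketch (assumes the post-rename API; the schedule here is a
# toy, not one of the kernels under test): the integration tests above place
# NDArrays on, and time kernels against, a device handle, so `tvm.nd.array`
# and `time_evaluator` now take the result of `tvm.device`/`tvm.cpu` rather
# than a context.
import numpy as np
import tvm
import tvm.testing
from tvm import te

def build_and_time_add(n=1024, target="llvm"):
    A = te.placeholder((n,), name="A")
    B = te.compute((n,), lambda i: A[i] + 1.0, name="B")
    s = te.create_schedule(B.op)
    fadd = tvm.build(s, [A, B], target)
    dev = tvm.device(target, 0)
    a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
    b = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev)
    ftimer = fadd.time_evaluator(fadd.entry_name, dev, number=1)
    tcost = ftimer(a, b).mean  # timing also runs the kernel, filling `b`
    tvm.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1.0)
    return tcost
# ---------------------------------------------------------------------------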
n = 1024 - a = tvm.nd.array(np.random.randint(low=0, high=1000, size=n, dtype=A.dtype), ctx) - b = tvm.nd.array(np.zeros(shape=n, dtype=B.dtype), ctx) + a = tvm.nd.array(np.random.randint(low=0, high=1000, size=n, dtype=A.dtype), dev) + b = tvm.nd.array(np.zeros(shape=n, dtype=B.dtype), dev) func(a, b) tvm.testing.assert_allclose( b.asnumpy(), list(map(lambda x: bin(x).count("1"), a.asnumpy())), rtol=1e-5 @@ -233,7 +233,7 @@ def run(dtype): # one line to build the function. def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return @@ -241,10 +241,10 @@ def check_device(device): # launch the kernel. n = 1024 - a = tvm.nd.array((np.random.uniform(size=n) * 256).astype(A.dtype), ctx) - b = tvm.nd.array((np.random.uniform(size=n) * 256).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) - ftimer = fadd.time_evaluator(fadd.entry_name, ctx, number=1) + a = tvm.nd.array((np.random.uniform(size=n) * 256).astype(A.dtype), dev) + b = tvm.nd.array((np.random.uniform(size=n) * 256).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) + ftimer = fadd.time_evaluator(fadd.entry_name, dev, number=1) tcost = ftimer(a, b, c).mean tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy(), rtol=1e-6) @@ -285,13 +285,13 @@ def tvm_callback_cuda_compile(code): # one line to build the function. def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return f = tvm.build(s, [A, B], device) - a = tvm.nd.array((np.random.uniform(size=m) * 256).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(m, dtype=B.dtype), ctx) + a = tvm.nd.array((np.random.uniform(size=m) * 256).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(m, dtype=B.dtype), dev) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 3, rtol=1e-6) diff --git a/tests/python/integration/test_ewise_fpga.py b/tests/python/integration/test_ewise_fpga.py index 2e5364b24331b..fb2c6b1a3db65 100644 --- a/tests/python/integration/test_ewise_fpga.py +++ b/tests/python/integration/test_ewise_fpga.py @@ -45,13 +45,13 @@ def test_exp(): def check_device(device, host="llvm"): if not tvm.testing.device_enabled(device): return - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) fexp = tvm.build(s, [A, B], device, host, name="myexp") - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) # launch the kernel. n = 1024 - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev) fexp(a, b) tvm.testing.assert_allclose(b.asnumpy(), np.exp(a.asnumpy()), rtol=1e-5) @@ -80,15 +80,15 @@ def test_multi_kernel(): def check_device(device, host="llvm"): if not tvm.testing.device_enabled(device): return - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) fadd = tvm.build(s, [A, B, C, D], device, host, name="myadd") - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) # launch the kernel. 
n = 1024 - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) - c = tvm.nd.array(np.random.uniform(size=n).astype(C.dtype), ctx) - d = tvm.nd.array(np.random.uniform(size=n).astype(D.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) + c = tvm.nd.array(np.random.uniform(size=n).astype(C.dtype), dev) + d = tvm.nd.array(np.random.uniform(size=n).astype(D.dtype), dev) fadd(a, b, c, d) tvm.testing.assert_allclose(d.asnumpy(), a.asnumpy() * 2 + b.asnumpy(), rtol=1e-5) diff --git a/tests/python/integration/test_gemm.py b/tests/python/integration/test_gemm.py index 42612c262f182..5faacde30b587 100644 --- a/tests/python/integration/test_gemm.py +++ b/tests/python/integration/test_gemm.py @@ -79,7 +79,7 @@ def test_gemm(): # one line to build the function. def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return @@ -93,12 +93,12 @@ def check_device(device): l = n a_np = np.random.uniform(size=(n, l)).astype(A.dtype) b_np = np.random.uniform(size=(m, l)).astype(B.dtype) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx) - ftimer = f.time_evaluator(f.entry_name, ctx, number=1) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), dev) + ftimer = f.time_evaluator(f.entry_name, dev, number=1) tcost = ftimer(a, b, c).mean - print("%s: exec=%g sec/op" % (ctx, tcost)) + print("%s: exec=%g sec/op" % (dev, tcost)) tvm.testing.assert_allclose(c.asnumpy(), np.dot(a_np, b_np.T), rtol=1e-5) check_device("vulkan") diff --git a/tests/python/integration/test_reduce.py b/tests/python/integration/test_reduce.py index e978b83aabd6f..cf140be94b86a 100644 --- a/tests/python/integration/test_reduce.py +++ b/tests/python/integration/test_reduce.py @@ -41,7 +41,7 @@ def test_prim(reducer, np_reducer): # one line to build the function. def check_device(device, host="llvm"): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return @@ -49,8 +49,8 @@ def check_device(device, host="llvm"): # launch the kernel. n = 1028 m = 129 - x = tvm.nd.array(np.random.uniform(size=(n, m)).astype(A.dtype), ctx) - y = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx) + x = tvm.nd.array(np.random.uniform(size=(n, m)).astype(A.dtype), dev) + y = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev) freduce(x, y) npy = y.asnumpy() npy[:2] = 0 @@ -80,13 +80,13 @@ def test_init_imm(): def check_target(target="llvm"): if not tvm.runtime.enabled(target): return - ctx = tvm.cpu(0) + dev = tvm.cpu(0) fapi = tvm.lower(s, args=[A, B]) fsum = tvm.build(fapi, target=target, name="mysum") # launch the kernel. 
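Module.time_evaluator keeps its signature through the rename; only the handle argument changes spelling. A minimal sketch of the timing idiom these tests use (the add-one kernel and the run counts are illustrative, not from the patch):

import numpy as np
import tvm
from tvm import te

n = 1024
A = te.placeholder((n,), name="A")
B = te.compute((n,), lambda i: A[i] + 1.0, name="B")
s = te.create_schedule(B.op)
fadd = tvm.build(s, [A, B], "llvm", name="addone")

dev = tvm.device("llvm", 0)
a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
b = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev)

# time_evaluator binds the entry function to the renamed device handle and
# returns a callable whose result carries per-run timing statistics.
ftimer = fadd.time_evaluator(fadd.entry_name, dev, number=1, repeat=10)
res = ftimer(a, b)
print("%s: exec=%g sec/op" % (dev, res.mean))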
n = 1027 - a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros((), dtype=B.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros((), dtype=B.dtype), dev) fsum(a, b) res = 10.0 + np.sum(a.asnumpy(), axis=0) tvm.testing.assert_allclose(b.asnumpy(), res, rtol=1e-4) @@ -108,16 +108,16 @@ def test_init(): def check_target(target="llvm"): if not tvm.runtime.enabled(target): return - ctx = tvm.cpu(0) + dev = tvm.cpu(0) fapi = tvm.lower(s, args=[A, C, I, B]) print(fapi) mmult = tvm.build(fapi, target=target, name="mmult") # launch the kernel. n = 1027 - a = tvm.nd.array(np.random.uniform(size=(n, n)).astype(A.dtype), ctx) - c = tvm.nd.array(np.random.uniform(size=(n, n)).astype(C.dtype), ctx) - ii = tvm.nd.array(np.random.uniform(size=(n, n)).astype(B.dtype), ctx) - b = tvm.nd.array(np.zeros((n, n), dtype=B.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=(n, n)).astype(A.dtype), dev) + c = tvm.nd.array(np.random.uniform(size=(n, n)).astype(C.dtype), dev) + ii = tvm.nd.array(np.random.uniform(size=(n, n)).astype(B.dtype), dev) + b = tvm.nd.array(np.zeros((n, n), dtype=B.dtype), dev) mmult(a, c, ii, b) res = ii.asnumpy() + np.matmul(a.asnumpy(), c.asnumpy()) tvm.testing.assert_allclose(b.asnumpy(), res, rtol=1e-4) @@ -139,13 +139,13 @@ def test_rfactor(): def check_target(target="llvm"): if not tvm.testing.device_enabled(target): return - ctx = tvm.cpu(0) + dev = tvm.cpu(0) fapi = tvm.lower(s, args=[A, B]) fsum = tvm.build(fapi, target=target, name="mysum") # launch the kernel. n = 1027 - a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros((), dtype=B.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros((), dtype=B.dtype), dev) fsum(a, b) res = np.sum(a.asnumpy(), axis=0) tvm.testing.assert_allclose(b.asnumpy(), res, rtol=1e-4) @@ -170,16 +170,16 @@ def test_rfactor_init(): def check_target(target="llvm"): if not tvm.runtime.enabled(target): return - ctx = tvm.cpu(0) + dev = tvm.cpu(0) fapi = tvm.lower(s, args=[A, C, I, B]) print(fapi) mmult = tvm.build(fapi, target=target, name="mmult") # launch the kernel. n = 1027 - a = tvm.nd.array(np.random.uniform(size=(n, n)).astype(A.dtype), ctx) - c = tvm.nd.array(np.random.uniform(size=(n, n)).astype(C.dtype), ctx) - ii = tvm.nd.array(np.random.uniform(size=(n, n)).astype(B.dtype), ctx) - b = tvm.nd.array(np.zeros((n, n), dtype=B.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=(n, n)).astype(A.dtype), dev) + c = tvm.nd.array(np.random.uniform(size=(n, n)).astype(C.dtype), dev) + ii = tvm.nd.array(np.random.uniform(size=(n, n)).astype(B.dtype), dev) + b = tvm.nd.array(np.zeros((n, n), dtype=B.dtype), dev) mmult(a, c, ii, b) res = ii.asnumpy() + np.matmul(a.asnumpy(), c.asnumpy()) tvm.testing.assert_allclose(b.asnumpy(), res, rtol=1e-4) @@ -201,13 +201,13 @@ def test_rfactor_factor_axis(): def check_target(target="llvm"): if not tvm.testing.device_enabled(target): return - ctx = tvm.cpu(0) + dev = tvm.cpu(0) fapi = tvm.lower(s, args=[A, B]) fsum = tvm.build(fapi, target=target, name="mysum") # launch the kernel. 
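test_init_imm above lowers and builds a zero-rank reduction and runs it on the CPU handle. The same flow in standalone form, for reference (a scalar sum with an initial value of 10.0, as in that test; only the handle name is new):

import numpy as np
import tvm
import tvm.testing
from tvm import te

n = 1027
A = te.placeholder((n,), name="A")
k = te.reduce_axis((0, n), name="k")
# A zero-rank (scalar) output, matching np.zeros(()) in the test.
B = te.compute((), lambda: 10.0 + te.sum(A[k], axis=k), name="B")
s = te.create_schedule(B.op)

# tvm.lower yields the lowered function that tvm.build compiles; on this
# path only the CPU handle changes its name (dev, formerly ctx).
fapi = tvm.lower(s, args=[A, B])
fsum = tvm.build(fapi, target="llvm", name="mysum")

dev = tvm.cpu(0)
a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), dev)
b = tvm.nd.array(np.zeros((), dtype=B.dtype), dev)
fsum(a, b)
tvm.testing.assert_allclose(b.asnumpy(), 10.0 + np.sum(a.asnumpy(), axis=0), rtol=1e-4)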
n = 1027 - a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros((), dtype=B.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros((), dtype=B.dtype), dev) fsum(a, b) res = np.sum(a.asnumpy(), axis=0) tvm.testing.assert_allclose(b.asnumpy(), res, rtol=1e-4) @@ -240,7 +240,7 @@ def test_rfactor_threads(): # one line to build the function. def check_target(device, host="stackvm"): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return @@ -250,8 +250,8 @@ def check_target(device, host="stackvm"): # launch the kernel. n = nn m = mm - a = tvm.nd.array(np.random.uniform(size=(m, n)).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(m, dtype=B.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=(m, n)).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(m, dtype=B.dtype), dev) fsum(a, b) res = np.sum(a.asnumpy(), axis=1) res[:2] = 0 @@ -294,15 +294,15 @@ def test_rfactor_elemwise_threads(): # one line to build the function. def check_target(device, host="stackvm"): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return fapi = tvm.lower(s, args=[A, C]) fsum = tvm.build(fapi, target=device, name="mysum") # launch the kernel. - a = tvm.nd.array(np.random.uniform(size=(m, n)).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(m, dtype=B.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=(m, n)).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(m, dtype=B.dtype), dev) fsum(a, b) res = np.sum(a.asnumpy(), axis=1) + 2 tvm.testing.assert_allclose(b.asnumpy(), res, rtol=1e-4) @@ -337,7 +337,7 @@ def check_target(): if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) fapi = tvm.lower(s, args=[idx, val, T0, T1]) fargmax = tvm.build(fapi, target="llvm", name="argmax") @@ -347,10 +347,10 @@ def check_target(): np_val = np.random.uniform(size=(mm, nn)).astype("float32") np_res = np.argmax(np_val, axis=1) - nd_idx = tvm.nd.array(np_idx, ctx) - nd_val = tvm.nd.array(np_val, ctx) - nd_res0 = tvm.nd.array(np.zeros(mm, dtype="int32"), ctx) - nd_res1 = tvm.nd.array(np.zeros(mm, dtype="float32"), ctx) + nd_idx = tvm.nd.array(np_idx, dev) + nd_val = tvm.nd.array(np_val, dev) + nd_res0 = tvm.nd.array(np.zeros(mm, dtype="int32"), dev) + nd_res1 = tvm.nd.array(np.zeros(mm, dtype="float32"), dev) fargmax(nd_idx, nd_val, nd_res0, nd_res1) tvm.testing.assert_allclose(np_res, nd_res0.asnumpy()) @@ -393,7 +393,7 @@ def fidentity(t0, t1): s[B0].set_store_predicate(thread_x.var.equal(0)) def check_target(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." 
% device) return @@ -404,10 +404,10 @@ def check_target(device): np_val = np.random.uniform(size=(mm, nn)).astype("float32") np_res = np.argmax(np_val, axis=1) - nd_idx = tvm.nd.array(np_idx, ctx) - nd_val = tvm.nd.array(np_val, ctx) - nd_res0 = tvm.nd.array(np.zeros(mm, dtype="int32"), ctx) - nd_res1 = tvm.nd.array(np.zeros(mm, dtype="float32"), ctx) + nd_idx = tvm.nd.array(np_idx, dev) + nd_val = tvm.nd.array(np_val, dev) + nd_res0 = tvm.nd.array(np.zeros(mm, dtype="int32"), dev) + nd_res1 = tvm.nd.array(np.zeros(mm, dtype="float32"), dev) fargmax(nd_idx, nd_val, nd_res0, nd_res1) tvm.testing.assert_allclose(np_res, nd_res0.asnumpy()) @@ -425,7 +425,7 @@ def test_warp_reduction1(): thread_y = te.thread_axis((0, nthy), "threadIdx.y") def check_target(device, m, n): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return @@ -450,8 +450,8 @@ def check_target(device, m, n): func = tvm.build(s, [A, B], device, name="warp_reduction") a_np = np.random.uniform(size=(m, n)).astype(A.dtype) b_np = np.zeros((m,), dtype=A.dtype) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) b_np = np.max(a_np, axis=1) func(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3, atol=1e-3) @@ -488,7 +488,7 @@ def fidentity(t0, t1): thread_y = te.thread_axis((0, nthdy), "threadIdx.y") def check_target(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return @@ -502,15 +502,15 @@ def check_target(device): s[T0].bind(xo, block_x) # validation - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) a0_np = np.random.uniform(size=(m, n)).astype(A0.dtype) a1_np = np.random.uniform(size=(m, n)).astype(A1.dtype) t0_np = np.zeros((m,), dtype=A0.dtype) t1_np = np.zeros((m,), dtype=A1.dtype) - a0 = tvm.nd.array(a0_np, ctx) - a1 = tvm.nd.array(a1_np, ctx) - t0 = tvm.nd.array(t0_np, ctx) - t1 = tvm.nd.array(t1_np, ctx) + a0 = tvm.nd.array(a0_np, dev) + a1 = tvm.nd.array(a1_np, dev) + t0 = tvm.nd.array(t0_np, dev) + t1 = tvm.nd.array(t1_np, dev) func = tvm.build(s, [A0, A1, T0, T1], device, name="reduction") func(a0, a1, t0, t1) t0_np = np.sum(a0_np, axis=1) diff --git a/tests/python/integration/test_scan.py b/tests/python/integration/test_scan.py index 73be68c3b6e2a..54a8f1e92ed15 100644 --- a/tests/python/integration/test_scan.py +++ b/tests/python/integration/test_scan.py @@ -49,7 +49,7 @@ def test_scan(): # one line to build the function. def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." 
% device) return @@ -58,8 +58,8 @@ def check_device(device): n = 1024 m = 10 a_np = np.random.uniform(size=(m, n)).astype(res.dtype) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros((m, n), dtype=res.dtype), ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros((m, n), dtype=res.dtype), dev) fscan(a, b) tvm.testing.assert_allclose(b.asnumpy(), np.cumsum(a_np, axis=0)) diff --git a/tests/python/integration/test_tuning.py b/tests/python/integration/test_tuning.py index 813352c52096e..170b4709262b8 100644 --- a/tests/python/integration/test_tuning.py +++ b/tests/python/integration/test_tuning.py @@ -142,7 +142,7 @@ def get_sample_task(target=tvm.target.cuda(), target_host=None): @tvm.testing.parametrize_targets("cuda", "opencl") -def test_tuning_gpu(target, ctx): +def test_tuning_gpu(target, dev): # init task task, target = get_sample_task(target, None) logging.info("task config space: %s", task.config_space) diff --git a/tests/python/integration/test_winograd_nnpack.py b/tests/python/integration/test_winograd_nnpack.py index d326993750503..aa018616c87ff 100644 --- a/tests/python/integration/test_winograd_nnpack.py +++ b/tests/python/integration/test_winograd_nnpack.py @@ -74,7 +74,7 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skipping %s because it is not enabled" % device) print("Running on target: %s" % device) @@ -86,10 +86,10 @@ def check_device(device): C = topi.nn.relu(C) s = topi.generic.schedule_conv2d_nchw([C]) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: func = tvm.build( s, diff --git a/tests/python/nightly/quantization/test_quantization_accuracy.py b/tests/python/nightly/quantization/test_quantization_accuracy.py index 55971de17b03a..4ecd5dce46498 100644 --- a/tests/python/nightly/quantization/test_quantization_accuracy.py +++ b/tests/python/nightly/quantization/test_quantization_accuracy.py @@ -92,11 +92,13 @@ def get_model(model_name, batch_size, qconfig, target=None, original=False, simu return qfunc -def eval_acc(model, dataset, batch_fn, target=tvm.target.cuda(), ctx=tvm.gpu(), log_interval=100): +def eval_acc( + model, dataset, batch_fn, target=tvm.target.cuda(), device=tvm.gpu(), log_interval=100 +): with tvm.transform.PassContext(opt_level=3): graph, lib, params = relay.build(model, target) # create runtime module - m = tvm.contrib.graph_runtime.create(graph, lib, ctx) + m = tvm.contrib.graph_runtime.create(graph, lib, device) m.set_input(**params) # setup evaluation metric diff --git a/tests/python/relay/benchmarking/benchmark_vm.py b/tests/python/relay/benchmarking/benchmark_vm.py index 02e6ae6f51e55..c4695f971e97f 100644 --- a/tests/python/relay/benchmarking/benchmark_vm.py +++ b/tests/python/relay/benchmarking/benchmark_vm.py @@ -37,20 +37,20 @@ def benchmark_execution( model="unknown", ): def get_graph_runtime_output( - mod, data, params, target, ctx, dtype="float32", number=2, repeat=20 + mod, data, params, target, dev, dtype="float32", number=2, repeat=20 ): with tvm.transform.PassContext(opt_level=3): lib = relay.build(mod, target, params=params) - m = graph_runtime.GraphModule(lib["default"](ctx)) + 
m = graph_runtime.GraphModule(lib["default"](dev)) # set inputs m.set_input("data", data) m.run() out = m.get_output(0, tvm.nd.empty(out_shape, dtype)) if measure: - print("Evaluate graph runtime inference cost of {} on " "{}".format(model, repr(ctx))) - ftimer = m.module.time_evaluator("run", ctx, number=1, repeat=20) + print("Evaluate graph runtime inference cost of {} on " "{}".format(model, repr(dev))) + ftimer = m.module.time_evaluator("run", dev, number=1, repeat=20) # Measure in millisecond. prof_res = np.array(ftimer().results) * 1000 print( @@ -60,15 +60,15 @@ def get_graph_runtime_output( return out.asnumpy() - def get_vm_output(mod, data, params, target, ctx, dtype="float32", number=2, repeat=20): + def get_vm_output(mod, data, params, target, dev, dtype="float32", number=2, repeat=20): with tvm.transform.PassContext(opt_level=3): exe = vm.compile(mod, target, params=params) - rly_vm = vm_rt.VirtualMachine(exe, ctx) + rly_vm = vm_rt.VirtualMachine(exe, dev) result = rly_vm.run(data) if measure: - print("Evaluate vm inference cost of {} on {}".format(model, repr(ctx))) - ftimer = rly_vm.module.time_evaluator("invoke", ctx, number=number, repeat=repeat) + print("Evaluate vm inference cost of {} on {}".format(model, repr(dev))) + ftimer = rly_vm.module.time_evaluator("invoke", dev, number=number, repeat=repeat) # Measure in millisecond. prof_res = np.array(ftimer("main", data).results) * 1000 print( @@ -81,11 +81,11 @@ def get_vm_output(mod, data, params, target, ctx, dtype="float32", number=2, rep # random input data = np.random.uniform(size=data_shape).astype(dtype) - for target, ctx in testing.enabled_targets(): + for target, dev in testing.enabled_targets(): tvm_out = get_graph_runtime_output( - mod, tvm.nd.array(data.astype(dtype)), params, target, ctx, dtype + mod, tvm.nd.array(data.astype(dtype)), params, target, dev, dtype ) - vm_out = get_vm_output(mod, tvm.nd.array(data.astype(dtype)), params, target, ctx, dtype) + vm_out = get_vm_output(mod, tvm.nd.array(data.astype(dtype)), params, target, dev, dtype) tvm.testing.assert_allclose(vm_out, tvm_out, rtol=1e-5, atol=1e-5) diff --git a/tests/python/relay/dyn/test_dynamic_op_level10.py b/tests/python/relay/dyn/test_dynamic_op_level10.py index a520f6c2c3681..9bcb656bd246e 100644 --- a/tests/python/relay/dyn/test_dynamic_op_level10.py +++ b/tests/python/relay/dyn/test_dynamic_op_level10.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -""" +""" Support level10 operator test cases. 
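Both benchmark helpers in benchmark_vm.py time a deployed module via module.time_evaluator with the renamed handle and convert the per-repeat results to milliseconds. A minimal sketch of that measurement pattern (the relu model is illustrative):

import numpy as np
import tvm
from tvm import relay
from tvm.contrib import graph_runtime

x = relay.var("x", shape=(8,), dtype="float32")
mod = tvm.IRModule.from_expr(relay.Function([x], relay.nn.relu(x)))
with tvm.transform.PassContext(opt_level=3):
    lib = relay.build(mod, "llvm")

dev = tvm.cpu(0)
m = graph_runtime.GraphModule(lib["default"](dev))  # dev, formerly ctx
m.set_input("x", np.random.uniform(-1, 1, (8,)).astype("float32"))

# time_evaluator on the underlying module; "run" is the graph runtime's
# entry point, and results come back in seconds per repeat.
ftimer = m.module.time_evaluator("run", dev, number=1, repeat=20)
prof_res = np.array(ftimer().results) * 1000  # milliseconds
print("mean inference time: %.2f ms" % np.mean(prof_res))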
""" @@ -44,10 +44,10 @@ def verify_more_dynamic_broadcast_to(x_shape, out_shape): x = np.random.uniform(size=np.prod(x_shape)).astype(dtype) ref_res = np.broadcast_to(np.reshape(x, x_shape), out_shape) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["vm", "debug"]: mod = tvm.ir.IRModule.from_expr(func) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate(func)( x, np.array(x_shape).astype(shape_type), np.array(out_shape).astype(shape_type) ) @@ -70,10 +70,10 @@ def verify_broadcast_to(x_shape, out_shape): x = np.random.uniform(size=x_shape).astype(dtype) ref_res = np.broadcast_to(x, out_shape) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["vm", "debug"]: mod = tvm.ir.IRModule.from_expr(func) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate(func)(x, np.array(out_shape).astype(shape_type)) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) @@ -100,10 +100,10 @@ def test_dyn_broadcast_to(): x = np.random.uniform(size=x_shape).astype(dtype) dyn_shape = (1,) * rank ref_res = np.broadcast_to(x, dyn_shape) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["vm", "debug"]: mod = tvm.ir.IRModule.from_expr(func) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate(func)(x, np.array(dyn_shape).astype(shape_type)) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) @@ -133,10 +133,10 @@ def _verify(indices_shape, depth, on_value, off_value, axis, dtype): func = relay.Function([indices, depth_var], out) indices_np = np.random.randint(0, depth, size=indices_shape).astype("int32") out_np = tvm.topi.testing.one_hot(indices_np, on_value, off_value, depth, axis, dtype) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["vm", "debug"]: mod = tvm.ir.IRModule.from_expr(func) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) out_relay = intrp.evaluate()(indices_np, np.array(depth).astype("int32")) tvm.testing.assert_allclose(out_relay.asnumpy(), out_np) diff --git a/tests/python/relay/dyn/test_dynamic_op_level2.py b/tests/python/relay/dyn/test_dynamic_op_level2.py index 5ef975f97d2c6..c0cbce17e9d05 100644 --- a/tests/python/relay/dyn/test_dynamic_op_level2.py +++ b/tests/python/relay/dyn/test_dynamic_op_level2.py @@ -56,10 +56,10 @@ def verify_upsampling(dshape, scale_h, scale_w, layout, method, align_corners=Fa zz = run_infer_type(z) func = relay.Function([x, scale_h_var, scale_w_var], z) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["vm", "debug"]: mod = tvm.ir.IRModule.from_expr(func) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()( x_data, np.array(scale_h).astype("float32"), np.array(scale_w).astype("float32") ) @@ -125,10 +125,10 @@ def verify_upsampling3d( zz = run_infer_type(z) 
func = relay.Function([x, scale_d_var, scale_h_var, scale_w_var], z) - for target, ctx in enabled_targets(): + for target, dev in enabled_targets(): for kind in ["vm", "debug"]: mod = tvm.ir.IRModule.from_expr(func) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()( x_data, np.array(scale_d).astype("float32"), diff --git a/tests/python/relay/dyn/test_dynamic_op_level3.py b/tests/python/relay/dyn/test_dynamic_op_level3.py index d5f81e84e39db..78d12f9b1c8ec 100644 --- a/tests/python/relay/dyn/test_dynamic_op_level3.py +++ b/tests/python/relay/dyn/test_dynamic_op_level3.py @@ -26,12 +26,12 @@ import tvm.testing -def verify_func(func, data, ref_res, target_ctx=tvm.testing.enabled_targets()): +def verify_func(func, data, ref_res, target_device=tvm.testing.enabled_targets()): assert isinstance(data, list) - for target, ctx in target_ctx: + for target, dev in target_device: for kind in ["vm", "debug"]: mod = tvm.ir.IRModule.from_expr(func) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(*data) if isinstance(op_res, tvm.runtime.container.ADT): assert len(op_res) == len( diff --git a/tests/python/relay/dyn/test_dynamic_op_level4.py b/tests/python/relay/dyn/test_dynamic_op_level4.py index 3d7a99a28e33f..3cb706440cade 100644 --- a/tests/python/relay/dyn/test_dynamic_op_level4.py +++ b/tests/python/relay/dyn/test_dynamic_op_level4.py @@ -57,9 +57,9 @@ def verify(dshape, begin, end, strides, output, slice_mode="end", test_ref=True, if not test_ref: return - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): mod = tvm.ir.IRModule.from_expr(func) - intrp = relay.create_executor("vm", mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor("vm", mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res) diff --git a/tests/python/relay/dyn/test_dynamic_op_level5.py b/tests/python/relay/dyn/test_dynamic_op_level5.py index 9273b019ec96b..c49ac9680266b 100644 --- a/tests/python/relay/dyn/test_dynamic_op_level5.py +++ b/tests/python/relay/dyn/test_dynamic_op_level5.py @@ -60,10 +60,10 @@ def verify_resize(dshape, scale, method, layout): zz = run_infer_type(z) func = relay.Function([x, size_var], z) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["vm", "debug"]: mod = tvm.ir.IRModule.from_expr(func) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x_data, size) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-4, atol=1e-6) diff --git a/tests/python/relay/dyn/test_dynamic_op_level6.py b/tests/python/relay/dyn/test_dynamic_op_level6.py index 52abbe2a15b6d..9ceb9ab9db975 100644 --- a/tests/python/relay/dyn/test_dynamic_op_level6.py +++ b/tests/python/relay/dyn/test_dynamic_op_level6.py @@ -52,10 +52,10 @@ def verify_topk(k, axis, ret_type, is_ascend, dtype): np_values[i, :] = np_data[i, np_indices[i, :]] np_indices = np_indices.astype(dtype) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["vm", "debug"]: mod = tvm.ir.IRModule.from_expr(func) - intrp = relay.create_executor(kind, 
mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(np_data, np.array([k]).astype("float32")) if ret_type == "both": tvm.testing.assert_allclose(op_res[0].asnumpy(), np_values) diff --git a/tests/python/relay/test_adt.py b/tests/python/relay/test_adt.py index 122fa67d65dfd..c432596a2a9fe 100644 --- a/tests/python/relay/test_adt.py +++ b/tests/python/relay/test_adt.py @@ -33,8 +33,8 @@ def count(e): return count_(p, e) -ctx = tvm.context("llvm", 0) -intrp = create_executor(mod=prelude.mod, ctx=ctx, target="llvm") +dev = tvm.device("llvm", 0) +intrp = create_executor(mod=prelude.mod, device=dev, target="llvm") nat, z, s = prelude.mod.get_type("nat") diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py index 32292de4c8ea1..9f777c29bef66 100644 --- a/tests/python/relay/test_any.py +++ b/tests/python/relay/test_any.py @@ -53,12 +53,12 @@ def check_result( expected = [expected] for kind in ["debug", "vm"]: targets = targets or tvm.testing.enabled_targets() - for tgt, ctx in targets: + for tgt, dev in targets: if disable_targets and tgt in disable_targets: continue - if kind == "debug" and (only_vm or ctx.device_type != tvm.cpu().device_type): + if kind == "debug" and (only_vm or dev.device_type != tvm.cpu().device_type): continue - ex = relay.create_executor(kind, mod=mod, ctx=ctx, target=tgt) + ex = relay.create_executor(kind, mod=mod, device=dev, target=tgt) result = ex.evaluate()(*args) if isinstance(result, tvm.runtime.container.ADT): result = [r.asnumpy() for r in result] @@ -751,7 +751,7 @@ def verify_any_split(data_shape, indices_or_sections, axis, static_data_shape, r mod["main"] = relay.Function([data], y.astuple()) data_np = np.random.uniform(size=static_data_shape).astype(dtype) for kind in ["vm"]: - ex = relay.create_executor(kind, mod=mod, ctx=tvm.cpu(), target="llvm") + ex = relay.create_executor(kind, mod=mod, device=tvm.cpu(), target="llvm") result = ex.evaluate()(data_np) for ret, ref_ret in zip(result, ref_out_shape): assert ret.asnumpy().shape == ref_ret, "Shape mismatch: expect %s but got %s." 
% ( @@ -964,9 +964,9 @@ def test_any_get_valid_counts(): # Check failed: err_code == CL_SUCCESS == false: OpenCL Error, # code=-61: CL_INVALID_BUFFER_SIZE targets = [] - for tgt, ctx in tvm.testing.enabled_targets(): + for tgt, dev in tvm.testing.enabled_targets(): if "opencl" not in tgt: - targets.append((tgt, ctx)) + targets.append((tgt, dev)) verify_any_get_valid_counts(0, "float32", targets=targets) diff --git a/tests/python/relay/test_auto_scheduler_layout_rewrite_networks.py b/tests/python/relay/test_auto_scheduler_layout_rewrite_networks.py index 56b57e1fad76e..95f1177da024d 100644 --- a/tests/python/relay/test_auto_scheduler_layout_rewrite_networks.py +++ b/tests/python/relay/test_auto_scheduler_layout_rewrite_networks.py @@ -168,8 +168,8 @@ def tune_and_check(mod, data, weight): lib2 = relay.build(mod, target=target, params={"weight": weight}) def get_output(data, lib): - ctx = tvm.cpu() - module = graph_runtime.GraphModule(lib["default"](ctx)) + dev = tvm.cpu() + module = graph_runtime.GraphModule(lib["default"](dev)) module.set_input("data", data) module.run() diff --git a/tests/python/relay/test_auto_scheduler_tuning.py b/tests/python/relay/test_auto_scheduler_tuning.py index 1ec0e305311a1..af930e7f7f19a 100644 --- a/tests/python/relay/test_auto_scheduler_tuning.py +++ b/tests/python/relay/test_auto_scheduler_tuning.py @@ -69,8 +69,8 @@ def tune_network(network, target): # Check the correctness def get_output(data, lib): - ctx = tvm.gpu() - module = graph_runtime.GraphModule(lib["default"](ctx)) + dev = tvm.gpu() + module = graph_runtime.GraphModule(lib["default"](dev)) module.set_input("data", data) module.run() return module.get_output(0).asnumpy() diff --git a/tests/python/relay/test_backend_compile_engine.py b/tests/python/relay/test_backend_compile_engine.py index bf53dc5360e3a..42b6373b45f1e 100644 --- a/tests/python/relay/test_backend_compile_engine.py +++ b/tests/python/relay/test_backend_compile_engine.py @@ -184,11 +184,11 @@ def get_func(shape): # Test JIT target for target in ["llvm"]: - ctx = tvm.context(target) + dev = tvm.device(target) if tvm.testing.device_enabled(target): f = engine.jit(get_func((10,)), target) - x = tvm.nd.array(np.ones(10).astype("float32"), ctx=ctx) - y = tvm.nd.empty((10,), ctx=ctx) + x = tvm.nd.array(np.ones(10).astype("float32"), device=dev) + y = tvm.nd.empty((10,), device=dev) f(x, y) tvm.testing.assert_allclose(y.asnumpy(), x.asnumpy() * 3) engine.dump() diff --git a/tests/python/relay/test_backend_graph_runtime.py b/tests/python/relay/test_backend_graph_runtime.py index 68708aaeb413f..6f75c43579a72 100644 --- a/tests/python/relay/test_backend_graph_runtime.py +++ b/tests/python/relay/test_backend_graph_runtime.py @@ -102,7 +102,7 @@ def test_with_params(): y_data = np.random.rand(1, 5).astype("float32") params = {"y": y_data} graph, lib, params = relay.build(tvm.IRModule.from_expr(func), "llvm", params=params) - mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0)) + mod = graph_runtime.create(graph, lib, device=tvm.cpu(0)) mod.set_input(**params) mod.set_input(x=x_data) mod.run() @@ -171,10 +171,10 @@ def unit_numpy(X, W): out_shape = (1, rnn_dim) z = unit(rnn_dim) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): with tvm.transform.PassContext(opt_level=2): graph, lib, params = relay.build(tvm.IRModule.from_expr(z), target) - m = graph_runtime.create(graph, lib, ctx) + m = graph_runtime.create(graph, lib, dev) m.set_input("X", tvm.nd.array(x.astype(dtype))) m.set_input("y", 
tvm.nd.array(y.astype(dtype))) m.set_input(**params) @@ -194,7 +194,7 @@ def test_compile_nested_tuples(): func = relay.Function([x], out) graph, lib, _ = relay.build(tvm.IRModule.from_expr(func), "llvm") - mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0)) + mod = graph_runtime.create(graph, lib, device=tvm.cpu(0)) x_data = np.random.uniform(size=(10,)).astype(np.float32) mod.set_input(x=x_data) @@ -215,7 +215,7 @@ def test_graph_executor_nested_tuples(): func = relay.Function([x, y, z, w], out) exe = relay.create_executor( - kind="graph", mod=tvm.IRModule.from_expr(func), ctx=tvm.cpu(0), target="llvm" + kind="graph", mod=tvm.IRModule.from_expr(func), device=tvm.cpu(0), target="llvm" ) f = exe.evaluate() diff --git a/tests/python/relay/test_backend_interpreter.py b/tests/python/relay/test_backend_interpreter.py index 0beb93deaef5e..b5d76030f41d6 100644 --- a/tests/python/relay/test_backend_interpreter.py +++ b/tests/python/relay/test_backend_interpreter.py @@ -29,10 +29,10 @@ def check_eval(expr, args, expected_result, mod=None, rtol=1e-07): # TODO(tqchen) add more types once the schedule register is fixed. for target in ["llvm"]: - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) if not tvm.testing.device_enabled(target): return - intrp = create_executor(mod=mod, ctx=ctx, target=target) + intrp = create_executor(mod=mod, device=dev, target=target) result = intrp.evaluate(expr)(*args) # use tvm.testing which also set atol tvm.testing.assert_allclose(result.asnumpy(), expected_result, rtol=rtol) @@ -220,9 +220,9 @@ def test_tuple_passing(): mod[gv] = fn mod = relay.transform.InferType()(mod) - ctx = tvm.cpu() + dev = tvm.cpu() target = tvm.target.Target("llvm") - exec = relay.create_executor(mod=mod, ctx=ctx, target=target) + exec = relay.create_executor(mod=mod, device=dev, target=target) f = exec.evaluate(gv) # First use a Python tuple. 
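On the legacy graph-runtime path, graph_runtime.create takes the renamed device= keyword in place of ctx=. A standalone sketch of that flow (the relu model and input are illustrative):

import numpy as np
import tvm
import tvm.testing
from tvm import relay
from tvm.contrib import graph_runtime

x = relay.var("x", shape=(1, 4), dtype="float32")
func = relay.Function([x], relay.nn.relu(x))

graph, lib, params = relay.build(tvm.IRModule.from_expr(func), "llvm")
m = graph_runtime.create(graph, lib, device=tvm.cpu(0))  # device=, formerly ctx=
x_np = np.random.uniform(-1, 1, (1, 4)).astype("float32")
m.set_input("x", x_np)
m.set_input(**params)
m.run()
tvm.testing.assert_allclose(m.get_output(0).asnumpy(), np.maximum(x_np, 0.0), rtol=1e-5)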
out = f((10, 8)) diff --git a/tests/python/relay/test_cpp_build_module.py b/tests/python/relay/test_cpp_build_module.py index 60f3dfa76e380..6f13533f93185 100644 --- a/tests/python/relay/test_cpp_build_module.py +++ b/tests/python/relay/test_cpp_build_module.py @@ -25,7 +25,7 @@ def test_basic_build(): tgt = "llvm" - ctx = tvm.cpu() + dev = tvm.cpu() # func a = relay.var("a", dtype="float32", shape=(16, 8)) b = relay.var("b", dtype="float32", shape=(8, 8)) @@ -34,12 +34,12 @@ def test_basic_build(): y = relay.nn.relu(x) z = y + c func = relay.Function([a, b, c], z) - A = tvm.nd.array(np.random.uniform(-1, 1, (16, 8)).astype("float32"), ctx=ctx) - B = tvm.nd.array(np.random.uniform(-1, 1, (8, 8)).astype("float32"), ctx=ctx) - C = tvm.nd.array(np.random.uniform(-1, 1, (16, 8)).astype("float32"), ctx=ctx) + A = tvm.nd.array(np.random.uniform(-1, 1, (16, 8)).astype("float32"), device=dev) + B = tvm.nd.array(np.random.uniform(-1, 1, (8, 8)).astype("float32"), device=dev) + C = tvm.nd.array(np.random.uniform(-1, 1, (16, 8)).astype("float32"), device=dev) params = {"b": B, "c": C} # build - targets = {tvm.tir.IntImm("int32", ctx.device_type): tgt} + targets = {tvm.tir.IntImm("int32", dev.device_type): tgt} mod = tvm.IRModule.from_expr(func) func_in_mod = mod["main"] assert mod["main"] == func_in_mod, "cannot compare function to itself" @@ -48,7 +48,7 @@ def test_basic_build(): assert mod["main"] == func_in_mod, "relay.build changed module in-place" # test - rt = tvm.contrib.graph_runtime.GraphModule(lib["default"](ctx)) + rt = tvm.contrib.graph_runtime.GraphModule(lib["default"](dev)) rt.set_input("a", A) rt.run() out = rt.get_output(0) @@ -65,8 +65,8 @@ def test_basic_build(): def test_fp16_build(): dtype = "float16" - ctx = tvm.gpu(0) - if dtype == "float16" and not have_fp16(ctx.compute_version): + dev = tvm.gpu(0) + if dtype == "float16" and not have_fp16(dev.compute_version): print("skip because gpu does not support fp16") return @@ -74,8 +74,8 @@ def test_fp16_build(): y = relay.var("y", dtype=dtype, shape=(4, 4)) z = x + y func = relay.Function([x, y], z) - X = tvm.nd.array(np.random.uniform(-1, 1, (4, 4)).astype(dtype), ctx=ctx) - Y = tvm.nd.array(np.random.uniform(-1, 1, (4, 4)).astype(dtype), ctx=ctx) + X = tvm.nd.array(np.random.uniform(-1, 1, (4, 4)).astype(dtype), device=dev) + Y = tvm.nd.array(np.random.uniform(-1, 1, (4, 4)).astype(dtype), device=dev) params = { "x": X, "y": Y, @@ -85,7 +85,7 @@ def test_fp16_build(): g_json, mmod, params = relay.build(func, "cuda", params=params) # test - rt = tvm.contrib.graph_runtime.create(g_json, mmod, ctx) + rt = tvm.contrib.graph_runtime.create(g_json, mmod, dev) rt.load_params(runtime.save_param_dict(params)) rt.run() out = rt.get_output(0) @@ -94,8 +94,8 @@ def test_fp16_build(): @tvm.testing.parametrize_targets("llvm", "cuda") -def test_fp16_conversion(target, ctx): - if target == "cuda" and not have_fp16(ctx.compute_version): +def test_fp16_conversion(target, dev): + if target == "cuda" and not have_fp16(dev.compute_version): print("skip because gpu does not support fp16") return @@ -114,7 +114,7 @@ def test_fp16_conversion(target, ctx): g_json, mmod, params = relay.build(tvm.IRModule.from_expr(func), target) # test - rt = tvm.contrib.graph_runtime.create(g_json, mmod, ctx) + rt = tvm.contrib.graph_runtime.create(g_json, mmod, dev) rt.set_input("x", X) rt.run() out = rt.get_output(0) diff --git a/tests/python/relay/test_external_codegen.py b/tests/python/relay/test_external_codegen.py index ab6695eef0fb3..9241f8e827459 100644 --- 
a/tests/python/relay/test_external_codegen.py +++ b/tests/python/relay/test_external_codegen.py @@ -48,7 +48,7 @@ def update_lib(lib): return lib -def check_result(mod, map_inputs, out_shape, result, tol=1e-5, target="llvm", ctx=tvm.cpu()): +def check_result(mod, map_inputs, out_shape, result, tol=1e-5, target="llvm", device=tvm.cpu()): if sys.platform == "win32": print("Skip test on Windows for now") return @@ -59,7 +59,7 @@ def check_vm_result(): code, lib = exe.save() lib = update_lib(lib) exe = runtime.vm.Executable.load_exec(code, lib) - vm = runtime.vm.VirtualMachine(exe, ctx) + vm = runtime.vm.VirtualMachine(exe, device) out = vm.run(**map_inputs) tvm.testing.assert_allclose(out.asnumpy(), result, rtol=tol, atol=tol) @@ -67,12 +67,12 @@ def check_graph_runtime_result(): with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]): json, lib, _ = relay.build(mod, target=target) lib = update_lib(lib) - rt_mod = tvm.contrib.graph_runtime.create(json, lib, ctx) + rt_mod = tvm.contrib.graph_runtime.create(json, lib, device) for name, data in map_inputs.items(): rt_mod.set_input(name, data) rt_mod.run() - out = tvm.nd.empty(out_shape, ctx=ctx) + out = tvm.nd.empty(out_shape, device=device) out = rt_mod.get_output(0, out) tvm.testing.assert_allclose(out.asnumpy(), result, rtol=tol, atol=tol) @@ -290,7 +290,7 @@ def test_extern_dnnl(): i_data = np.random.uniform(0, 1, ishape).astype(dtype) w_data = np.random.uniform(0, 1, w1shape).astype(dtype) - ref_ex = relay.create_executor("graph", mod=ref_mod, ctx=tvm.cpu()) + ref_ex = relay.create_executor("graph", mod=ref_mod, device=tvm.cpu()) ref_res = ref_ex.evaluate()(i_data, w_data, w_data) check_result( mod, {"data0": i_data, "weight0": w_data}, (1, 32, 14, 14), ref_res.asnumpy(), tol=1e-5 @@ -329,7 +329,7 @@ def test_extern_dnnl_const(): i_data = np.random.uniform(0, 1, ishape).astype(dtype) - ref_ex = relay.create_executor("graph", mod=ref_mod, ctx=tvm.cpu()) + ref_ex = relay.create_executor("graph", mod=ref_mod, device=tvm.cpu()) ref_res = ref_ex.evaluate()(i_data) check_result(mod, {"data0": i_data}, (1, 32, 14, 14), ref_res.asnumpy(), tol=1e-5) diff --git a/tests/python/relay/test_json_runtime.py b/tests/python/relay/test_json_runtime.py index df4dff81b03e8..79d1f710cf416 100644 --- a/tests/python/relay/test_json_runtime.py +++ b/tests/python/relay/test_json_runtime.py @@ -40,7 +40,7 @@ def set_func_attr(func, compile_name, symbol_name): def check_result( - mod, ref_mod, map_inputs, out_shape, tol=1e-5, target="llvm", ctx=tvm.cpu(), params=None + mod, ref_mod, map_inputs, out_shape, tol=1e-5, target="llvm", device=tvm.cpu(), params=None ): if sys.platform == "win32": print("Skip test on Windows for now") @@ -50,13 +50,13 @@ def check_result( compile_engine.get().clear() with tvm.transform.PassContext(opt_level=3): json, lib, param = relay.build(ref_mod, target=target, params=params) - rt_mod = tvm.contrib.graph_runtime.create(json, lib, ctx) + rt_mod = tvm.contrib.graph_runtime.create(json, lib, device) for name, data in map_inputs.items(): rt_mod.set_input(name, data) rt_mod.set_input(**param) rt_mod.run() - out = tvm.nd.empty(out_shape, ctx=ctx) + out = tvm.nd.empty(out_shape, device=device) out = rt_mod.get_output(0, out) ref_result = out.asnumpy() @@ -66,7 +66,7 @@ def check_vm_result(): exe = relay.vm.compile(mod, target=target, params=params) code, lib = exe.save() exe = runtime.vm.Executable.load_exec(code, lib) - vm = runtime.vm.VirtualMachine(exe, ctx) + vm = runtime.vm.VirtualMachine(exe, device) out = 
vm.run(**map_inputs) tvm.testing.assert_allclose(out.asnumpy(), ref_result, rtol=tol, atol=tol) @@ -74,13 +74,13 @@ def check_graph_runtime_result(): compile_engine.get().clear() with relay.build_config(opt_level=3): json, lib, param = relay.build(mod, target=target, params=params) - rt_mod = tvm.contrib.graph_runtime.create(json, lib, ctx) + rt_mod = tvm.contrib.graph_runtime.create(json, lib, device) for name, data in map_inputs.items(): rt_mod.set_input(name, data) rt_mod.set_input(**param) rt_mod.run() - out = tvm.nd.empty(out_shape, ctx=ctx) + out = tvm.nd.empty(out_shape, device=device) out = rt_mod.get_output(0, out) tvm.testing.assert_allclose(out.asnumpy(), ref_result, rtol=tol, atol=tol) @@ -636,8 +636,8 @@ def test_partial_constant(): data3 = np.random.uniform(0, 1, ishape).astype(dtype) params = { - "in_1": tvm.nd.array(data1, ctx=tvm.cpu(0)), - "in_3": tvm.nd.array(data3, ctx=tvm.cpu(0)), + "in_1": tvm.nd.array(data1, device=tvm.cpu(0)), + "in_3": tvm.nd.array(data3, device=tvm.cpu(0)), } ref_mod["main"] = bind_params_by_name(ref_mod["main"], params) diff --git a/tests/python/relay/test_op_fast_math.py b/tests/python/relay/test_op_fast_math.py index 1658658be2799..a82efb7ad2ada 100644 --- a/tests/python/relay/test_op_fast_math.py +++ b/tests/python/relay/test_op_fast_math.py @@ -42,10 +42,10 @@ def test_apply(relay_op, name, f_numpy, low, high, step, dtype="float32"): func_name = "fused_" + name assert lib.get_function(func_name) - ctx = tvm.cpu(0) - m = graph_runtime.create(graph, lib, ctx) + dev = tvm.cpu(0) + m = graph_runtime.create(graph, lib, dev) # Set inputs - m.set_input("x", tvm.nd.array(a_np, ctx)) + m.set_input("x", tvm.nd.array(a_np, dev)) m.set_input(**params) # Execute m.run() diff --git a/tests/python/relay/test_op_grad_level1.py b/tests/python/relay/test_op_grad_level1.py index 0ac604c6bca1d..6c6c727c788fd 100644 --- a/tests/python/relay/test_op_grad_level1.py +++ b/tests/python/relay/test_op_grad_level1.py @@ -53,8 +53,8 @@ def check_single_op(opfunc, ref, dtype): fwd_func = run_infer_type(fwd_func) bwd_func = run_infer_type(gradient(fwd_func)) - for target, ctx in tvm.testing.enabled_targets(): - intrp = relay.create_executor(ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp = relay.create_executor(device=dev, target=target) op_res, (op_grad, _) = intrp.evaluate(bwd_func)(data, grad_in) np.testing.assert_allclose(op_grad.asnumpy(), ref_grad, rtol=0.01) @@ -104,8 +104,8 @@ def check_binary_op(opfunc, ref, dtype): fwd_func = run_infer_type(fwd_func) bwd_func = run_infer_type(gradient(fwd_func)) - for target, ctx in tvm.testing.enabled_targets(): - intrp = relay.create_executor(ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp = relay.create_executor(device=dev, target=target) op_res, (op_grad0, op_grad1) = intrp.evaluate(bwd_func)(x_data, y_data) np.testing.assert_allclose(op_grad0.asnumpy(), ref_grad0, rtol=0.01) np.testing.assert_allclose(op_grad1.asnumpy(), ref_grad1, rtol=0.01) diff --git a/tests/python/relay/test_op_grad_level2.py b/tests/python/relay/test_op_grad_level2.py index bcf75de7915b6..b855065186c20 100644 --- a/tests/python/relay/test_op_grad_level2.py +++ b/tests/python/relay/test_op_grad_level2.py @@ -50,8 +50,8 @@ def verify_max_pool2d_grad(x_shape, pool_size, strides, padding, ceil_mode): ceil_mode=ceil_mode, ) - for target, ctx in tvm.testing.enabled_targets(): - intrp = relay.create_executor(ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + 
intrp = relay.create_executor(device=dev, target=target) op_res, (op_grad,) = intrp.evaluate(bwd_func)(data) np.testing.assert_allclose(op_grad.asnumpy(), ref_grad, rtol=0.01) @@ -99,8 +99,8 @@ def verify_avg_pool2d_grad( ceil_mode=ceil_mode, ) - for target, ctx in tvm.testing.enabled_targets(): - intrp = relay.create_executor(ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp = relay.create_executor(device=dev, target=target) op_res, (op_grad,) = intrp.evaluate(bwd_func)(data) np.testing.assert_allclose(op_grad.asnumpy(), ref_grad, rtol=0.01) @@ -155,8 +155,8 @@ def verify_global_avg_pool2d_grad(x_shape): ceil_mode=False, ) - for target, ctx in tvm.testing.enabled_targets(): - intrp = relay.create_executor(ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp = relay.create_executor(device=dev, target=target) op_res, (op_grad,) = intrp.evaluate(bwd_func)(data) np.testing.assert_allclose(op_grad.asnumpy(), ref_grad, rtol=0.01) diff --git a/tests/python/relay/test_op_grad_level3.py b/tests/python/relay/test_op_grad_level3.py index d43744b38e3eb..e394eaa45a82c 100644 --- a/tests/python/relay/test_op_grad_level3.py +++ b/tests/python/relay/test_op_grad_level3.py @@ -40,8 +40,8 @@ def test_clip(): fwd_func = run_infer_type(fwd_func) bwd_func = run_infer_type(gradient(fwd_func)) - for target, ctx in tvm.testing.enabled_targets(): - intrp = relay.create_executor(ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp = relay.create_executor(device=dev, target=target) op_res, (op_grad,) = intrp.evaluate(bwd_func)(data) np.testing.assert_allclose(op_grad.asnumpy(), ref_grad, rtol=0.01) @@ -173,8 +173,8 @@ def test_zeros_ones_grad_dynamic(): fwd_func = relay.Function([shape_data], op(shape_data, dtype="float32")) bwd_func = run_infer_type(gradient(run_infer_type(fwd_func))) - for target, ctx in tvm.testing.enabled_targets(): - intrp = relay.create_executor(ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp = relay.create_executor(device=dev, target=target) res, (grad,) = intrp.evaluate(bwd_func)(dyn_shape) tvm.testing.assert_allclose(res.asnumpy(), op_ref(dyn_shape, dtype="float32")) tvm.testing.assert_allclose(grad.asnumpy(), np.zeros((rank,), dtype="int32")) diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py index dfd350486c3b4..91b37135fbe1a 100644 --- a/tests/python/relay/test_op_level1.py +++ b/tests/python/relay/test_op_level1.py @@ -61,7 +61,7 @@ def check_single_op(opfunc, ref, dtype): data = np.random.rand(*shape).astype(dtype) ref_res = ref(data) func = relay.Function([x], y) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): # use graph executor by default for testing, as we need # create function explicitly to avoid constant-folding. 
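The earlier test_cpp_build_module and test_external_codegen hunks also switch the keyword form: tvm.nd.array and tvm.nd.empty now accept device= where they accepted ctx=, and attributes such as device_type carry over unchanged. A small sketch (shapes illustrative):

import numpy as np
import tvm

dev = tvm.cpu(0)

# Keyword form: device= replaces the pre-rename ctx= keyword.
a = tvm.nd.array(np.ones((4, 4), dtype="float32"), device=dev)
b = tvm.nd.empty((4, 4), device=dev)

# Attributes keep their old names on the renamed handle.
print(dev.device_type)  # integer device type code; 1 is CPU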
if ( @@ -70,7 +70,7 @@ def check_single_op(opfunc, ref, dtype): and not have_fp16(tvm.gpu(0).compute_version) ): continue - intrp = relay.create_executor("graph", ctx=ctx, target=target) + intrp = relay.create_executor("graph", device=dev, target=target) op_res = intrp.evaluate(func)(data) np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01) @@ -123,7 +123,7 @@ def check_binary_op(opfunc, ref, dtype): ref_res = ref(x_data, y_data) func = relay.Function([x, y], z) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): # use graph executor by default for testing, as we need # create function explicitly to avoid constant-folding. if ( @@ -132,7 +132,7 @@ def check_binary_op(opfunc, ref, dtype): and not have_fp16(tvm.gpu(0).compute_version) ): continue - intrp = relay.create_executor("graph", ctx=ctx, target=target) + intrp = relay.create_executor("graph", device=dev, target=target) op_res = intrp.evaluate(func)(x_data, y_data) np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01, atol=1e-3) @@ -154,7 +154,7 @@ def test_expand_dims(): def verify_expand_dims(dshape, dtype, oshape, axis, num_newaxis): x = relay.Var("x", relay.TensorType(dshape, dtype)) func = relay.Function([x], relay.expand_dims(x, axis, num_newaxis)) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): if ( dtype == "float16" and target == "cuda" @@ -163,7 +163,7 @@ def verify_expand_dims(dshape, dtype, oshape, axis, num_newaxis): continue data = np.random.uniform(size=dshape).astype(dtype) ref_res = data.reshape(oshape) - intrp = relay.create_executor("graph", ctx=ctx, target=target) + intrp = relay.create_executor("graph", device=dev, target=target) op_res = intrp.evaluate(func)(data) np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01) @@ -189,14 +189,14 @@ def test_bias_add(): x_data = np.random.uniform(size=xshape).astype(dtype) y_data = np.random.uniform(size=bshape).astype(dtype) ref_res = x_data + y_data.reshape((2, 1, 1)) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): if ( dtype == "float16" and target == "cuda" and not have_fp16(tvm.gpu(0).compute_version) ): continue - intrp = relay.create_executor("graph", ctx=ctx, target=target) + intrp = relay.create_executor("graph", device=dev, target=target) op_res = intrp.evaluate(func)(x_data, y_data) np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=rtol) @@ -239,8 +239,8 @@ def test_softmax(): func = relay.Function([x], y) x_data = np.random.uniform(size=shape).astype(dtype) ref_res = tvm.topi.testing.softmax_python(x_data) - for target, ctx in tvm.testing.enabled_targets(): - intrp = relay.create_executor("graph", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp = relay.create_executor("graph", device=dev, target=target) op_res = intrp.evaluate(func)(x_data) np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) @@ -260,8 +260,8 @@ def test_log_softmax(): func = relay.Function([x], y) x_data = np.random.uniform(size=shape).astype(dtype) ref_res = tvm.topi.testing.log_softmax_python(x_data) - for target, ctx in tvm.testing.enabled_targets(): - intrp = relay.create_executor("graph", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp = relay.create_executor("graph", device=dev, target=target) op_res = intrp.evaluate(func)(x_data) np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) @@ 
-310,15 +310,15 @@ def test_concatenate():
     t_data = np.random.uniform(size=()).astype(dtype)
     ref_res = np.concatenate((x_data, y_data), axis=1) + t_data
-    for target, ctx in tvm.testing.enabled_targets():
+    for target, dev in tvm.testing.enabled_targets():
         if (
             dtype == "float16"
             and target == "cuda"
             and not have_fp16(tvm.gpu(0).compute_version)
         ):
             continue
-        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
-        intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
+        intrp1 = relay.create_executor("graph", device=dev, target=target)
+        intrp2 = relay.create_executor("debug", device=dev, target=target)
         op_res1 = intrp1.evaluate(func)(x_data, y_data, t_data)
         tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=0.01)
         op_res2 = intrp2.evaluate(func)(x_data, y_data, t_data)
@@ -339,9 +339,9 @@ def test_dropout():
     x = relay.const(in_np)
     y = relay.nn.dropout(x, rate=0.5)
     func = relay.Function([], y)
-    for target, ctx in tvm.testing.enabled_targets():
+    for target, dev in tvm.testing.enabled_targets():
         for backend in ["debug", "graph"]:
-            intrp = relay.create_executor("debug", ctx=ctx, target=target)
+            intrp = relay.create_executor("debug", device=dev, target=target)
             op_res = intrp.evaluate(func)()
             tvm.testing.assert_allclose(op_res.asnumpy(), in_np, rtol=0.01)
@@ -460,9 +460,9 @@ def test_dense():
     w_data = np.random.rand(2, 5).astype(dtype)
     ref_res = np.dot(x_data, w_data.T)
-    for target, ctx in tvm.testing.enabled_targets():
-        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
-        intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
+    for target, dev in tvm.testing.enabled_targets():
+        intrp1 = relay.create_executor("graph", device=dev, target=target)
+        intrp2 = relay.create_executor("debug", device=dev, target=target)
         op_res1 = intrp1.evaluate(func)(x_data, w_data)
         tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5)
         op_res2 = intrp2.evaluate(func)(x_data, w_data)
diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py
index 3ec1a5bb61292..597a1c69e8ee3 100644
--- a/tests/python/relay/test_op_level10.py
+++ b/tests/python/relay/test_op_level10.py
@@ -40,9 +40,9 @@ def test_checkpoint():
     assert f.checked_type == f_checkpoint.checked_type
     inputs = [np.random.uniform() for _ in range(len(xs))]
-    for target, ctx in tvm.testing.enabled_targets():
+    for target, dev in tvm.testing.enabled_targets():
         for kind in ["graph", "debug"]:
-            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            intrp = relay.create_executor(kind, device=dev, target=target)
             f_res = intrp.evaluate(f)(*inputs)
             f_checkpoint_res = intrp.evaluate(f_checkpoint)(*inputs)
             tvm.testing.assert_allclose(f_res.asnumpy(), f_checkpoint_res.asnumpy(), 0, 0)
@@ -172,9 +172,9 @@ def test_collapse_sum_like():
    x = np.random.uniform(size=shape).astype(dtype)
    y = np.random.uniform(size=shape_like).astype(dtype)
    ref_res = np.sum(x, 0)
-    for target, ctx in tvm.testing.enabled_targets():
+    for target, dev in tvm.testing.enabled_targets():
         for kind in ["graph", "debug"]:
-            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            intrp = relay.create_executor(kind, device=dev, target=target)
             op_res = intrp.evaluate(func)(x, y)
             tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
@@ -192,9 +192,9 @@ def test_collapse_sum_to():
     func = relay.Function([x], z)
     x = np.random.uniform(size=shape).astype(dtype)
     ref_res = np.sum(x, 0)
-    for target, ctx in tvm.testing.enabled_targets():
+    for target, dev in tvm.testing.enabled_targets():
         for kind in ["graph", "debug"]:
-            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            intrp = relay.create_executor(kind, device=dev, target=target)
             op_res = intrp.evaluate(func)(x)
             tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
@@ -212,9 +212,9 @@ def test_broadcast_to():
     func = relay.Function([x], z)
     x = np.random.uniform(size=shape).astype(dtype)
     ref_res = np.broadcast_to(x, shape_like)
-    for target, ctx in tvm.testing.enabled_targets():
+    for target, dev in tvm.testing.enabled_targets():
         for kind in ["graph", "debug"]:
-            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            intrp = relay.create_executor(kind, device=dev, target=target)
             op_res = intrp.evaluate(func)(x)
             tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
@@ -236,9 +236,9 @@ def test_broadcast_to_like():
     y = np.random.uniform(size=shape_like).astype(dtype)
     ref_res = np.broadcast_to(x, shape_like)
-    for target, ctx in tvm.testing.enabled_targets():
+    for target, dev in tvm.testing.enabled_targets():
         for kind in ["graph", "debug"]:
-            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            intrp = relay.create_executor(kind, device=dev, target=target)
             op_res = intrp.evaluate(func)(x, y)
             tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
@@ -281,9 +281,9 @@ def verify_slice_like(data, slice_like, axes, output, dtype="float32"):
     y_data = np.random.uniform(size=slice_like).astype(dtype)
     ref_res = np_slice_like(x_data, y_data, axes)
-    for target, ctx in tvm.testing.enabled_targets():
+    for target, dev in tvm.testing.enabled_targets():
         for kind in ["graph", "debug"]:
-            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            intrp = relay.create_executor(kind, device=dev, target=target)
             op_res = intrp.evaluate(func)(x_data, y_data)
             tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
@@ -315,9 +315,9 @@ def verify_reverse_reshape(shape, newshape, oshape):
     func = relay.Function([x], z)
     x_data = np.random.uniform(low=-1, high=1, size=shape).astype("float32")
     ref_res = np.reshape(x_data, oshape)
-    for target, ctx in tvm.testing.enabled_targets():
+    for target, dev in tvm.testing.enabled_targets():
         for kind in ["graph", "debug"]:
-            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            intrp = relay.create_executor(kind, device=dev, target=target)
             op_res = intrp.evaluate(func)(x_data)
             tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
@@ -340,9 +340,9 @@ def verify_batch_matmul(x_shape, y_shape, out_shape, dtype="float32"):
     y_np = np.random.uniform(size=y_shape).astype(dtype)
     z_np = tvm.topi.testing.batch_matmul(x_np, y_np)
-    for target, ctx in tvm.testing.enabled_targets():
+    for target, dev in tvm.testing.enabled_targets():
         for kind in ["graph", "debug"]:
-            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            intrp = relay.create_executor(kind, device=dev, target=target)
             z = intrp.evaluate(func)(x_np, y_np)
             tvm.testing.assert_allclose(z.asnumpy(), z_np, rtol=1e-5)
@@ -372,10 +372,10 @@ def verify_dynamic_batch_matmul(x_shape, y_shape, out_shape, dtype="float32"):
     y_np = np.random.uniform(size=y_shape).astype(dtype)
     z_np = tvm.topi.testing.batch_matmul(x_np, y_np)
-    for target, ctx in tvm.testing.enabled_targets():
+    for target, dev in tvm.testing.enabled_targets():
         for kind in ["vm", "debug"]:
             mod = tvm.ir.IRModule.from_expr(func)
-            intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target)
+            intrp = relay.create_executor(kind, mod=mod, device=dev, target=target)
             z = intrp.evaluate()(x_np, y_np)
             tvm.testing.assert_allclose(z.asnumpy(), z_np, rtol=1e-5)
@@ -396,11 +396,11 @@ def test_shape_of():
     func = relay.Function([x], relay.op.shape_of(x))
     func = run_infer_type(func)
     x_data = np.random.rand(*shape).astype("float32")
-    for target, ctx in tvm.testing.enabled_targets():
+    for target, dev in tvm.testing.enabled_targets():
         # Because using graph executor, this op will be optimized after
         # constant folding pass, here we only test with interpreter
         for kind in ["debug"]:
-            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            intrp = relay.create_executor(kind, device=dev, target=target)
             op_res = intrp.evaluate(func)(x_data)
             tvm.testing.assert_allclose(op_res.asnumpy(), np.array(shape).astype("int32"))
@@ -414,9 +414,9 @@ def verify_ndarray_size(shape):
     x_data = np.random.uniform(size=shape).astype("float32")
     ref_res = np.size(x_data)
-    for target, ctx in tvm.testing.enabled_targets():
+    for target, dev in tvm.testing.enabled_targets():
         for kind in ["graph", "debug"]:
-            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            intrp = relay.create_executor(kind, device=dev, target=target)
             op_res = intrp.evaluate(func)(x_data)
             tvm.testing.assert_allclose(op_res.asnumpy(), ref_res)
@@ -433,8 +433,8 @@ def verify_adaptive_pool(dshape, out_size, pool_type, layout, dtype, opfunc):
     np_data = np.random.uniform(low=0, high=255, size=dshape).astype(dtype)
     np_out = tvm.topi.testing.adaptive_pool(np_data, out_size, pool_type, layout)
-    for target, ctx in tvm.testing.enabled_targets():
-        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+    for target, dev in tvm.testing.enabled_targets():
+        intrp1 = relay.create_executor("graph", device=dev, target=target)
         relay_out = intrp1.evaluate(func)(np_data)
         tvm.testing.assert_allclose(relay_out.asnumpy(), np_out, rtol=1e-5, atol=1e-5)
@@ -481,9 +481,9 @@ def _verify(data_shape, mask_value, axis, dtype, itype):
         valid_length_np = np.random.randint(0, max_length, size=nbatch).astype(itype)
         gt_out_np = tvm.topi.testing.sequence_mask(data_np, valid_length_np, mask_value, axis)
-        for target, ctx in tvm.testing.enabled_targets():
+        for target, dev in tvm.testing.enabled_targets():
             for kind in ["graph", "debug"]:
-                intrp = relay.create_executor(kind, ctx=ctx, target=target)
+                intrp = relay.create_executor(kind, device=dev, target=target)
                 out_relay = intrp.evaluate(func)(data_np, valid_length_np)
                 tvm.testing.assert_allclose(out_relay.asnumpy(), gt_out_np)
@@ -521,9 +521,9 @@ def _verify(indices_shape, depth, on_value, off_value, axis, dtype):
         indices_np = np.random.randint(0, depth, size=indices_shape).astype("int32")
         out_np = tvm.topi.testing.one_hot(indices_np, on_value, off_value, depth, axis, dtype)
-        for target, ctx in tvm.testing.enabled_targets():
+        for target, dev in tvm.testing.enabled_targets():
             for kind in ["graph", "debug"]:
-                intrp = relay.create_executor(kind, ctx=ctx, target=target)
+                intrp = relay.create_executor(kind, device=dev, target=target)
                 out_relay = intrp.evaluate(func)(indices_np)
                 tvm.testing.assert_allclose(out_relay.asnumpy(), out_np)
@@ -551,9 +551,9 @@ def _verify(input_shape, diagonal_shape, dtype, k=0, align="RIGHT_LEFT"):
         diagonal_np = np.random.randint(-100, 100, size=diagonal_shape).astype(dtype)
         out_np = tvm.topi.testing.matrix_set_diag(input_np, diagonal_np, k, align)
-        for target, ctx in tvm.testing.enabled_targets():
+        for target, dev in tvm.testing.enabled_targets():
             for kind in ["graph", "debug"]:
-                intrp = relay.create_executor(kind, ctx=ctx, target=target)
+                intrp = relay.create_executor(kind, device=dev, target=target)
                out_relay = intrp.evaluate(func)(input_np, diagonal_np)
                tvm.testing.assert_allclose(out_relay.asnumpy(), out_np)
diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py
index 1a1f451f4c74e..b9e7b075a9985 100644
--- a/tests/python/relay/test_op_level2.py
+++ b/tests/python/relay/test_op_level2.py
@@ -96,11 +96,11 @@ def run_test_conv1d(
             data.astype(out_dtype), kernel.astype(out_dtype), 1, padding, dilation
         )
-        for target, ctx in tvm.testing.enabled_targets():
+        for target, dev in tvm.testing.enabled_targets():
             if target in except_targets:
                 continue
-            ctx = tvm.context(target, 0)
-            intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+            dev = tvm.device(target, 0)
+            intrp1 = relay.create_executor("graph", device=dev, target=target)
             op_res1 = intrp1.evaluate(func)(data, kernel)
             tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5)
@@ -224,11 +224,11 @@ def run_test_conv2d(
         else:
             ref_res = fref(data.astype(out_dtype), dkernel.astype(out_dtype))
-        for target, ctx in tvm.testing.enabled_targets():
+        for target, dev in tvm.testing.enabled_targets():
             if target in except_targets:
                 continue
-            ctx = tvm.context(target, 0)
-            intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+            dev = tvm.device(target, 0)
+            intrp1 = relay.create_executor("graph", device=dev, target=target)
             op_res1 = intrp1.evaluate(func)(data, kernel)
             tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-4, atol=1e-4)
@@ -399,13 +399,13 @@ def run_test_conv2d_cuda(
        )
        with WinogradFallback(), tvm.transform.PassContext(opt_level=3):
-            for target, ctx in tvm.testing.enabled_targets():
+            for target, dev in tvm.testing.enabled_targets():
                if target != "cuda":
                    continue
-                ctx = tvm.context(target, 0)
+                dev = tvm.device(target, 0)
                params = {"w": tvm.nd.array(kernel)}
                graph, lib, params = relay.build_module.build(mod, target=target, params=params)
-                module = tvm.contrib.graph_runtime.create(graph, lib, ctx)
+                module = tvm.contrib.graph_runtime.create(graph, lib, dev)
                module.set_input("x", tvm.nd.array(data))
                module.set_input(**params)
                module.run()
@@ -510,12 +510,12 @@ def run_test_conv3d(
         else:
             ref_res = fref(data.astype(out_dtype), dkernel.astype(out_dtype))
-        for target, ctx in tvm.testing.enabled_targets():
+        for target, dev in tvm.testing.enabled_targets():
             if target in except_targets:
                 continue
-            ctx = tvm.context(target, 0)
+            dev = tvm.device(target, 0)
-            intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+            intrp1 = relay.create_executor("graph", device=dev, target=target)
             op_res1 = intrp1.evaluate(func)(data, kernel)
             tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5)
@@ -575,12 +575,12 @@ def run_test_conv3d(
         else:
             ref_res = fref(data.astype(out_dtype), dkernel.astype(out_dtype))
-        for target, ctx in tvm.testing.enabled_targets():
+        for target, dev in tvm.testing.enabled_targets():
             if target in except_targets:
                 continue
-            ctx = tvm.context(target, 0)
+            dev = tvm.device(target, 0)
-            intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+            intrp1 = relay.create_executor("graph", device=dev, target=target)
             op_res1 = intrp1.evaluate(func)(data, kernel)
             tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5)
@@ -662,13 +662,13 @@ def run_test_conv3d_cuda(
        )
        with WinogradFallback(), tvm.transform.PassContext(opt_level=3):
-            for target, ctx in tvm.testing.enabled_targets():
+            for target, dev in tvm.testing.enabled_targets():
                if target != "cuda":
                    continue
-                ctx = tvm.context(target, 0)
+                dev = tvm.device(target, 0)
                params = {"w": tvm.nd.array(kernel)}
                graph, lib, params = relay.build_module.build(mod, target=target, params=params)
-                module = tvm.contrib.graph_runtime.create(graph, lib, ctx)
+                module = tvm.contrib.graph_runtime.create(graph, lib, dev)
                module.set_input("x", tvm.nd.array(data))
                module.set_input(**params)
                module.run()
@@ -762,8 +762,8 @@ def test_conv3d_transpose_ncdhw_run():
     kernel = np.random.uniform(size=kshape).astype(dtype)
     ref_res = tvm.topi.testing.conv3d_transpose_ncdhw_python(data, kernel, 1, 1, 0)
-    for target, ctx in tvm.testing.enabled_targets():
-        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+    for target, dev in tvm.testing.enabled_targets():
+        intrp1 = relay.create_executor("graph", device=dev, target=target)
         op_res1 = intrp1.evaluate(func)(data, kernel)
         tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5)
@@ -805,8 +805,8 @@ def test_conv2d_transpose_nchw_run():
     kernel = np.random.uniform(size=kshape).astype(dtype)
     ref_res = tvm.topi.testing.conv2d_transpose_nchw_python(data, kernel, 2, 1, (1, 1))
-    for target, ctx in tvm.testing.enabled_targets():
-        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+    for target, dev in tvm.testing.enabled_targets():
+        intrp1 = relay.create_executor("graph", device=dev, target=target)
         op_res1 = intrp1.evaluate(func)(data, kernel)
         tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5)
@@ -841,8 +841,8 @@ def test_conv2d_transpose_nhwc_run():
         data, kernel, "HWOI", 2, 1, output_padding=(1, 1)
     )
-    for target, ctx in tvm.testing.enabled_targets():
-        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+    for target, dev in tvm.testing.enabled_targets():
+        intrp1 = relay.create_executor("graph", device=dev, target=target)
         op_res1 = intrp1.evaluate(func)(data, kernel)
         tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5)
@@ -863,8 +863,8 @@ def test_conv1d_transpose_ncw_run():
     kernel = np.random.uniform(size=kshape).astype(dtype)
     ref_res = tvm.topi.testing.conv1d_transpose_ncw_python(data, kernel, 2, 1, output_padding=(1,))
-    for target, ctx in tvm.testing.enabled_targets():
-        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+    for target, dev in tvm.testing.enabled_targets():
+        intrp1 = relay.create_executor("graph", device=dev, target=target)
         op_res1 = intrp1.evaluate(func)(data, kernel)
         tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5)
@@ -943,8 +943,8 @@ def _test_pool2d(opfunc, reffunc, pool_size=(2, 2), strides=(2, 2), padding=(0,
     func = relay.Function([x], y)
     data = np.random.uniform(size=dshape).astype(dtype)
     ref_res = reffunc(data.reshape(1, 3, 14, 2, 14, 2), axis=(3, 5))
-    for target, ctx in tvm.testing.enabled_targets():
-        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+    for target, dev in tvm.testing.enabled_targets():
+        intrp1 = relay.create_executor("graph", device=dev, target=target)
         op_res1 = intrp1.evaluate(func)(data)
         tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5)
@@ -965,8 +965,8 @@ def _test_pool2d_int(opfunc, reffunc, dtype):
     func = relay.Function([x], y)
     data = np.random.randint(low=-128, high=128, size=dshape)
     ref_res = reffunc(data.reshape(1, 3, 14, 2, 14, 2), axis=(3, 5)).astype(dtype)
-    for target, ctx in tvm.testing.enabled_targets():
-        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+    for target, dev in tvm.testing.enabled_targets():
+        intrp1 = relay.create_executor("graph", device=dev, target=target)
         op_res1 = intrp1.evaluate(func)(data)
         tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5)
@@ -991,8 +991,8 @@ def _test_global_pool2d(opfunc, reffunc):
     func = relay.Function([x], y)
     data = np.random.uniform(size=dshape).astype(dtype)
     ref_res = reffunc(data, axis=(2, 3), keepdims=True)
-    for target, ctx in tvm.testing.enabled_targets():
-        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+    for target, dev in tvm.testing.enabled_targets():
+        intrp1 = relay.create_executor("graph", device=dev, target=target)
         op_res1 = intrp1.evaluate(func)(data)
         tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5)
@@ -1029,8 +1029,8 @@ def _test_pool1d(opfunc, pool_size=(2,), strides=(2,), padding=(0, 0), dtype="fl
     ref_res = tvm.topi.testing.pool1d_ncw_python(
         data, (2,), (2,), (0, 0), (1, 3, 16), pool_type, False
     )
-    for target, ctx in tvm.testing.enabled_targets():
-        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+    for target, dev in tvm.testing.enabled_targets():
+        intrp1 = relay.create_executor("graph", device=dev, target=target)
         op_res1 = intrp1.evaluate(func)(data)
         tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5)
@@ -1075,8 +1075,8 @@ def _test_pool3d(
     ref_res = tvm.topi.testing.pool3d_ncdhw_python(
         data, pool_size, strides, padding, out_shape, pool_type, False
     )
-    for target, ctx in tvm.testing.enabled_targets():
-        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+    for target, dev in tvm.testing.enabled_targets():
+        intrp1 = relay.create_executor("graph", device=dev, target=target)
         op_res1 = intrp1.evaluate(func)(data)
         tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5)
@@ -1125,8 +1125,8 @@ def test_avg_pool2d_no_count_pad():
     ref_res = np.maximum(b_np, 0.0)
     data = a_np
-    for target, ctx in tvm.testing.enabled_targets():
-        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+    for target, dev in tvm.testing.enabled_targets():
+        intrp1 = relay.create_executor("graph", device=dev, target=target)
         op_res1 = intrp1.evaluate(func)(data)
         tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5)
@@ -1160,9 +1160,9 @@ def test_flatten_infer_type():
     x_data = np.random.uniform(low=-1, high=1, size=shape).astype(dtype)
     ref_res = x_data.flatten().reshape(o_shape)
-    for target, ctx in tvm.testing.enabled_targets():
-        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
-        intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
+    for target, dev in tvm.testing.enabled_targets():
+        intrp1 = relay.create_executor("graph", device=dev, target=target)
+        intrp2 = relay.create_executor("debug", device=dev, target=target)
         op_res1 = intrp1.evaluate(func)(x_data)
         tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5)
         op_res2 = intrp2.evaluate(func)(x_data)
@@ -1223,8 +1223,8 @@ def _test_run(dtype):
             mod_pad.append((pad_x, pad_y))
         ref_res = np.pad(mod_data, tuple(mod_pad), "constant")
-        for target, ctx in tvm.testing.enabled_targets():
-            intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+        for target, dev in tvm.testing.enabled_targets():
+            intrp1 = relay.create_executor("graph", device=dev, target=target)
             op_res1 = intrp1.evaluate(func)(data)
             tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5)
@@ -1256,9 +1256,9 @@ def test_lrn():
     x_data = np.random.uniform(low=-1, high=1, size=shape).astype(dtype)
    ref_res = tvm.topi.testing.lrn_python(x_data, size, axis, bias, alpha, beta)
-    for target, ctx in tvm.testing.enabled_targets():
-        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
-        intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
+    for target, dev in tvm.testing.enabled_targets():
+        intrp1 = relay.create_executor("graph", device=dev, target=target)
+        intrp2 = relay.create_executor("debug", device=dev, target=target)
         op_res1 = intrp1.evaluate(func)(x_data)
         tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5)
         op_res2 = intrp2.evaluate(func)(x_data)
@@ -1286,9 +1286,9 @@ def test_l2_normalize():
     x_data = np.random.uniform(low=-1, high=1, size=shape).astype(dtype)
     ref_res = tvm.topi.testing.l2_normalize_python(x_data, eps, axis)
-    for target, ctx in tvm.testing.enabled_targets():
-        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
-        intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
+    for target, dev in tvm.testing.enabled_targets():
+        intrp1 = relay.create_executor("graph", device=dev, target=target)
+        intrp2 = relay.create_executor("debug", device=dev, target=target)
         op_res1 = intrp1.evaluate(func)(x_data)
         tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5)
         op_res2 = intrp2.evaluate(func)(x_data)
@@ -1311,8 +1311,8 @@ def test_batch_flatten():
     data = np.random.rand(5, 10, 5).astype(t1.dtype)
     ref_res = batch_flatten(data)
-    for target, ctx in tvm.testing.enabled_targets():
-        intrp = relay.create_executor("graph", ctx=ctx, target=target)
+    for target, dev in tvm.testing.enabled_targets():
+        intrp = relay.create_executor("graph", device=dev, target=target)
         op_res = intrp.evaluate(func)(data)
         np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01)
@@ -1359,8 +1359,8 @@ def get_shape():
     ref = tvm.topi.testing.bilinear_resize_python(
         data, (int(round(h * scale_h)), int(round(w * scale_w))), layout
     )
-    for target, ctx in tvm.testing.enabled_targets():
-        executor = relay.create_executor("graph", ctx=ctx, target=target)
+    for target, dev in tvm.testing.enabled_targets():
+        executor = relay.create_executor("graph", device=dev, target=target)
         out = executor.evaluate(func)(data)
         tvm.testing.assert_allclose(out.asnumpy(), ref, rtol=1e-5, atol=1e-5)
@@ -1431,8 +1431,8 @@ def get_shape():
         (int(round(d * scale_d)), int(round(h * scale_h)), int(round(w * scale_w))),
         layout,
     )
-    for target, ctx in tvm.testing.enabled_targets():
-        executor = relay.create_executor("graph", ctx=ctx, target=target)
+    for target, dev in tvm.testing.enabled_targets():
+        executor = relay.create_executor("graph", device=dev, target=target)
         out = executor.evaluate(func)(data)
         tvm.testing.assert_allclose(out.asnumpy(), ref, rtol=1e-5, atol=1e-5)
@@ -1698,8 +1698,8 @@ def _test_correlation(
         is_multiply,
     )
-    for target, ctx in tvm.testing.enabled_targets():
-        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+    for target, dev in tvm.testing.enabled_targets():
+        intrp1 = relay.create_executor("graph", device=dev, target=target)
         op_res1 = intrp1.evaluate(func)(data1_np, data2_np)
         tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5)
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index 7e443aa62ee89..bf0a7e4952e57 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -217,9 +217,9 @@ def verify_transpose(dshape, axes):
     x_data = np.random.uniform(low=-1, high=1, size=dshape).astype("float32")
     ref_res = np.transpose(x_data, axes=axes)
-    for target, ctx in tvm.testing.enabled_targets():
+    for target, dev in tvm.testing.enabled_targets():
         for kind in ["graph", "debug"]:
-            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            intrp = relay.create_executor(kind, device=dev, target=target)
             op_res = intrp.evaluate(func)(x_data)
             tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
@@ -272,9 +272,9 @@ def verify_reshape(shape, newshape, oshape):
     check_grad(func)
     x_data = np.random.uniform(low=-1, high=1, size=shape).astype("float32")
     ref_res = np.reshape(x_data, oshape)
-    for target, ctx in tvm.testing.enabled_targets():
+    for target, dev in tvm.testing.enabled_targets():
         for kind in ["graph", "debug"]:
-            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            intrp = relay.create_executor(kind, device=dev, target=target)
             op_res = intrp.evaluate(func)(x_data)
             tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
@@ -361,9 +361,9 @@ def verify_reshape_like(shape, oshape, shape_like=None, reshape_like_kwargs={}):
     func = relay.Function([x, y], z)
-    for target, ctx in tvm.testing.enabled_targets():
+    for target, dev in tvm.testing.enabled_targets():
         for kind in ["graph", "debug"]:
-            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            intrp = relay.create_executor(kind, device=dev, target=target)
             op_res = intrp.evaluate(func)(x_data, y_data)
             tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
@@ -407,9 +407,9 @@ def verify_take(src_shape, indices_src, axis=None, mode="clip"):
     np_mode = "raise" if mode == "fast" else mode
     ref_res = np.take(x_data, indices=indices_src, axis=axis, mode=np_mode)
-    for target, ctx in tvm.testing.enabled_targets():
+    for target, dev in tvm.testing.enabled_targets():
         for kind in ["graph", "debug"]:
-            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            intrp = relay.create_executor(kind, device=dev, target=target)
             op_res = intrp.evaluate(func)(x_data, indices_src)
             tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
@@ -542,9 +542,9 @@ def verify_full(fill_value, src_shape, dtype):
     z = relay.full(x, src_shape, dtype)
     func = relay.Function([x], z)
     ref_res = np.full(src_shape, fill_value)
-    for target, ctx in tvm.testing.enabled_targets():
+    for target, dev in tvm.testing.enabled_targets():
         for kind in ["graph", "debug"]:
-            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            intrp = relay.create_executor(kind, device=dev, target=target)
             op_res = intrp.evaluate(func)(np.array(fill_value, dtype))
             tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
@@ -581,9 +581,9 @@ def verify_full_like(base, fill_value, dtype):
     func = relay.Function([x, y], z)
     ref_res = np.full_like(x_data, fill_value)
-    for target, ctx in tvm.testing.enabled_targets():
+    for target, dev in tvm.testing.enabled_targets():
         for kind in ["graph", "debug"]:
-            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            intrp = relay.create_executor(kind, device=dev, target=target)
             op_res = intrp.evaluate(func)(x_data, np.array(fill_value, dtype))
             tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
@@ -611,9 +611,9 @@ def test_infer_type_leaky_relu():
     x_data = np.random.uniform(low=-1, high=1, size=shape).astype(dtype)
     ref_res = np.where(x_data > 0, x_data, x_data * 0.1)
-    for target, ctx in tvm.testing.enabled_targets():
-        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
-        intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
+    for target, dev in tvm.testing.enabled_targets():
+        intrp1 = relay.create_executor("graph", device=dev, target=target)
+        intrp2 = relay.create_executor("debug", device=dev, target=target)
         op_res1 = intrp1.evaluate(func)(x_data)
         tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5)
         op_res2 = intrp2.evaluate(func)(x_data)
@@ -648,9 +648,9 @@ def verify_infer_type_prelu(data, alpha, axis, output, dtype="float32"):
     else:
         ref_res = (x_data < 0) * (x_data * a_data.reshape(1, 1, 3)) + (x_data >= 0) * x_data
-    for target, ctx in tvm.testing.enabled_targets():
-        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
-        intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
+    for target, dev in tvm.testing.enabled_targets():
+        intrp1 = relay.create_executor("graph", device=dev, target=target)
+        intrp2 = relay.create_executor("debug", device=dev, target=target)
         op_res1 = intrp1.evaluate(func)(x_data, a_data)
         tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5)
         op_res2 = intrp2.evaluate(func)(x_data, a_data)
@@ -692,9 +692,9 @@ def verify_arange(start, stop, step):
     ref_res = np.arange(start, stop, step).astype(dtype)
     func = relay.Function([], x)
-    for target, ctx in tvm.testing.enabled_targets():
+    for target, dev in tvm.testing.enabled_targets():
         for kind in ["graph", "debug"]:
-            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            intrp = relay.create_executor(kind, device=dev, target=target)
             op_res = intrp.evaluate(func)()
             tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
@@ -731,9 +731,9 @@ def verify_meshgrid(lengths, indexing="ij"):
     # Get ref
     ref_res = np.meshgrid(*input_data, indexing=indexing)
-    for target, ctx in tvm.testing.enabled_targets():
+    for target, dev in tvm.testing.enabled_targets():
         for kind in ["graph", "debug"]:
-            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            intrp = relay.create_executor(kind, device=dev, target=target)
             op_res = intrp.evaluate(func)(*input_data)
             assert len(op_res) == len(ref_res)
             for i in range(len(op_res)):
@@ -757,9 +757,9 @@ def verify_tile(dshape, reps):
     x_data = np.random.uniform(low=-1, high=1, size=dshape).astype("float32")
     ref_res = np.tile(x_data, reps=reps)
-    for target, ctx in tvm.testing.enabled_targets():
+    for target, dev in tvm.testing.enabled_targets():
         for kind in ["graph", "debug"]:
-            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            intrp = relay.create_executor(kind, device=dev, target=target)
             op_res = intrp.evaluate(func)(x_data)
             tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
@@ -775,9 +775,9 @@ def verify_repeat(dshape, repeats, axis):
     func = relay.Function([x], relay.repeat(x, repeats, axis))
     data = np.random.uniform(size=dshape).astype("float32")
     ref_res = np.repeat(data, repeats, axis)
-    for target, ctx in tvm.testing.enabled_targets():
+    for target, dev in tvm.testing.enabled_targets():
         for kind in ["graph", "debug"]:
-            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            intrp = relay.create_executor(kind, device=dev, target=target)
             op_res = intrp.evaluate(func)(data)
             tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
@@ -800,9 +800,9 @@ def verify_stack(input_expr, relay_args, ref_res, axis):
     inp_vars = relay.analysis.free_vars(z)
     func = relay.Function(inp_vars, z)
-    for target, ctx in tvm.testing.enabled_targets():
+    for target, dev in tvm.testing.enabled_targets():
         for kind in ["graph", "debug"]:
-            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            intrp = relay.create_executor(kind, device=dev, target=target)
             op_res = intrp.evaluate(func)(*relay_args)
             tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
@@ -852,9 +852,9 @@ def verify_reverse(dshape, axis):
     func = relay.Function([x], z)
     x_data = np.random.uniform(low=-1, high=1, size=dshape).astype("float32")
     ref_res = np.flip(x_data, axis)
-    for target, ctx in tvm.testing.enabled_targets():
+    for target, dev in tvm.testing.enabled_targets():
         for kind in ["graph", "debug"]:
-            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            intrp = relay.create_executor(kind, device=dev, target=target)
             op_res = intrp.evaluate(func)(x_data)
             tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
@@ -873,9 +873,9 @@ def verify_reverse_sequence(x_data, seq_lengths, batch_axis, seq_axis, ref_res):
     assert zz.checked_type == x.type_annotation
     func = relay.Function([x], z)
-    for target, ctx in tvm.testing.enabled_targets():
+    for target, dev in tvm.testing.enabled_targets():
         for kind in ["graph", "debug"]:
-            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            intrp = relay.create_executor(kind, device=dev, target=target)
             op_res = intrp.evaluate(func)(x_data)
             tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
@@ -967,9 +967,9 @@ def verify_scatter(dshape, ishape, axis=0):
     ref_res = ref_scatter(data_np, indices_np, updates_np, axis)
-    for target, ctx in tvm.testing.enabled_targets():
+    for target, dev in tvm.testing.enabled_targets():
         for kind in ["graph", "debug"]:
-            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            intrp = relay.create_executor(kind, device=dev, target=target)
             op_res = intrp.evaluate(func)(data_np, indices_np, updates_np)
             tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
@@ -987,10 +987,10 @@ def verify_dynamic_scatter(dshape, ishape, axis=0):
     ref_res = ref_scatter(data_np, indices_np, updates_np, axis)
-    for target, ctx in tvm.testing.enabled_targets():
+    for target, dev in tvm.testing.enabled_targets():
         for kind in ["vm", "debug"]:
             mod = tvm.ir.IRModule.from_expr(func)
-            intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target)
+            intrp = relay.create_executor(kind, mod=mod, device=dev, target=target)
             op_res = intrp.evaluate()(data_np, indices_np, updates_np)
             tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
@@ -1241,9 +1241,9 @@ def verify_gather(data, axis, indices, ref_res):
     func = relay.Function([d, i], z)
-    for target, ctx in tvm.testing.enabled_targets():
+    for target, dev in tvm.testing.enabled_targets():
         for kind in ["graph", "debug"]:
-            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            intrp = relay.create_executor(kind, device=dev, target=target)
             op_res = intrp.evaluate(func)(data, indices)
             tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
@@ -1261,9 +1261,9 @@ def verify_gather_nd(xshape, yshape, y_data):
     x_data = np.random.uniform(size=xshape).astype("float32")
     ref_res = x_data[tuple(y_data)]
-    for target, ctx in tvm.testing.enabled_targets():
+    for target, dev in tvm.testing.enabled_targets():
         for kind in ["graph", "debug"]:
-            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            intrp = relay.create_executor(kind, device=dev, target=target)
             op_res = intrp.evaluate(func)(x_data, y_data)
             tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
@@ -1321,9 +1321,9 @@ def verify_unravel_index(indices, shape, dtype):
     func = relay.Function([x, y], z)
     ref_res = np.unravel_index(x_data, y_data)
-    for target, ctx in tvm.testing.enabled_targets():
+    for target, dev in tvm.testing.enabled_targets():
         for kind in ["graph", "debug"]:
-            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            intrp = relay.create_executor(kind, device=dev, target=target)
             op_res = intrp.evaluate(func)(x_data, y_data)
             tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
@@ -1367,9 +1367,9 @@ def verify_sparse_to_dense(sparse_indices, sparse_values, default_value, output_
     assert zz.checked_type == relay.ty.TensorType(output_shape, str(sparse_values_data.dtype))
     func = relay.Function(args, d)
-    for target, ctx in tvm.testing.enabled_targets():
+    for target, dev in tvm.testing.enabled_targets():
         for kind in ["graph", "debug"]:
-            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            intrp = relay.create_executor(kind, device=dev, target=target)
             if default_value is None:
                 op_res = intrp.evaluate(func)(sparse_indices_data, sparse_values_data)
             else:
@@ -1707,12 +1707,12 @@ def verify_segment_sum(
         verify_segment_sum(data_np, segment_ids_np, num_segments)
-def verify_func(func, data, ref_res, target_ctx=tvm.testing.enabled_targets()):
+def verify_func(func, data, ref_res, target_device=tvm.testing.enabled_targets()):
     assert isinstance(data, list)
-    for target, ctx in target_ctx:
+    for target, dev in target_device:
         for kind in ["vm"]:
             mod = tvm.ir.IRModule.from_expr(func)
-            intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target)
+            intrp = relay.create_executor(kind, mod=mod, device=dev, target=target)
             op_res = intrp.evaluate()(*data)
             if isinstance(op_res, tvm.runtime.container.ADT):
                 assert len(op_res) == len(
@@ -1742,9 +1742,9 @@ def verify_adv_index(data_shape, index_shapes):
     out = relay.op.adv_index(inputs)
     func = relay.Function(inputs, out)
-    for target, ctx in tvm.testing.enabled_targets():
+    for target, dev in tvm.testing.enabled_targets():
         for kind in ["graph", "debug"]:
-            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            intrp = relay.create_executor(kind, device=dev, target=target)
             op_res = intrp.evaluate(func)(*np_args)
             tvm.testing.assert_allclose(op_res.asnumpy(), np_out, rtol=1e-5)
@@ -1763,7 +1763,7 @@ def verify_adv_index(data_shape, index_shapes):
 def run_binop_tests(
-    target, ctx, binop_type: str, gt_func: Callable[..., np.array], identity_value: int
+    target, dev, binop_type: str, gt_func: Callable[..., np.array], identity_value: int
 ):
     def assert_relay_scanop(
         data_np: np.array,
@@ -1782,7 +1782,7 @@ def assert_relay_scanop(
         func = relay.Function([inp], out)
         for kind in ["graph", "debug"]:
-            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            intrp = relay.create_executor(kind, device=dev, target=target)
             op_res = intrp.evaluate(func)(data_np)
             tvm.testing.assert_allclose(op_res.asnumpy(), np_out, rtol=rtol, atol=atol)
@@ -1822,17 +1822,17 @@ def assert_relay_scanop(
 @tvm.testing.parametrize_targets
-def test_cumsum(target, ctx):
-    run_binop_tests(target, ctx, binop_type="cumsum", gt_func=np.cumsum, identity_value=0)
+def test_cumsum(target, dev):
+    run_binop_tests(target, dev, binop_type="cumsum", gt_func=np.cumsum, identity_value=0)
 @tvm.testing.parametrize_targets
-def test_cumprod(target, ctx):
-    run_binop_tests(target, ctx, binop_type="cumprod", gt_func=np.cumprod, identity_value=1)
+def test_cumprod(target, dev):
+    run_binop_tests(target, dev, binop_type="cumprod", gt_func=np.cumprod, identity_value=1)
 @tvm.testing.parametrize_targets
-def test_scatter_nd(target, ctx):
+def test_scatter_nd(target, dev):
     def verify_scatter_nd(data_np, indices_np, shape, ref_res, rtol=1e-5, atol=1e-5):
         data = relay.var("data", shape=data_np.shape, dtype=str(data_np.dtype))
         indices = relay.var("indices", shape=indices_np.shape, dtype=str(indices_np.dtype))
@@ -1841,7 +1841,7 @@ def verify_scatter_nd(data_np, indices_np, shape, ref_res, rtol=1e-5, atol=1e-5)
         func = relay.Function([data, indices], out)
         for kind in ["graph", "debug"]:
-            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            intrp = relay.create_executor(kind, device=dev, target=target)
             op_res = intrp.evaluate(func)(data_np, indices_np)
             tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=rtol, atol=atol)
@@ -1868,7 +1868,7 @@ def verify_scatter_nd_with_stack(data_np, indices_np, shape, ref_res, rtol=1e-5,
         for a in indices_np:
             fargs.append(a)
         for kind in ["graph", "debug"]:
-            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            intrp = relay.create_executor(kind, device=dev, target=target)
             op_res = intrp.evaluate(func)(*fargs)
             tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=rtol, atol=atol)
@@ -1936,10 +1936,10 @@ def verify_unique(n, dtype, is_dyn=False, is_sorted=False, return_counts=False):
     else:
         backends = ["graph", "debug"]
-    for target, ctx in tvm.testing.enabled_targets():
+    for target, dev in tvm.testing.enabled_targets():
         for kind in backends:
             mod = tvm.ir.IRModule.from_expr(func)
-            intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target)
+            intrp = relay.create_executor(kind, mod=mod, device=dev, target=target)
             tvm_res = intrp.evaluate()(x_data)
             np_res = calc_numpy_unique(x_data, is_sorted)
             num_unique = np_res[3][0]
diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py
index 114783e55f209..036d4a0f60447 100644
--- a/tests/python/relay/test_op_level4.py
+++ b/tests/python/relay/test_op_level4.py
@@ -49,8 +49,8 @@ def check_binary_op(opfunc, ref):
         ref_res = ref(x_data, y_data)
         func = relay.Function([x, y], z)
-        for target, ctx in tvm.testing.enabled_targets():
-            intrp = relay.create_executor("graph", ctx=ctx, target=target)
+        for target, dev in tvm.testing.enabled_targets():
+            intrp = relay.create_executor("graph", device=dev, target=target)
             op_res = intrp.evaluate(func)(x_data, y_data)
             tvm.testing.assert_allclose(op_res.asnumpy(), ref_res)
@@ -87,8 +87,8 @@ def test_cmp_type():
         ref_res = ref(x_data, y_data)
         func = relay.Function([x, y], z)
-        for target, ctx in tvm.testing.enabled_targets():
-            intrp = relay.create_executor("graph", ctx=ctx, target=target)
+        for target, dev in tvm.testing.enabled_targets():
+            intrp = relay.create_executor("graph", device=dev, target=target)
             op_res = intrp.evaluate(func)(x_data, y_data)
             tvm.testing.assert_allclose(op_res.asnumpy(), ref_res)
@@ -112,8 +112,8 @@ def test_binary_int_broadcast_1():
         func = relay.Function([x, y], z)
         ref_res = ref(x_data, y_data)
-        for target, ctx in tvm.testing.enabled_targets():
-            intrp = relay.create_executor("graph", ctx=ctx, target=target)
+        for target, dev in tvm.testing.enabled_targets():
+            intrp = relay.create_executor("graph", device=dev, target=target)
             op_res = intrp.evaluate(func)(x_data, y_data)
             tvm.testing.assert_allclose(op_res.asnumpy(), ref_res)
@@ -137,8 +137,8 @@ def test_binary_int_broadcast_2():
         func = relay.Function([x, y], z)
         ref_res = ref(x_data, y_data)
-        for target, ctx in tvm.testing.enabled_targets():
-            intrp = relay.create_executor("graph", ctx=ctx, target=target)
+        for target, dev in tvm.testing.enabled_targets():
+            intrp = relay.create_executor("graph", device=dev, target=target)
             op_res = intrp.evaluate(func)(x_data, y_data)
             tvm.testing.assert_allclose(op_res.asnumpy(), ref_res)
@@ -146,9 +146,9 @@ def test_binary_int_broadcast_2():
 @tvm.testing.uses_gpu
 def test_where():
     def run(func, inputs, ref_res):
-        for target, ctx in tvm.testing.enabled_targets():
+        for target, dev in tvm.testing.enabled_targets():
             for kind in ["graph", "debug"]:
-                intrp = relay.create_executor(kind, ctx=ctx, target=target)
+                intrp = relay.create_executor(kind, device=dev, target=target)
                 op_res = intrp.evaluate(func)(*inputs)
                 tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
@@ -257,9 +257,9 @@ def verify_reduce(funcs, data, axis, keepdims, exclude, output, dtype="float32")
         return
     ref_res = ref_func(x_data + 0, axis=axis, keepdims=keepdims)
-    for target, ctx in tvm.testing.enabled_targets():
-        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
-        intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
+    for target, dev in tvm.testing.enabled_targets():
+        intrp1 = relay.create_executor("graph", device=dev, target=target)
+        intrp2 = relay.create_executor("debug", device=dev, target=target)
         op_res1 = intrp1.evaluate(func)(x_data)
         tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5)
         op_res2 = intrp2.evaluate(func)(x_data)
@@ -351,9 +351,9 @@ def verify_mean_var_std(funcs, shape, axis, keepdims):
     ref_mean = np.mean(x_data, axis=axis, dtype=dtype, keepdims=keepdims)
     ref_res = ref_func(x_data, axis=axis, dtype=dtype, keepdims=keepdims)
-    for target, ctx in tvm.testing.enabled_targets():
-        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
-        intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
+    for target, dev in tvm.testing.enabled_targets():
+        intrp1 = relay.create_executor("graph", device=dev, target=target)
+        intrp2 = relay.create_executor("debug", device=dev, target=target)
         op_res1 = intrp1.evaluate(func)(x_data)
         tvm.testing.assert_allclose(op_res1[0].asnumpy(), ref_mean, rtol=1e-5)
         tvm.testing.assert_allclose(op_res1[1].asnumpy(), ref_res, rtol=1e-5)
@@ -405,8 +405,8 @@ def verify(dshape, begin, end, strides, output, slice_mode="end", test_ref=True,
         if not test_ref:
             return
-        for target, ctx in tvm.testing.enabled_targets():
-            intrp = relay.create_executor("graph", ctx=ctx, target=target)
+        for target, dev in tvm.testing.enabled_targets():
+            intrp = relay.create_executor("graph", device=dev, target=target)
             op_res = intrp.evaluate(func)(x_data)
             tvm.testing.assert_allclose(op_res.asnumpy(), ref_res)
@@ -461,9 +461,9 @@ def verify(dshape, begin, end, strides, output, slice_mode="end", test_ref=True,
         if not test_ref:
             return
-        for target, ctx in tvm.testing.enabled_targets():
+        for target, dev in tvm.testing.enabled_targets():
             mod = tvm.ir.IRModule.from_expr(func)
-            intrp = relay.create_executor("vm", mod=mod, ctx=ctx, target=target)
+            intrp = relay.create_executor("vm", mod=mod, device=dev, target=target)
             op_res = intrp.evaluate()(x_data)
             tvm.testing.assert_allclose(op_res.asnumpy(), ref_res)
@@ -513,8 +513,8 @@ def verify(dshape, begin, end, strides, vshape, test_ref=True):
     x_data = np.random.uniform(size=dshape).astype("float32")
     v_data = np.random.uniform(size=vshape).astype("float32")
     ref_res = tvm.topi.testing.strided_set_python(x_data, v_data, begin, end, strides)
-    for target, ctx in tvm.testing.enabled_targets():
-        intrp = relay.create_executor("graph", ctx=ctx, target=target)
+    for target, dev in tvm.testing.enabled_targets():
+        intrp = relay.create_executor("graph", device=dev, target=target)
         op_res = intrp.evaluate(func)(x_data, v_data)
         tvm.testing.assert_allclose(op_res.asnumpy(), ref_res)
diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py
index 929764b6e40ad..2d6c8b50fd371 100644
--- a/tests/python/relay/test_op_level5.py
+++ b/tests/python/relay/test_op_level5.py
@@ -63,9 +63,9 @@ def verify_resize(dshape, scale, method, layout, coord_trans):
         assert zz.checked_type == relay.TensorType(ref_res.shape, "float32")
         func = relay.Function([x], z)
-        for target, ctx in tvm.testing.enabled_targets():
+        for target, dev in tvm.testing.enabled_targets():
             for kind in ["graph", "debug"]:
-                intrp = relay.create_executor(kind, ctx=ctx, target=target)
+                intrp = relay.create_executor(kind, device=dev, target=target)
                 op_res = intrp.evaluate(func)(x_data)
                 tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-3, atol=1e-4)
@@ -104,7 +104,7 @@ def test_resize3d_infer_type():
 @tvm.testing.parametrize_targets
-def test_resize3d(target, ctx):
+def test_resize3d(target, dev):
     def verify_resize(dshape, scale, method, layout):
         if layout == "NDHWC":
             size = (dshape[1] * scale, dshape[2] * scale, dshape[3] * scale)
@@ -124,7 +124,7 @@ def verify_resize(dshape, scale, method, layout):
         func = relay.Function([x], z)
         for kind in ["graph", "debug"]:
-            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            intrp = relay.create_executor(kind, device=dev, target=target)
             op_res = intrp.evaluate(func)(x_data)
             tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-4, atol=1e-6)
@@ -156,9 +156,9 @@ def verify_crop_and_resize(
         assert zz.checked_type == relay.TensorType(ref_res.shape, "float32")
         func = relay.Function([img, bx, bx_idx], z)
-        for target, ctx in tvm.testing.enabled_targets():
+        for target, dev in tvm.testing.enabled_targets():
             for kind in ["graph", "debug"]:
-                intrp = relay.create_executor(kind, ctx=ctx, target=target)
+                intrp = relay.create_executor(kind, device=dev, target=target)
                 op_res = intrp.evaluate(func)(image_data, boxes, box_indices)
                 tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-3, atol=1e-04)
@@ -257,11 +257,11 @@ def verify_multibox_prior(
     data = np.random.uniform(low=-1, high=1, size=dshape).astype("float32")
     func = relay.Function([x], z)
     func = run_infer_type(func)
-    for target, ctx in tvm.testing.enabled_targets():
-        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+    for target, dev in tvm.testing.enabled_targets():
+        intrp1 = relay.create_executor("graph", device=dev, target=target)
         op_res1 = intrp1.evaluate(func)(data)
         tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5)
-        intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
+        intrp2 = relay.create_executor("debug", device=dev, target=target)
         op_res2 = intrp2.evaluate(func)(data)
         tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5)
@@ -316,8 +316,8 @@ def verify_get_valid_counts(dshape, score_threshold, id_index, score_index):
     assert "score_threshold" in z.astext()
     func = relay.Function([x], z.astuple())
     func = run_infer_type(func)
-    for target, ctx in tvm.testing.enabled_targets():
-        intrp = relay.create_executor("debug", ctx=ctx, target=target)
+    for target, dev in tvm.testing.enabled_targets():
+        intrp = relay.create_executor("debug", device=dev, target=target)
         out = intrp.evaluate(func)(np_data)
         tvm.testing.assert_allclose(out[0].asnumpy(), np_out1, rtol=1e-3, atol=1e-04)
@@ -390,11 +390,11 @@ def verify_nms(
     func = run_infer_type(func)
     func_indices = relay.Function([x0, x1, x2, x3], z_indices)
     func_indices = run_infer_type(func_indices)
-    for target, ctx in tvm.testing.enabled_targets():
-        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
relay.create_executor("graph", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp1 = relay.create_executor("graph", device=dev, target=target) op_res1 = intrp1.evaluate(func)(x0_data, x1_data, x2_data, x3_data) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5) - intrp2 = relay.create_executor("debug", ctx=ctx, target=target) + intrp2 = relay.create_executor("debug", device=dev, target=target) op_res2 = intrp2.evaluate(func)(x0_data, x1_data, x2_data, x3_data) tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) op_indices_res1 = intrp1.evaluate(func_indices)(x0_data, x1_data, x2_data, x3_data) @@ -581,11 +581,11 @@ def test_default_value(): nms = relay.vision.non_max_suppression(mtl[0], mtl[1], mtl[0], return_indices=False) func = relay.Function([cls_prob, loc_pred, anchors], nms) func = run_infer_type(func) - for target, ctx in tvm.testing.enabled_targets(): - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp1 = relay.create_executor("graph", device=dev, target=target) op_res1 = intrp1.evaluate(func)(np_cls_prob, np_loc_preds, np_anchors) tvm.testing.assert_allclose(op_res1.asnumpy(), expected_np_out, rtol=1e-5) - intrp2 = relay.create_executor("debug", ctx=ctx, target=target) + intrp2 = relay.create_executor("debug", device=dev, target=target) op_res2 = intrp2.evaluate(func)(np_cls_prob, np_loc_preds, np_anchors) tvm.testing.assert_allclose(op_res2.asnumpy(), expected_np_out, rtol=1e-5) @@ -674,12 +674,12 @@ def verify_roi_align( sample_ratio=sample_ratio, mode=mode, ) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): print("test on", target) - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + intrp1 = relay.create_executor("graph", device=dev, target=target) op_res1 = intrp1.evaluate(func)(np_data, np_rois) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-4) - intrp2 = relay.create_executor("debug", ctx=ctx, target=target) + intrp2 = relay.create_executor("debug", device=dev, target=target) op_res2 = intrp2.evaluate(func)(np_data, np_rois) tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-4) @@ -770,11 +770,11 @@ def verify_roi_pool(data_shape, rois_shape, pooled_size, spatial_scale): ref_res = tvm.topi.testing.roi_pool_nchw_python( np_data, np_rois, pooled_size=pooled_size, spatial_scale=spatial_scale ) - for target, ctx in tvm.testing.enabled_targets(): - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp1 = relay.create_executor("graph", device=dev, target=target) op_res1 = intrp1.evaluate(func)(np_data, np_rois) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-4) - intrp2 = relay.create_executor("debug", ctx=ctx, target=target) + intrp2 = relay.create_executor("debug", device=dev, target=target) op_res2 = intrp2.evaluate(func)(np_data, np_rois) tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-4) @@ -798,11 +798,11 @@ def verify_proposal(np_cls_prob, np_bbox_pred, np_im_info, np_out, attrs): if not tvm.testing.device_enabled(target): print("Skip test because %s is not enabled." 
% target) continue - ctx = tvm.context(target, 0) - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + dev = tvm.device(target, 0) + intrp1 = relay.create_executor("graph", device=dev, target=target) op_res1 = intrp1.evaluate(func)(np_cls_prob, np_bbox_pred, np_im_info) tvm.testing.assert_allclose(op_res1.asnumpy(), np_out, rtol=1e-4) - intrp2 = relay.create_executor("debug", ctx=ctx, target=target) + intrp2 = relay.create_executor("debug", device=dev, target=target) op_res2 = intrp2.evaluate(func)(np_cls_prob, np_bbox_pred, np_im_info) tvm.testing.assert_allclose(op_res2.asnumpy(), np_out, rtol=1e-4) @@ -891,9 +891,9 @@ def verify_yolo_reorg(shape, stride): func = relay.Function([x], z) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) @@ -1024,11 +1024,11 @@ def test_run(batch, in_channel, size, out_channel, deformable_groups, groups, la deformable_groups=deformable_groups, groups=groups, ) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): if target == "cuda" and layout == "NHWC": continue # Cannot run NHWC layout on cuda target, only on llvm for kind in ["graph", "debug"]: - intrp1 = relay.create_executor(kind, ctx=ctx, target=target) + intrp1 = relay.create_executor(kind, device=dev, target=target) op_res1 = intrp1.evaluate(func)(data, offset, kernel) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) @@ -1071,9 +1071,9 @@ def verify_depth_to_space(dshape, block_size, layout, mode): assert zz.checked_type == relay.TensorType(ref_res.shape, "float32") func = relay.Function([x], z) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-4) @@ -1115,9 +1115,9 @@ def verify_space_to_depth(dshape, block_size, layout): assert zz.checked_type == relay.TensorType(ref_res.shape, "float32") func = relay.Function([x], z) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-4) @@ -1170,10 +1170,10 @@ def run_test_dilation2d( ) func = relay.Function([x, w], y) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): if target in except_targets: continue - intrp = relay.create_executor("graph", ctx=ctx, target=target) + intrp = relay.create_executor("graph", device=dev, target=target) op_res = intrp.evaluate(func)(indata, kernel) tvm.testing.assert_allclose(op_res.asnumpy(), out, rtol=1e-5, atol=1e-5) @@ -1273,9 +1273,9 @@ def verify_affine_grid(num_batch, target_shape): data_np = np.random.uniform(size=data_shape).astype(dtype) ref_res = tvm.topi.testing.affine_grid_python(data_np, target_shape) - for target, ctx in 
tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp1 = relay.create_executor(kind, ctx=ctx, target=target) + intrp1 = relay.create_executor(kind, device=dev, target=target) op_res1 = intrp1.evaluate(func)(data_np) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) @@ -1300,9 +1300,9 @@ def verify_grid_sample(data_shape, grid_shape): grid_np = np.random.uniform(size=grid_shape, low=-1.5, high=1.5).astype(dtype) ref_res = tvm.topi.testing.grid_sample_nchw_python(data_np, grid_np, method="bilinear") - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp1 = relay.create_executor(kind, ctx=ctx, target=target) + intrp1 = relay.create_executor(kind, device=dev, target=target) op_res1 = intrp1.evaluate(func)(data_np, grid_np) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) @@ -1327,9 +1327,9 @@ def verify_space_to_batch_nd(dshape, block_shape, paddings): assert zz.checked_type == relay.TensorType(ref_res.shape, "float32") func = relay.Function([x], z) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-4) @@ -1354,9 +1354,9 @@ def verify_batch_to_space_nd(dshape, block_shape, crops): assert zz.checked_type == relay.TensorType(ref_res.shape, "float32") func = relay.Function([x], z) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-4) diff --git a/tests/python/relay/test_op_level6.py b/tests/python/relay/test_op_level6.py index f4b785f59df84..f0c66247329d0 100644 --- a/tests/python/relay/test_op_level6.py +++ b/tests/python/relay/test_op_level6.py @@ -43,10 +43,10 @@ def verify_sort(shape, axis, is_ascend, is_dyn=False): backends = ["vm", "debug"] else: backends = ["graph", "debug"] - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in backends: mod = tvm.ir.IRModule.from_expr(func) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) @@ -77,10 +77,10 @@ def verify_argsort(shape, axis, is_ascend, dtype, is_dyn=False): backends = ["vm", "debug"] else: backends = ["graph", "debug"] - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in backends: mod = tvm.ir.IRModule.from_expr(func) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.astype(dtype), rtol=1e-5) @@ -122,9 +122,9 @@ def verify_topk(k, axis, ret_type, is_ascend, dtype): np_values[i, :] = np_data[i, np_indices[i, :]] np_indices 
= np_indices.astype(dtype) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(np_data) if ret_type == "both": tvm.testing.assert_allclose(op_res[0].asnumpy(), np_values) diff --git a/tests/python/relay/test_op_qnn_add.py b/tests/python/relay/test_op_qnn_add.py index 6f33a7bb0b51e..b37ddc2c227c4 100644 --- a/tests/python/relay/test_op_qnn_add.py +++ b/tests/python/relay/test_op_qnn_add.py @@ -63,7 +63,7 @@ def test_tflite_same_io_qnn_params(): y_data = y_datas[i] golden_output = golden_outputs[i] - intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm") + intrp = relay.create_executor("graph", device=tvm.cpu(0), target="llvm") op_res = intrp.evaluate(func)(x_data, y_data) np.testing.assert_equal(op_res.asnumpy(), golden_output) @@ -111,7 +111,7 @@ def test_tflite_different_io_qnn_params(): y_data = y_datas[i] golden_output = golden_outputs[i] - intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm") + intrp = relay.create_executor("graph", device=tvm.cpu(0), target="llvm") op_res = intrp.evaluate(func)(x_data, y_data) np.testing.assert_equal(op_res.asnumpy(), golden_output) @@ -143,7 +143,7 @@ def test_saturation(): y_data = np.array((255, 255, 128, 0)).reshape((1, 4)) golden_output = np.array((255, 255, 129, 0)).reshape((1, 4)) - intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm") + intrp = relay.create_executor("graph", device=tvm.cpu(0), target="llvm") op_res = intrp.evaluate(func)(x_data, y_data) np.testing.assert_equal(op_res.asnumpy(), golden_output) @@ -169,7 +169,7 @@ def test_saturation(): y_data = np.array((255, 255, 127, 0)).reshape((1, 4)) golden_output = np.array((255, 129, 65, 0)).reshape((1, 4)) - intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm") + intrp = relay.create_executor("graph", device=tvm.cpu(0), target="llvm") op_res = intrp.evaluate(func)(x_data, y_data) np.testing.assert_equal(op_res.asnumpy(), golden_output) @@ -195,7 +195,7 @@ def test_saturation(): y_data = np.array((255, 255, 127, 0)).reshape((1, 4)) golden_output = np.array((255, 129, 65, 0)).reshape((1, 4)) - intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm") + intrp = relay.create_executor("graph", device=tvm.cpu(0), target="llvm") op_res = intrp.evaluate(func)(x_data, y_data) np.testing.assert_equal(op_res.asnumpy(), golden_output) @@ -221,7 +221,7 @@ def test_saturation(): y_data = np.array((0, 128, 64, 0)).reshape((1, 4)) golden_output = np.array((255, 255, 132, 0)).reshape((1, 4)) - intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm") + intrp = relay.create_executor("graph", device=tvm.cpu(0), target="llvm") op_res = intrp.evaluate(func)(x_data, y_data) np.testing.assert_equal(op_res.asnumpy(), golden_output) diff --git a/tests/python/relay/test_op_qnn_concatenate.py b/tests/python/relay/test_op_qnn_concatenate.py index 55836dc1ee52d..a9207f67fc68d 100644 --- a/tests/python/relay/test_op_qnn_concatenate.py +++ b/tests/python/relay/test_op_qnn_concatenate.py @@ -51,7 +51,7 @@ def test_same_io_qnn_params(): golden_output = np.concatenate((x_data, y_data), axis=axis) - intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm") + intrp = relay.create_executor("graph", device=tvm.cpu(0), target="llvm") op_res = intrp.evaluate(func)(x_data, y_data) 
     np.testing.assert_equal(op_res.asnumpy(), golden_output)
@@ -86,7 +86,7 @@ def test_different_io_qnn_params():
     golden_output = np.concatenate((x_data - 2, y_data - 3), axis=axis)
-    intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm")
+    intrp = relay.create_executor("graph", device=tvm.cpu(0), target="llvm")
     op_res = intrp.evaluate(func)(x_data, y_data)
     np.testing.assert_equal(op_res.asnumpy(), golden_output)
@@ -121,7 +121,7 @@ def test_few_same_io_qnn_params():
     golden_output = np.concatenate((x_data + 1, y_data), axis=axis)
-    intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm")
+    intrp = relay.create_executor("graph", device=tvm.cpu(0), target="llvm")
     op_res = intrp.evaluate(func)(x_data, y_data)
     np.testing.assert_equal(op_res.asnumpy(), golden_output)
@@ -156,7 +156,7 @@ def test_same_i_qnn_params():
     golden_output = np.concatenate((x_data + 1, y_data + 1), axis=axis)
-    intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm")
+    intrp = relay.create_executor("graph", device=tvm.cpu(0), target="llvm")
     op_res = intrp.evaluate(func)(x_data, y_data)
     np.testing.assert_equal(op_res.asnumpy(), golden_output)
@@ -183,7 +183,7 @@ def test_call_input():
     )
     func = relay.Function([x], z)
-    intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm")
+    intrp = relay.create_executor("graph", device=tvm.cpu(0), target="llvm")
     op_res = intrp.evaluate(func)(x_data)
     np.testing.assert_equal(op_res.asnumpy(), x_data)
diff --git a/tests/python/relay/test_op_qnn_conv2_transpose.py b/tests/python/relay/test_op_qnn_conv2_transpose.py
index a86f9e1c6a800..93776d265a49b 100644
--- a/tests/python/relay/test_op_qnn_conv2_transpose.py
+++ b/tests/python/relay/test_op_qnn_conv2_transpose.py
@@ -191,7 +191,7 @@ def get_output(func, golden_inputs):
     golden_data, golden_weight = golden_inputs
     params = {"kernel": golden_weight}
     graph, lib, params = relay.build(func, "llvm", params=params)
-    mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0))
+    mod = graph_runtime.create(graph, lib, device=tvm.cpu(0))
     mod.set_input("data", golden_data)
     mod.set_input(**params)
     mod.run()
diff --git a/tests/python/relay/test_op_qnn_conv2d.py b/tests/python/relay/test_op_qnn_conv2d.py
index 67d4c6f0b8079..87b1f732e2bdd 100644
--- a/tests/python/relay/test_op_qnn_conv2d.py
+++ b/tests/python/relay/test_op_qnn_conv2d.py
@@ -198,7 +198,7 @@ def get_output(func, golden_inputs):
     golden_data, golden_weight = golden_inputs
     params = {"kernel": golden_weight}
     graph, lib, params = relay.build(func, "llvm", params=params)
-    mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0))
+    mod = graph_runtime.create(graph, lib, device=tvm.cpu(0))
     mod.set_input("data", golden_data)
     mod.set_input(**params)
     mod.run()
@@ -722,7 +722,7 @@ def test_tflite_large_irregular():
     with tvm.transform.PassContext(opt_level=2):
         params = {"kernel": golden_weight}
         graph, lib, params = relay.build(qnn_func, "llvm", params=params)
-        mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0))
+        mod = graph_runtime.create(graph, lib, device=tvm.cpu(0))
         mod.set_input("data", golden_data)
         mod.set_input(**params)
         mod.run()
@@ -767,7 +767,7 @@ def test_tflite_output_multiplier_greater_than_one():
     with tvm.transform.PassContext(opt_level=2):
         params = {"kernel": golden_weight}
         graph, lib, params = relay.build(qnn_func, "llvm", params=params)
-        mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0))
+        mod = graph_runtime.create(graph, lib, device=tvm.cpu(0))
         mod.set_input("data", golden_data)
         mod.set_input(**params)
         mod.run()
@@ -830,7 +830,7 @@ def test_tflite_anistropic_strides():
     with tvm.transform.PassContext(opt_level=2):
         params = {"kernel": golden_weight}
         graph, lib, params = relay.build(qnn_func, "llvm", params=params)
-        mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0))
+        mod = graph_runtime.create(graph, lib, device=tvm.cpu(0))
         mod.set_input("data", golden_data)
         mod.set_input(**params)
         mod.run()
diff --git a/tests/python/relay/test_op_qnn_dense.py b/tests/python/relay/test_op_qnn_dense.py
index 923940b5382df..ef74ce3edcf61 100644
--- a/tests/python/relay/test_op_qnn_dense.py
+++ b/tests/python/relay/test_op_qnn_dense.py
@@ -211,7 +211,7 @@ def qnn_dense_driver(test_configuration):
     mod = relay.qnn.transform.CanonicalizeOps()(mod)
     with tvm.transform.PassContext(opt_level=2):
         graph, lib, params = relay.build(mod, "llvm", params=None)
-    mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0))
+    mod = graph_runtime.create(graph, lib, device=tvm.cpu(0))
     mod.set_input(quantized_data_name, test_configuration[quantized_data_name])
     mod.set_input(quantized_kernel_name, test_configuration[quantized_kernel_name])
     if test_configuration[bias_name] is not None:
diff --git a/tests/python/relay/test_op_qnn_dequantize.py b/tests/python/relay/test_op_qnn_dequantize.py
index 1833458fdb755..85f97a9c073c5 100644
--- a/tests/python/relay/test_op_qnn_dequantize.py
+++ b/tests/python/relay/test_op_qnn_dequantize.py
@@ -35,7 +35,7 @@ def dequantize_test_driver(in_dtype, quant_args, in_data, verify_output_data, ax
     mod = tvm.IRModule.from_expr(mod)
     with tvm.transform.PassContext(opt_level=3):
         graph, lib, params = relay.build(mod, "llvm", params=None)
-    rt_mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0))
+    rt_mod = graph_runtime.create(graph, lib, device=tvm.cpu(0))
     rt_mod.set_input(input_data=in_data)
     rt_mod.set_input(**params)
     rt_mod.run()
@@ -135,12 +135,12 @@ def test_dynamic_dequantize():
     mod = tvm.ir.IRModule.from_expr(func)
-    for target, ctx in tvm.testing.enabled_targets():
+    for target, dev in tvm.testing.enabled_targets():
         # TODO: (electriclilies) enable AlterOpLayout when it is fixed
         with relay.build_config(opt_level=3, disabled_pass=["AlterOpLayout"]):
             lib = relay.build(mod, target=target)
-        module = graph_runtime.GraphModule(lib["default"](ctx))
+        module = graph_runtime.GraphModule(lib["default"](dev))
         module.set_input(**{"x": data, "scale": scale, "zp": zp})
         module.run()
diff --git a/tests/python/relay/test_op_qnn_mul.py b/tests/python/relay/test_op_qnn_mul.py
index 7a846cbf47175..10721b5732348 100644
--- a/tests/python/relay/test_op_qnn_mul.py
+++ b/tests/python/relay/test_op_qnn_mul.py
@@ -80,7 +80,7 @@ def test_tflite_same_io_qnn_params():
         y_rec = recover(y_data, rhs_scale, rhs_zero_point)
         golden = generate_golden_output(x_rec, y_rec, output_scale, output_zero_point)
-        intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm")
+        intrp = relay.create_executor("graph", device=tvm.cpu(0), target="llvm")
         op_res = intrp.evaluate(func)(x_data, y_data)
         np.testing.assert_equal(op_res.asnumpy(), np.uint8(golden))
@@ -134,7 +134,7 @@ def test_tflite_different_io_qnn_params():
         y_rec = recover(y_data, rhs_scale, rhs_zero_point)
         golden = generate_golden_output(x_rec, y_rec, output_scale, output_zero_point)
-        intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm")
+        intrp = relay.create_executor("graph", device=tvm.cpu(0), target="llvm")
         op_res = intrp.evaluate(func)(x_data, y_data)
         np.testing.assert_equal(op_res.asnumpy(), np.uint8(golden))
@@ -172,7 +172,7 @@ def test_saturation():
     golden = generate_golden_output(x_rec, y_rec, output_scale, output_zero_point)
-    intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm")
+    intrp = relay.create_executor("graph", device=tvm.cpu(0), target="llvm")
     op_res = intrp.evaluate(func)(x_data, y_data)
     np.testing.assert_equal(op_res.asnumpy(), np.uint8(golden))
@@ -206,7 +206,7 @@ def test_saturation():
     golden = generate_golden_output(x_rec, y_rec, output_scale, output_zero_point)
-    intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm")
+    intrp = relay.create_executor("graph", device=tvm.cpu(0), target="llvm")
     op_res = intrp.evaluate(func)(x_data, y_data)
     np.testing.assert_equal(op_res.asnumpy(), np.uint8(golden))
@@ -241,7 +241,7 @@ def test_saturation():
     golden = generate_golden_output(x_rec, y_rec, output_scale, output_zero_point)
-    intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm")
+    intrp = relay.create_executor("graph", device=tvm.cpu(0), target="llvm")
     op_res = intrp.evaluate(func)(x_data, y_data)
     np.testing.assert_equal(op_res.asnumpy(), np.uint8(golden))
diff --git a/tests/python/relay/test_op_qnn_quantize.py b/tests/python/relay/test_op_qnn_quantize.py
index b300c56121747..e92344f7dcfaa 100644
--- a/tests/python/relay/test_op_qnn_quantize.py
+++ b/tests/python/relay/test_op_qnn_quantize.py
@@ -39,7 +39,7 @@ def quantize_test_driver(in_dtype, quant_args, axis, out_dtype, in_data, verify_
     mod = tvm.IRModule.from_expr(mod)
     with tvm.transform.PassContext(opt_level=3):
         graph, lib, params = relay.build(mod, "llvm", params=None)
-    rt_mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0))
+    rt_mod = graph_runtime.create(graph, lib, device=tvm.cpu(0))
     rt_mod.set_input(input_data=in_data)
     rt_mod.set_input(**params)
     rt_mod.run()
@@ -150,12 +150,12 @@ def test_dynamic_quantize():
     mod = tvm.ir.IRModule.from_expr(func)
-    for target, ctx in tvm.testing.enabled_targets():
+    for target, dev in tvm.testing.enabled_targets():
         # TODO: (electriclilies) enable AlterOpLayout when it is fixed
         with relay.build_config(opt_level=3, disabled_pass=["AlterOpLayout"]):
             lib = relay.build(mod, target=target)
-        module = graph_runtime.GraphModule(lib["default"](ctx))
+        module = graph_runtime.GraphModule(lib["default"](dev))
         module.set_input(**{"x": data, "scale": scale, "zp": zp})
         module.run()
diff --git a/tests/python/relay/test_op_qnn_requantize.py b/tests/python/relay/test_op_qnn_requantize.py
index f40a087114518..5aa3f7f7fdc8a 100644
--- a/tests/python/relay/test_op_qnn_requantize.py
+++ b/tests/python/relay/test_op_qnn_requantize.py
@@ -28,7 +28,7 @@ def verify(mod, goldens):
     with tvm.transform.PassContext(opt_level=3):
         graph, lib, params = relay.build(mod, "llvm", params=None)
     golden_data, golden_output = goldens
-    rt_mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0))
+    rt_mod = graph_runtime.create(graph, lib, device=tvm.cpu(0))
     rt_mod.set_input("quantized_data", golden_data)
     rt_mod.set_input(**params)
     rt_mod.run()
diff --git a/tests/python/relay/test_op_qnn_simulated_dequantize.py b/tests/python/relay/test_op_qnn_simulated_dequantize.py
index a9333c916561e..266e9d73b3cbc 100644
--- a/tests/python/relay/test_op_qnn_simulated_dequantize.py
+++ b/tests/python/relay/test_op_qnn_simulated_dequantize.py
@@ -39,7 +39,7 @@ def dequantize_test_driver(in_dtype, quant_args, axis, in_data):
     mod = tvm.IRModule.from_expr(mod)
     with tvm.transform.PassContext(opt_level=3):
         graph, lib, params = relay.build(mod, "llvm", params=None)
-    rt_mod =
graph_runtime.create(graph, lib, device=tvm.cpu(0)) rt_mod.set_input(input_data=in_data) rt_mod.set_input(**params) rt_mod.run() diff --git a/tests/python/relay/test_op_qnn_simulated_quantize.py b/tests/python/relay/test_op_qnn_simulated_quantize.py index c0fa0648d8790..8a15a037d8bae 100644 --- a/tests/python/relay/test_op_qnn_simulated_quantize.py +++ b/tests/python/relay/test_op_qnn_simulated_quantize.py @@ -47,7 +47,7 @@ def quantize_test_driver(in_dtype, quant_args, axis, out_dtype, in_data): mod = tvm.IRModule.from_expr(mod) with tvm.transform.PassContext(opt_level=3): graph, lib, params = relay.build(mod, "llvm", params=None) - rt_mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0)) + rt_mod = graph_runtime.create(graph, lib, device=tvm.cpu(0)) rt_mod.set_input(input_data=in_data) rt_mod.set_input(**params) rt_mod.run() diff --git a/tests/python/relay/test_op_qnn_subtract.py b/tests/python/relay/test_op_qnn_subtract.py index a76b05c31564c..fb55cdc948441 100644 --- a/tests/python/relay/test_op_qnn_subtract.py +++ b/tests/python/relay/test_op_qnn_subtract.py @@ -52,7 +52,7 @@ def qnn_subtract_driver(x_datas, y_datas, golden_outputs, scale_and_zp, data_dty x_data = x_datas[i] y_data = y_datas[i] golden_output = golden_outputs[i] - intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm") + intrp = relay.create_executor("graph", device=tvm.cpu(0), target="llvm") op_res = intrp.evaluate(func)(x_data, y_data) np.testing.assert_equal(op_res.asnumpy(), golden_output) diff --git a/tests/python/relay/test_param_dict.py b/tests/python/relay/test_param_dict.py index 29e0b5c0463b0..7bc4ab422a64c 100644 --- a/tests/python/relay/test_param_dict.py +++ b/tests/python/relay/test_param_dict.py @@ -75,11 +75,11 @@ def verify_graph_runtime(remote, target, shape, dtype): lib.save(path_dso) remote.upload(path_dso) lib = remote.load_module("dev_lib.o") - ctx = remote.cpu(0) - mod = graph_runtime.create(graph, lib, ctx) + dev = remote.cpu(0) + mod = graph_runtime.create(graph, lib, dev) mod.load_params(runtime.save_param_dict(params)) mod.run() - out = mod.get_output(0, tvm.nd.empty(shape, dtype=dtype, ctx=ctx)) + out = mod.get_output(0, tvm.nd.empty(shape, dtype=dtype, device=dev)) tvm.testing.assert_allclose(x_in + 1, out.asnumpy()) print("Test RPC connection to PowerPC...") diff --git a/tests/python/relay/test_pass_alter_op_layout.py b/tests/python/relay/test_pass_alter_op_layout.py index 41186884bdb20..aeaf1f89c388d 100644 --- a/tests/python/relay/test_pass_alter_op_layout.py +++ b/tests/python/relay/test_pass_alter_op_layout.py @@ -757,10 +757,10 @@ def expected(): mod_before = transform.InferType()(mod_before) mod_new = transform.InferType()(mod_new) with relay.build_config(opt_level=3): - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug", "vm"]: - ex_before = relay.create_executor(kind, mod=mod_before, ctx=ctx, target=target) - ex_new = relay.create_executor(kind, mod=mod_new, ctx=ctx, target=target) + ex_before = relay.create_executor(kind, mod=mod_before, device=dev, target=target) + ex_new = relay.create_executor(kind, mod=mod_new, device=dev, target=target) np_data = np.random.uniform(size=(1, 32, 28, 28)).astype("float32") np_weight = np.random.uniform(size=(32, 32, 3, 3)).astype("float32") result_before = ex_before.evaluate()(np_data, np_weight) diff --git a/tests/python/relay/test_pass_annotate_target.py b/tests/python/relay/test_pass_annotate_target.py index ce86cc603d6d1..52deb4e89ef97 100644 --- 
a/tests/python/relay/test_pass_annotate_target.py +++ b/tests/python/relay/test_pass_annotate_target.py @@ -29,7 +29,7 @@ def check_result( - mod, map_inputs, out_shape, result, tol=1e-5, target="llvm", ctx=tvm.cpu(), params=None + mod, map_inputs, out_shape, result, tol=1e-5, target="llvm", device=tvm.cpu(), params=None ): if sys.platform == "win32": print("Skip test on Windows for now") @@ -56,7 +56,7 @@ def check_vm_result(): code, lib = exe.save() lib = update_lib(lib) exe = runtime.vm.Executable.load_exec(code, lib) - vm = runtime.vm.VirtualMachine(exe, ctx) + vm = runtime.vm.VirtualMachine(exe, device) out = vm.run(**map_inputs) tvm.testing.assert_allclose(out.asnumpy(), result, rtol=tol, atol=tol) @@ -64,13 +64,13 @@ def check_graph_runtime_result(): with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]): json, lib, param = relay.build(mod, target=target, params=params) lib = update_lib(lib) - rt_mod = tvm.contrib.graph_runtime.create(json, lib, ctx) + rt_mod = tvm.contrib.graph_runtime.create(json, lib, device) for name, data in map_inputs.items(): rt_mod.set_input(name, data) rt_mod.set_input(**param) rt_mod.run() - out = tvm.nd.empty(out_shape, ctx=ctx) + out = tvm.nd.empty(out_shape, device=device) out = rt_mod.get_output(0, out) tvm.testing.assert_allclose(out.asnumpy(), result, rtol=tol, atol=tol) @@ -144,7 +144,7 @@ def test_run(): i_data = np.random.uniform(0, 1, ishape).astype(dtype) w1_data = np.random.uniform(0, 1, w1shape).astype(dtype) - ref_ex = relay.create_executor("graph", mod=ref_mod, ctx=tvm.cpu()) + ref_ex = relay.create_executor("graph", mod=ref_mod, device=tvm.cpu()) ref_res = ref_ex.evaluate()(i_data, w1_data) check_result( @@ -171,7 +171,7 @@ def test_extern_dnnl_mobilenet(): i_data = np.random.uniform(0, 1, ishape).astype(dtype) ref_mod, params = relay.testing.mobilenet.get_workload(batch_size=1, dtype="float32") - ref_ex = relay.create_executor("graph", mod=ref_mod, ctx=tvm.cpu(0)) + ref_ex = relay.create_executor("graph", mod=ref_mod, device=tvm.cpu(0)) ref_res = ref_ex.evaluate()(i_data, **params) check_result(mod, {"data": i_data}, (1, 1000), ref_res.asnumpy(), tol=1e-5, params=params) diff --git a/tests/python/relay/test_pass_annotation.py b/tests/python/relay/test_pass_annotation.py index ff68d489c7c54..70f1a0aa52e5e 100644 --- a/tests/python/relay/test_pass_annotation.py +++ b/tests/python/relay/test_pass_annotation.py @@ -36,7 +36,7 @@ def check_graph_runtime( ): with tvm.transform.PassContext(opt_level=opt_level, config=config): graph, lib, new_params = relay.build(func, target, params=params) - contexts = [tvm.cpu(0), tvm.context(device)] + contexts = [tvm.cpu(0), tvm.device(device)] graph_json = json.loads(graph) if "device_index" in graph_json["attrs"]: device_index = graph_json["attrs"]["device_index"][1] @@ -53,8 +53,8 @@ def check_vm_runtime(target, ref_res, device, func, params, config, opt_level, e mod = tvm.IRModule() mod["main"] = func exe = relay.vm.compile(mod, target) - ctx = [tvm.cpu(0), tvm.context(device)] - vm = tvm.runtime.vm.VirtualMachine(exe, ctx) + dev = [tvm.cpu(0), tvm.device(device)] + vm = tvm.runtime.vm.VirtualMachine(exe, dev) res = vm.invoke("main", **params) tvm.testing.assert_allclose(res.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) @@ -69,28 +69,28 @@ def run_opt_pass(expr, passes): def test_redundant_annotation(): - ctx1 = tvm.context(1) - ctx2 = tvm.context(2) + dev1 = tvm.device(1) + dev2 = tvm.device(2) x = relay.var("x", shape=(3,)) y = relay.var("y", shape=(3,)) z = relay.var("z", shape=(3,)) 
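The hunks around this point all apply one mechanical rename: tvm.context(...) becomes tvm.device(...), ctx= keyword arguments become device=, and ctx locals become dev. A minimal self-contained sketch of the new spelling, separate from the patch itself and assuming only the "llvm" target is available; every name in it is illustrative rather than taken from the patch:

    import numpy as np
    import tvm
    from tvm import relay

    # New spelling introduced by this patch series; the old spelling
    # (tvm.context, ctx=) is what the "-" lines above and below remove.
    dev = tvm.device("llvm", 0)

    # Build a trivial Relay function and run it on the chosen device.
    x = relay.var("x", shape=(3,))
    func = relay.Function([x], relay.add(x, x))
    ex = relay.create_executor("graph", device=dev, target="llvm")
    out = ex.evaluate(func)(np.ones((3,), dtype="float32"))

    # NDArray allocation follows the same rename: device= instead of ctx=.
    buf = tvm.nd.empty((3,), dtype="float32", device=dev)
    arr = tvm.nd.array(out.asnumpy(), dev)

The hunk continues below with the same rename applied to the on_device annotations and device_copy calls.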
def annotated(): add = relay.add(x, y) - _add1 = relay.annotation.on_device(add, ctx2) - _add2 = relay.annotation.on_device(add, ctx2) + _add1 = relay.annotation.on_device(add, dev2) + _add2 = relay.annotation.on_device(add, dev2) sub1 = relay.subtract(_add1, z) sub2 = relay.subtract(_add2, z) func = relay.Function([x, y, z], relay.Tuple([sub1, sub2])) - func = run_opt_pass(func, transform.RewriteAnnotatedOps(ctx1.device_type)) + func = run_opt_pass(func, transform.RewriteAnnotatedOps(dev1.device_type)) return func def expected(): add = relay.add(x, y) - copy_add_sub1 = relay.device_copy(add, ctx2, ctx1) + copy_add_sub1 = relay.device_copy(add, dev2, dev1) sub1 = relay.subtract(copy_add_sub1, z) - copy_add_sub2 = relay.device_copy(add, ctx2, ctx1) + copy_add_sub2 = relay.device_copy(add, dev2, dev1) sub2 = relay.subtract(copy_add_sub2, z) func = relay.Function([x, y, z], relay.Tuple([sub1, sub2])) return func @@ -101,23 +101,23 @@ def expected(): def test_annotate_expr(): - ctx1 = tvm.context(1) - ctx2 = tvm.context(2) + dev1 = tvm.device(1) + dev2 = tvm.device(2) x = relay.var("x", shape=(3,)) y = relay.var("y", shape=(3,)) z = relay.var("z", shape=(3,)) def annotated(): add = relay.add(x, y) - _add = relay.annotation.on_device(add, ctx1) + _add = relay.annotation.on_device(add, dev1) sub = relay.subtract(_add, z) - _sub = relay.annotation.on_device(sub, ctx2) - expr = run_opt_pass(_sub, transform.RewriteAnnotatedOps(ctx1.device_type)) + _sub = relay.annotation.on_device(sub, dev2) + expr = run_opt_pass(_sub, transform.RewriteAnnotatedOps(dev1.device_type)) return expr def expected(): add = relay.add(x, y) - copy_add_sub = relay.device_copy(add, ctx1, ctx2) + copy_add_sub = relay.device_copy(add, dev1, dev2) sub = relay.subtract(copy_add_sub, z) return sub @@ -127,20 +127,20 @@ def expected(): def test_annotate_all(): - ctx1 = tvm.context(1) - ctx2 = tvm.context(2) + dev1 = tvm.device(1) + dev2 = tvm.device(2) x = relay.var("x", shape=(3,)) y = relay.var("y", shape=(3,)) z = relay.var("z", shape=(3,)) def annotated(): add = relay.add(x, y) - _add = relay.annotation.on_device(add, ctx2) + _add = relay.annotation.on_device(add, dev2) sub = relay.subtract(_add, z) - _sub = relay.annotation.on_device(sub, ctx2) + _sub = relay.annotation.on_device(sub, dev2) func = relay.Function([x, y, z], _sub) - func = run_opt_pass(func, transform.RewriteAnnotatedOps(ctx1.device_type)) + func = run_opt_pass(func, transform.RewriteAnnotatedOps(dev1.device_type)) return func def expected(): @@ -155,8 +155,8 @@ def expected(): def test_annotate_none(): - ctx1 = tvm.context(1) - ctx2 = tvm.context(2) + dev1 = tvm.device(1) + dev2 = tvm.device(2) x = relay.var("x", shape=(3,)) y = relay.var("y", shape=(3,)) z = relay.var("z", shape=(3,)) @@ -165,7 +165,7 @@ def annotated(): add = relay.add(x, y) sub = relay.subtract(add, z) func = relay.Function([x, y, z], sub) - func = run_opt_pass(func, transform.RewriteAnnotatedOps(ctx1.device_type)) + func = run_opt_pass(func, transform.RewriteAnnotatedOps(dev1.device_type)) return func def expected(): @@ -200,8 +200,8 @@ def test_conv_network(): weight = relay.var("weight", shape=(64, 64, 3, 3)) data1 = relay.var("data1", shape=dshape) data2 = relay.var("data2", shape=dshape) - dev1 = tvm.context(1) - dev2 = tvm.context(2) + dev1 = tvm.device(1) + dev2 = tvm.device(2) def original(): conv2d_1 = relay.nn.conv2d(data1, weight, channels=64, kernel_size=(3, 3), padding=(1, 1)) @@ -210,7 +210,7 @@ def original(): conv2d_3 = relay.nn.conv2d(add, weight, channels=64, 
kernel_size=(3, 3), padding=(1, 1)) func = relay.Function([data1, data2, weight], conv2d_3) - func = run_opt_pass(func, transform.RewriteAnnotatedOps(tvm.context(3).device_type)) + func = run_opt_pass(func, transform.RewriteAnnotatedOps(tvm.device(3).device_type)) return func def annotated(): @@ -224,7 +224,7 @@ def annotated(): _conv2d_3 = relay.annotation.on_device(conv2d_3, dev2) func = relay.Function([data1, data2, weight], _conv2d_3) - func = run_opt_pass(func, transform.RewriteAnnotatedOps(tvm.context(3).device_type)) + func = run_opt_pass(func, transform.RewriteAnnotatedOps(tvm.device(3).device_type)) return func class ScheduleConv2d(ExprMutator): @@ -303,10 +303,10 @@ def test_propogation(): | tan 1 """ - ctx1 = tvm.context(1) - ctx2 = tvm.context(2) + dev1 = tvm.device(1) + dev2 = tvm.device(2) - expected_dev_type = {"log": ctx1, "log2": ctx2, "log10": ctx2, "add": ctx2, "tan": ctx1} + expected_dev_type = {"log": dev1, "log2": dev2, "log10": dev2, "add": dev2, "tan": dev1} x = relay.var("x", shape=(3,)) @@ -322,17 +322,17 @@ def annotated(): tan = relay.tan(_add) _tan = relay.annotation.on_device(tan, expected_dev_type["tan"]) - func = run_opt_pass(_tan, transform.RewriteAnnotatedOps(ctx1.device_type)) + func = run_opt_pass(_tan, transform.RewriteAnnotatedOps(dev1.device_type)) return func def expected(): log = relay.log(x) - _log_left = relay.device_copy(log, ctx1, ctx2) - _log_right = relay.device_copy(log, ctx1, ctx2) + _log_left = relay.device_copy(log, dev1, dev2) + _log_right = relay.device_copy(log, dev1, dev2) log2 = relay.log2(_log_left) log10 = relay.log10(_log_right) add = relay.add(log2, log10) - _add = relay.device_copy(add, ctx2, ctx1) + _add = relay.device_copy(add, dev2, dev1) tan = relay.tan(_add) func = run_opt_pass(tan, transform.InferType()) @@ -344,9 +344,9 @@ def expected(): smap = relay.backend._backend.GraphPlanMemory(annotated_expr) for expr, storage_dev_type in smap.items(): - # x is ctx1 as output is ctx1 + # x is dev1 as output is dev1 if isinstance(expr, tvm.relay.expr.Var): - assert storage_dev_type[1][0] == ctx1.device_type + assert storage_dev_type[1][0] == dev1.device_type else: # device_copy op should be its dst_dev_type if isinstance(expr.attrs, tvm.relay.op.op_attrs.DeviceCopyAttrs): @@ -390,32 +390,32 @@ def get_func(): def test_fuse_log_add(device, tgt): """ Only log and add are fused.""" - fallback_device = tvm.context("cpu") + fallback_device = tvm.device("cpu") target = {"cpu": "llvm", device: tgt} - cpu_ctx = fallback_device - dev_ctx = tvm.context(device) + cpu_dev = fallback_device + dev_dev = tvm.device(device) def annotated(): add = relay.add(x, y) sqrt = relay.sqrt(add) - _sqrt = relay.annotation.on_device(sqrt, dev_ctx) + _sqrt = relay.annotation.on_device(sqrt, dev_dev) log = relay.log(add) subtract = relay.subtract(_sqrt, log) exp = relay.exp(subtract) - _exp = relay.annotation.on_device(exp, dev_ctx) + _exp = relay.annotation.on_device(exp, dev_dev) func = relay.Function([x, y], _exp) - func = run_opt_pass(func, transform.RewriteAnnotatedOps(cpu_ctx.device_type)) + func = run_opt_pass(func, transform.RewriteAnnotatedOps(cpu_dev.device_type)) return func def expected(): add = relay.add(x, y) - copy_add_sqrt = relay.device_copy(add, cpu_ctx, dev_ctx) + copy_add_sqrt = relay.device_copy(add, cpu_dev, dev_dev) sqrt = relay.sqrt(copy_add_sqrt) log = relay.log(add) - copy_sqrt_subtract = relay.device_copy(sqrt, dev_ctx, cpu_ctx) + copy_sqrt_subtract = relay.device_copy(sqrt, dev_dev, cpu_dev) subtract = 
relay.subtract(copy_sqrt_subtract, log) - copy_sub_exp = relay.device_copy(subtract, cpu_ctx, dev_ctx) + copy_sub_exp = relay.device_copy(subtract, cpu_dev, dev_dev) exp = relay.exp(copy_sub_exp) func = relay.Function([x, y], exp) @@ -423,8 +423,8 @@ def expected(): annotated_func = annotated() expected_func = expected() - ctx = tvm.context(device, 0) - dev_idx = ctx.device_type + dev = tvm.device(device, 0) + dev_idx = dev.device_type expected_index = [1, 1, 1, dev_idx, dev_idx, 1, 1, dev_idx, dev_idx] check_annotated_graph(annotated_func, expected_func) opt_level = 1 @@ -439,25 +439,25 @@ def expected(): def test_fuse_all(device, tgt): """Fuse all operators.""" - fallback_device = tvm.context("cpu") + fallback_device = tvm.device("cpu") target = {"cpu": "llvm", device: tgt} - cpu_ctx = fallback_device - dev_ctx = tvm.context(device) + cpu_dev = fallback_device + dev_dev = tvm.device(device) def annotated(): add = relay.add(x, y) - _add = relay.annotation.on_device(add, dev_ctx) + _add = relay.annotation.on_device(add, dev_dev) sqrt = relay.sqrt(_add) - _sqrt = relay.annotation.on_device(sqrt, dev_ctx) + _sqrt = relay.annotation.on_device(sqrt, dev_dev) log = relay.log(_add) - _log = relay.annotation.on_device(log, dev_ctx) + _log = relay.annotation.on_device(log, dev_dev) subtract = relay.subtract(_sqrt, _log) - _subtract = relay.annotation.on_device(subtract, dev_ctx) + _subtract = relay.annotation.on_device(subtract, dev_dev) exp = relay.exp(_subtract) - _exp = relay.annotation.on_device(exp, dev_ctx) + _exp = relay.annotation.on_device(exp, dev_dev) func = relay.Function([x, y], _exp) - func = run_opt_pass(func, transform.RewriteAnnotatedOps(cpu_ctx.device_type)) + func = run_opt_pass(func, transform.RewriteAnnotatedOps(cpu_dev.device_type)) return func annotated_func = annotated() @@ -470,10 +470,10 @@ def annotated(): check_vm_runtime(target, ref_res, device, annotated_func, params, config, opt_level) def test_fallback_exp(device, tgt): - fallback_device = tvm.context("cpu") + fallback_device = tvm.device("cpu") target = {"cpu": "llvm", device: tgt} - cpu_ctx = fallback_device - dev_ctx = tvm.context(device) + cpu_dev = fallback_device + dev_dev = tvm.device(device) def annotated(): add = relay.add(x, y) @@ -481,10 +481,10 @@ def annotated(): log = relay.log(add) subtract = relay.subtract(sqrt, log) exp = relay.exp(subtract) - _exp = relay.annotation.on_device(exp, cpu_ctx) + _exp = relay.annotation.on_device(exp, cpu_dev) func = relay.Function([x, y], _exp) - func = run_opt_pass(func, transform.RewriteAnnotatedOps(dev_ctx.device_type)) + func = run_opt_pass(func, transform.RewriteAnnotatedOps(dev_dev.device_type)) return func def expected(): @@ -492,7 +492,7 @@ def expected(): sqrt = relay.sqrt(add) log = relay.log(add) subtract = relay.subtract(sqrt, log) - copy_sub_exp = relay.device_copy(subtract, dev_ctx, cpu_ctx) + copy_sub_exp = relay.device_copy(subtract, dev_dev, cpu_dev) exp = relay.exp(copy_sub_exp) func = relay.Function([x, y], exp) @@ -500,8 +500,8 @@ def expected(): annotated_func = annotated() expected_func = expected() - ctx = tvm.context(device, 0) - dev_idx = ctx.device_type + dev = tvm.device(device, 0) + dev_idx = dev.device_type expected_index = [dev_idx, dev_idx, dev_idx, 1, 1] opt_level = 1 config = {"relay.fallback_device_type": fallback_device.device_type} @@ -550,26 +550,26 @@ def run_unpropagatable_graph(dev, tgt): tmp_mul = np.multiply(c_data, d_data) ref_res = np.subtract(tmp_add, tmp_mul) - fallback_device = tvm.context("cpu") + fallback_device = 
tvm.device("cpu") target = {"cpu": "llvm", dev: tgt} - cpu_ctx = fallback_device - dev_ctx = tvm.context(dev) + cpu_dev = fallback_device + dev_dev = tvm.device(dev) def annotated(): add = relay.add(a, b) - _add = relay.annotation.on_device(add, dev_ctx) + _add = relay.annotation.on_device(add, dev_dev) mul = relay.multiply(c, d) - _mul = relay.annotation.on_device(mul, cpu_ctx) + _mul = relay.annotation.on_device(mul, cpu_dev) sub = relay.subtract(_add, _mul) - _sub = relay.annotation.on_device(sub, dev_ctx) + _sub = relay.annotation.on_device(sub, dev_dev) func = relay.Function([a, b, c, d], _sub) - func = run_opt_pass(func, transform.RewriteAnnotatedOps(dev_ctx.device_type)) + func = run_opt_pass(func, transform.RewriteAnnotatedOps(dev_dev.device_type)) return func def expected(): add = relay.add(a, b) mul = relay.multiply(c, d) - copy_mul_sub = relay.device_copy(mul, cpu_ctx, dev_ctx) + copy_mul_sub = relay.device_copy(mul, cpu_dev, dev_dev) sub = relay.subtract(add, copy_mul_sub) func = relay.Function([a, b, c, d], sub) return func @@ -617,14 +617,14 @@ def test_check_run_cuda(): @tvm.testing.requires_cuda def test_tuple_get_item(): dev = "cuda" - cpu_ctx = tvm.cpu(0) - gpu_ctx = tvm.context(dev) + cpu_dev = tvm.cpu(0) + gpu_dev = tvm.device(dev) def expected(): x = relay.var("x", relay.ty.TensorType((3, 3, 4), "float32")) split = relay.op.split(x, 3) - elem0 = relay.device_copy(split[0], gpu_ctx, cpu_ctx) - elem1 = relay.device_copy(split[1], gpu_ctx, cpu_ctx) + elem0 = relay.device_copy(split[0], gpu_dev, cpu_dev) + elem1 = relay.device_copy(split[1], gpu_dev, cpu_dev) sub = elem0 - elem1 func = relay.Function(relay.analysis.free_vars(sub), sub) return func @@ -633,11 +633,11 @@ def annotated(): x = relay.var("x", relay.ty.TensorType((3, 3, 4), "float32")) split = relay.op.split(x, 3) split = split.astuple() - split = relay.annotation.on_device(split, gpu_ctx) + split = relay.annotation.on_device(split, gpu_dev) split = relay.TupleWrapper(split, 3) sub = split[0] - split[1] func = relay.Function(relay.analysis.free_vars(sub), sub) - func = run_opt_pass(func, transform.RewriteAnnotatedOps(cpu_ctx.device_type)) + func = run_opt_pass(func, transform.RewriteAnnotatedOps(cpu_dev.device_type)) return func annotated_func = annotated() diff --git a/tests/python/relay/test_pass_auto_quantize.py b/tests/python/relay/test_pass_auto_quantize.py index 31f5ac6e71b1d..326416f3c501b 100644 --- a/tests/python/relay/test_pass_auto_quantize.py +++ b/tests/python/relay/test_pass_auto_quantize.py @@ -126,7 +126,7 @@ def gen_rand_tvm(tt, low, high): data_np = np.random.uniform(low, high, size=get_const_tuple(tt.shape)).astype(tt.dtype) else: assert False, "unknown dtype" - return tvm.nd.array(data_np, ctx=tvm.cpu(0)) + return tvm.nd.array(data_np, device=tvm.cpu(0)) def verify_partition_fails(mod, params): @@ -155,7 +155,7 @@ def verify_partition(mod, params): params = [gen_rand_tvm(param.type_annotation, 0, 1) for param in partitioned_mod["main"].params] def _eval_mod(mod): - vm = relay.create_executor("vm", ctx=tvm.cpu(0), target="llvm", mod=mod) + vm = relay.create_executor("vm", device=tvm.cpu(0), target="llvm", mod=mod) return vm.evaluate()(*params) partitioned_mod_result = _eval_mod(partitioned_mod) diff --git a/tests/python/relay/test_pass_dynamic_to_static.py b/tests/python/relay/test_pass_dynamic_to_static.py index c9e047a38540e..b9d3a8ef357c0 100644 --- a/tests/python/relay/test_pass_dynamic_to_static.py +++ b/tests/python/relay/test_pass_dynamic_to_static.py @@ -37,10 +37,10 @@ def 
run_opt_pass(expr, opt_pass): def verify_func(func, data, ref_res, rtol=1e-5, atol=1e-7): assert isinstance(data, list) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "vm", "debug"]: mod = tvm.ir.IRModule.from_expr(func) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(*data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=rtol, atol=atol) @@ -176,12 +176,12 @@ def verify_topk(k, axis, ret_type, is_ascend, dtype): assert isinstance(zz, relay.Call) assert zz.op == relay.op.get("topk") - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): if "llvm" not in target: continue for kind in ["graph", "vm", "debug"]: mod = tvm.ir.IRModule.from_expr(func2) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(np_data) if ret_type == "both": tvm.testing.assert_allclose(op_res[0].asnumpy(), np_values) diff --git a/tests/python/relay/test_pass_fold_explicit_padding.py b/tests/python/relay/test_pass_fold_explicit_padding.py index 302a2b91bb8fb..a3f82dd6d2709 100644 --- a/tests/python/relay/test_pass_fold_explicit_padding.py +++ b/tests/python/relay/test_pass_fold_explicit_padding.py @@ -70,8 +70,8 @@ def validate(ndim, pad_width, pad_value, pad_mode, orig_padding, layout): mod2 = tvm.IRModule.from_expr(zz) with tvm.transform.PassContext(): - ex1 = relay.create_executor("vm", mod=mod1, ctx=tvm.cpu(), target="llvm") - ex2 = relay.create_executor("vm", mod=mod2, ctx=tvm.cpu(), target="llvm") + ex1 = relay.create_executor("vm", mod=mod1, device=tvm.cpu(), target="llvm") + ex2 = relay.create_executor("vm", mod=mod2, device=tvm.cpu(), target="llvm") x_np = np.random.rand(*shape).astype("float32") w_np = np.random.rand(*wshape).astype("float32") result1 = ex1.evaluate()(x_np, w_np) diff --git a/tests/python/relay/test_pass_fuse_ops.py b/tests/python/relay/test_pass_fuse_ops.py index 30ee29525daa7..9b7471f9a5edf 100644 --- a/tests/python/relay/test_pass_fuse_ops.py +++ b/tests/python/relay/test_pass_fuse_ops.py @@ -712,7 +712,7 @@ def expected(): orig = before() m = fuse2(tvm.IRModule.from_expr(orig)) - for tgt, ctx in tvm.testing.enabled_targets(): + for tgt, dev in tvm.testing.enabled_targets(): relay.build(m, tgt) after = run_opt_pass(expected(), transform.InferType()) assert tvm.ir.structural_equal(m["main"], after) @@ -775,7 +775,7 @@ def test_fuse_dynamic_squeeze_slice_take(): take = relay.op.take(strided_slice, take_val, axis=0) mod = tvm.IRModule.from_expr(take) - ex = relay.create_executor("vm", mod=mod, ctx=tvm.cpu(), target="llvm") + ex = relay.create_executor("vm", mod=mod, device=tvm.cpu(), target="llvm") result = ex.evaluate()(*input_data) diff --git a/tests/python/relay/test_pass_manager.py b/tests/python/relay/test_pass_manager.py index 7e2282809f765..5a29d1acd171c 100644 --- a/tests/python/relay/test_pass_manager.py +++ b/tests/python/relay/test_pass_manager.py @@ -178,9 +178,9 @@ def test_pass_run(): x_nd = get_rand(shape, dtype) y_nd = get_rand(shape, dtype) ref_res = x_nd.asnumpy() + y_nd.asnumpy() - for target, ctx in tvm.testing.enabled_targets(): - exe1 = relay.create_executor("graph", ctx=ctx, target=target) - exe2 = relay.create_executor("debug", ctx=ctx, target=target) + for target, dev in 
tvm.testing.enabled_targets(): + exe1 = relay.create_executor("graph", device=dev, target=target) + exe2 = relay.create_executor("debug", device=dev, target=target) res1 = exe1.evaluate(new_add)(x_nd, y_nd) tvm.testing.assert_allclose(res1.asnumpy(), ref_res, rtol=1e-5) res2 = exe2.evaluate(new_add)(x_nd, y_nd) @@ -275,9 +275,9 @@ def test_pass_run(): # Execute the add function. x_nd = get_rand(shape, dtype) ref_res = np.log(x_nd.asnumpy() * 2) - for target, ctx in tvm.testing.enabled_targets(): - exe1 = relay.create_executor("graph", ctx=ctx, target=target) - exe2 = relay.create_executor("debug", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + exe1 = relay.create_executor("graph", device=dev, target=target) + exe2 = relay.create_executor("debug", device=dev, target=target) res1 = exe1.evaluate(new_log)(x_nd) tvm.testing.assert_allclose(res1.asnumpy(), ref_res, rtol=1e-5) res2 = exe2.evaluate(new_log)(x_nd) @@ -437,9 +437,9 @@ def test_multiple_passes(): x_nd = get_rand(shape, dtype) y_nd = get_rand(shape, dtype) ref_res = np.subtract(x_nd.asnumpy() * 2, y_nd.asnumpy() * 2) - for target, ctx in tvm.testing.enabled_targets(): - exe1 = relay.create_executor("graph", ctx=ctx, target=target) - exe2 = relay.create_executor("debug", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + exe1 = relay.create_executor("graph", device=dev, target=target) + exe2 = relay.create_executor("debug", device=dev, target=target) res1 = exe1.evaluate(new_sub)(x_nd, y_nd) tvm.testing.assert_allclose(res1.asnumpy(), ref_res, rtol=1e-5) res2 = exe2.evaluate(new_sub)(x_nd, y_nd) @@ -448,9 +448,9 @@ def test_multiple_passes(): # Execute the updated abs function. x_nd = get_rand((5, 10), dtype) ref_res = np.abs(x_nd.asnumpy() * 2) - for target, ctx in tvm.testing.enabled_targets(): - exe1 = relay.create_executor("graph", ctx=ctx, target=target) - exe2 = relay.create_executor("debug", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + exe1 = relay.create_executor("graph", device=dev, target=target) + exe2 = relay.create_executor("debug", device=dev, target=target) res1 = exe1.evaluate(new_abs)(x_nd) tvm.testing.assert_allclose(res1.asnumpy(), ref_res, rtol=1e-5) res2 = exe2.evaluate(new_abs)(x_nd) diff --git a/tests/python/relay/test_pass_partial_eval.py b/tests/python/relay/test_pass_partial_eval.py index 45749c31f38fa..57286670f06d2 100644 --- a/tests/python/relay/test_pass_partial_eval.py +++ b/tests/python/relay/test_pass_partial_eval.py @@ -30,8 +30,8 @@ def check_eval(expr, expected_result, mod=None, rtol=1e-07): - ctx = tvm.context("llvm", 0) - intrp = create_executor(mod=mod, ctx=ctx, target="llvm") + dev = tvm.device("llvm", 0) + intrp = create_executor(mod=mod, device=dev, target="llvm") result = intrp.evaluate(expr) np.testing.assert_allclose(result.asnumpy(), expected_result, rtol=rtol) diff --git a/tests/python/relay/test_pass_partition_graph.py b/tests/python/relay/test_pass_partition_graph.py index d8f674eeff344..a5bc3ff2cead6 100644 --- a/tests/python/relay/test_pass_partition_graph.py +++ b/tests/python/relay/test_pass_partition_graph.py @@ -43,7 +43,7 @@ def __init__(self, op_list, compiler): self.op_list = op_list self.compiler = compiler - def transform_function(self, func, mod, ctx): + def transform_function(self, func, mod, dev): annotator = self @@ -173,7 +173,7 @@ def visit_call(self, call): def check_result( - mod, map_inputs, out_shape, result, tol=1e-5, target="llvm", ctx=tvm.cpu(), params=None + mod, 
map_inputs, out_shape, result, tol=1e-5, target="llvm", device=tvm.cpu(), params=None ): if sys.platform == "win32": print("Skip test on Windows for now") @@ -201,7 +201,7 @@ def check_vm_result(): code, lib = exe.save() lib = update_lib(lib) exe = runtime.vm.Executable.load_exec(code, lib) - vm = runtime.vm.VirtualMachine(exe, ctx) + vm = runtime.vm.VirtualMachine(exe, device) outs = vm.run(**map_inputs) outs = outs if isinstance(outs, runtime.container.ADT) else [outs] results = result if isinstance(result, list) else [result] @@ -213,7 +213,7 @@ def check_graph_runtime_result(): with tvm.transform.PassContext(opt_level=3): json, lib, param = relay.build(mod, target=target, params=params) lib = update_lib(lib) - rt_mod = tvm.contrib.graph_runtime.create(json, lib, ctx) + rt_mod = tvm.contrib.graph_runtime.create(json, lib, device) for name, data in map_inputs.items(): rt_mod.set_input(name, data) @@ -224,7 +224,7 @@ def check_graph_runtime_result(): results = result if isinstance(result, list) else [result] for idx, shape in enumerate(out_shapes): - out = tvm.nd.empty(shape, ctx=ctx) + out = tvm.nd.empty(shape, device=device) out = rt_mod.get_output(idx, out) tvm.testing.assert_allclose(out.asnumpy(), results[idx], rtol=tol, atol=tol) @@ -295,7 +295,7 @@ def test_multi_node_compiler(): def test_extern_ccompiler_single_op(): @transform.function_pass(opt_level=0) class MyAnnotator: - def transform_function(self, func, mod, ctx): + def transform_function(self, func, mod, dev): class Annotator(tvm.relay.ExprMutator): def visit_call(self, call): new_args = [] @@ -456,7 +456,7 @@ def get_func(): i_data = np.random.uniform(0, 1, ishape).astype(dtype) w1_data = np.random.uniform(0, 1, w1shape).astype(dtype) - ref_ex = relay.create_executor("graph", mod=ref_mod, ctx=tvm.cpu()) + ref_ex = relay.create_executor("graph", mod=ref_mod, device=tvm.cpu()) ref_res = ref_ex.evaluate()(i_data, w1_data) check_result( mod, {"data": i_data, "weight1": w1_data}, (1, 32, 14, 14), ref_res.asnumpy(), tol=1e-5 @@ -476,7 +476,7 @@ def test_extern_dnnl_mobilenet(): mod = transform.PartitionGraph()(mod) i_data = np.random.uniform(0, 1, ishape).astype(dtype) - ref_ex = relay.create_executor("graph", mod=ref_mod, ctx=tvm.cpu(0)) + ref_ex = relay.create_executor("graph", mod=ref_mod, device=tvm.cpu(0)) ref_res = ref_ex.evaluate()(i_data, **params) compile_engine.get().clear() @@ -917,7 +917,7 @@ def test_partition_mobilenet(): def test_exec(mod, params, ref_mod, ref_params, out_shape): ishape = (1, 3, 224, 224) i_data = np.random.randn(*ishape).astype(np.float32) - ref_ex = relay.create_executor("graph", mod=ref_mod, ctx=tvm.cpu(0)) + ref_ex = relay.create_executor("graph", mod=ref_mod, device=tvm.cpu(0)) ref_res = ref_ex.evaluate()(i_data, **ref_params) compile_engine.get().clear() diff --git a/tests/python/relay/test_pass_to_a_normal_form.py b/tests/python/relay/test_pass_to_a_normal_form.py index 72325e537c0ed..e7aee5fae00be 100644 --- a/tests/python/relay/test_pass_to_a_normal_form.py +++ b/tests/python/relay/test_pass_to_a_normal_form.py @@ -36,8 +36,8 @@ def run_opt_pass(expr, passes): def check_eval(expr, expected_result, mod=None, rtol=1e-07): - ctx = tvm.context("llvm", 0) - intrp = create_executor(mod=mod, ctx=ctx, target="llvm") + dev = tvm.device("llvm", 0) + intrp = create_executor(mod=mod, device=dev, target="llvm") result = intrp.evaluate(expr) np.testing.assert_allclose(result.asnumpy(), expected_result, rtol=rtol) @@ -149,8 +149,8 @@ def test_nat_add(): p.mod.import_from_std("nat.rly") nat, z, s = 
p.mod.get_type("nat") add = p.mod.get_global_var("nat_add") - ctx = tvm.context("llvm", 0) - intrp = create_executor(mod=mod, ctx=ctx, target="llvm") + dev = tvm.device("llvm", 0) + intrp = create_executor(mod=mod, device=dev, target="llvm") assert mod[add].checked_type == relay.FuncType([nat(), nat()], nat()) assert count(p, intrp.evaluate(add(s(z()), s(z())))) == 2 expr = add(s(z()), s(z())) diff --git a/tests/python/relay/test_pass_to_basic_block_normal_form.py b/tests/python/relay/test_pass_to_basic_block_normal_form.py index a52d51ad49606..2085c8a2799c9 100644 --- a/tests/python/relay/test_pass_to_basic_block_normal_form.py +++ b/tests/python/relay/test_pass_to_basic_block_normal_form.py @@ -38,8 +38,8 @@ def run_opt_pass(expr, passes): def check_eval(expr, expected_result, mod=None, rtol=1e-07): - ctx = tvm.context("llvm", 0) - intrp = create_executor(mod=mod, ctx=ctx, target="llvm") + dev = tvm.device("llvm", 0) + intrp = create_executor(mod=mod, device=dev, target="llvm") result = intrp.evaluate(expr) np.testing.assert_allclose(result.asnumpy(), expected_result, rtol=rtol) @@ -266,8 +266,8 @@ def test_nat_add(): p.mod.import_from_std("nat.rly") nat, z, s = p.mod.get_type("nat") add = p.mod.get_global_var("nat_add") - ctx = tvm.context("llvm", 0) - intrp = create_executor(mod=mod, ctx=ctx, target="llvm") + dev = tvm.device("llvm", 0) + intrp = create_executor(mod=mod, device=dev, target="llvm") assert mod[add].checked_type == relay.FuncType([nat(), nat()], nat()) assert count(p, intrp.evaluate(add(s(z()), s(z())))) == 2 expr = add(s(z()), s(z())) diff --git a/tests/python/relay/test_pass_to_graph_normal_form.py b/tests/python/relay/test_pass_to_graph_normal_form.py index 88d6829bd16ec..f4c1a32b2566f 100644 --- a/tests/python/relay/test_pass_to_graph_normal_form.py +++ b/tests/python/relay/test_pass_to_graph_normal_form.py @@ -33,8 +33,8 @@ def check_eval(expr, args, expected_result, mod=None, rtol=1e-07): if mod is None: mod = tvm.IRModule() - ctx = tvm.context("llvm", 0) - intrp = create_executor(mod=mod, ctx=ctx, target="llvm") + dev = tvm.device("llvm", 0) + intrp = create_executor(mod=mod, device=dev, target="llvm") result = intrp.evaluate(expr)(*args) np.testing.assert_allclose(result.asnumpy(), expected_result, rtol=rtol) diff --git a/tests/python/relay/test_prng.py b/tests/python/relay/test_prng.py index 2109d3b30a82e..ba4fdc466ecc7 100644 --- a/tests/python/relay/test_prng.py +++ b/tests/python/relay/test_prng.py @@ -22,18 +22,17 @@ @tvm.testing.parametrize_targets -def test_threefry_repeatability(target, ctx): - target, ctx = "llvm", tvm.cpu(0) +def test_threefry_repeatability(target, dev): key1 = tvm.relay.random.threefry_key(1) rand1 = tvm.relay.random.threefry_generate(key1, (12,)) out_key1, out1 = tvm.relay.create_executor( - "vm", tvm.IRModule.from_expr(tvm.relay.Function([], rand1)), target=target, ctx=ctx + "vm", tvm.IRModule.from_expr(tvm.relay.Function([], rand1)), target=target, device=dev ).evaluate()() key2 = tvm.relay.random.threefry_key(1) rand2 = tvm.relay.random.threefry_generate(key2, (12,)) out_key2, out2 = tvm.relay.create_executor( - "vm", tvm.IRModule.from_expr(tvm.relay.Function([], rand2)), target=target, ctx=ctx + "vm", tvm.IRModule.from_expr(tvm.relay.Function([], rand2)), target=target, device=dev ).evaluate()() assert ( @@ -46,7 +45,7 @@ def test_threefry_repeatability(target, ctx): @tvm.testing.parametrize_targets -def test_threefry_split(target, ctx): +def test_threefry_split(target, dev): key = tvm.relay.random.threefry_key(1) left, right = 
tvm.relay.TupleWrapper(tvm.relay.random.threefry_split(key), 2) _, rand1 = tvm.relay.TupleWrapper(tvm.relay.random.threefry_generate(left, (16,)), 2) @@ -55,7 +54,7 @@ def test_threefry_split(target, ctx): "vm", tvm.IRModule.from_expr(tvm.relay.Function([], tvm.relay.Tuple((rand1, rand2)))), target=target, - ctx=ctx, + device=dev, ).evaluate()() assert ( @@ -64,7 +63,7 @@ def test_threefry_split(target, ctx): @tvm.testing.parametrize_targets -def test_threefry_sequential_generate(target, ctx): +def test_threefry_sequential_generate(target, dev): key = tvm.relay.random.threefry_key(1) key, rand1 = tvm.relay.TupleWrapper(tvm.relay.random.threefry_generate(key, (4,)), 2) _, rand2 = tvm.relay.TupleWrapper(tvm.relay.random.threefry_generate(key, (4,)), 2) @@ -72,7 +71,7 @@ def test_threefry_sequential_generate(target, ctx): "vm", tvm.IRModule.from_expr(tvm.relay.Function([], tvm.relay.Tuple((rand1, rand2)))), target=target, - ctx=ctx, + device=dev, ).evaluate()() assert ( @@ -132,11 +131,11 @@ def test_threefry_generate_incorrect_out_size(): "vm", tvm.IRModule.from_expr(tvm.relay.Function([], rand1)), target=tvm.target.Target("llvm"), - ctx=tvm.context("cpu"), + device=tvm.device("cpu"), ).evaluate()() if __name__ == "__main__": - test_threefry_repeatability(tvm.target.Target("llvm"), tvm.context("cpu")) - test_threefry_split(tvm.target.Target("llvm"), tvm.context("cpu")) - test_threefry_sequential_generate(tvm.target.Target("llvm"), tvm.context("cpu")) + test_threefry_repeatability(tvm.target.Target("llvm"), tvm.device("cpu")) + test_threefry_split(tvm.target.Target("llvm"), tvm.device("cpu")) + test_threefry_sequential_generate(tvm.target.Target("llvm"), tvm.device("cpu")) diff --git a/tests/python/relay/test_simplify_fc_transpose.py b/tests/python/relay/test_simplify_fc_transpose.py index d5d195d3ff1d6..ce93a68c73216 100644 --- a/tests/python/relay/test_simplify_fc_transpose.py +++ b/tests/python/relay/test_simplify_fc_transpose.py @@ -33,9 +33,9 @@ def run_func(func, params, x): from tvm.contrib import graph_runtime - ctx = tvm.cpu(0) + dev = tvm.cpu(0) dtype = "float32" - m = graph_runtime.GraphModule(lib["default"](ctx)) + m = graph_runtime.GraphModule(lib["default"](dev)) # set inputs m.set_input("data", tvm.nd.array(x.astype(dtype))) # execute diff --git a/tests/python/relay/test_sparse_dense_convert.py b/tests/python/relay/test_sparse_dense_convert.py index e3644e9704fc4..4eaaf769ff11a 100644 --- a/tests/python/relay/test_sparse_dense_convert.py +++ b/tests/python/relay/test_sparse_dense_convert.py @@ -54,9 +54,9 @@ def run_func(func, params, x): from tvm.contrib import graph_runtime - ctx = tvm.cpu(0) + dev = tvm.cpu(0) dtype = "float32" - m = graph_runtime.create(graph, lib, ctx) + m = graph_runtime.create(graph, lib, dev) # set inputs m.set_input("data", tvm.nd.array(x.astype(dtype))) m.set_input(**new_params) diff --git a/tests/python/relay/test_tensor_array.py b/tests/python/relay/test_tensor_array.py index 76e9d4a6d8a04..3b950b45f1e20 100644 --- a/tests/python/relay/test_tensor_array.py +++ b/tests/python/relay/test_tensor_array.py @@ -60,11 +60,10 @@ def vmobj_to_list(mod, o, dtype="float32"): def check_tensor_array(ta_mod, ref_res, *args, dtype="float32", rtol=1e-5): for kind in ["debug", "vm"]: - for target, ctx in [("llvm", tvm.cpu(0))]: # testing.enabled_targets(): - # for target, ctx in testing.enabled_targets(): - if kind == "debug" and ctx.device_type != tvm.cpu().device_type: + for target, dev in [("llvm", tvm.cpu(0))]: # testing.enabled_targets(): + if kind == "debug" 
and dev.device_type != tvm.cpu().device_type: continue - ex = relay.create_executor(kind, mod=ta_mod, ctx=ctx, target=target) + ex = relay.create_executor(kind, mod=ta_mod, device=dev, target=target) result = ex.evaluate()(*args) got = vmobj_to_list(ta_mod, result, dtype) tvm.testing.assert_allclose(ref_res, got, rtol=rtol, atol=rtol) diff --git a/tests/python/relay/test_vm.py b/tests/python/relay/test_vm.py index 975070ad1aaab..4ecd0d9189ea5 100644 --- a/tests/python/relay/test_vm.py +++ b/tests/python/relay/test_vm.py @@ -40,13 +40,13 @@ def check_result(args, expected_result, mod=None): expected_result: The expected result of running the expression. """ - for target, ctx in tvm.testing.enabled_targets(): - vm = relay.create_executor("vm", ctx=ctx, target=target, mod=mod) + for target, dev in tvm.testing.enabled_targets(): + vm = relay.create_executor("vm", device=dev, target=target, mod=mod) rts_result = vm.evaluate()(*args) tvm.testing.assert_allclose(expected_result, rts_result.asnumpy()) -def veval(f, *args, ctx=tvm.cpu(), target="llvm"): +def veval(f, *args, device=tvm.cpu(), target="llvm"): if isinstance(f, relay.Expr): mod = tvm.IRModule() mod["main"] = f @@ -54,7 +54,7 @@ def veval(f, *args, ctx=tvm.cpu(), target="llvm"): assert isinstance(f, tvm.IRModule), "expected expression or module" mod = f exe = relay.vm.compile(mod, target) - vm = runtime.vm.VirtualMachine(exe, ctx) + vm = runtime.vm.VirtualMachine(exe, device) return vm.invoke("main", *args) @@ -80,8 +80,8 @@ def test_split(): 12, ).astype("float32") ref_res = np.split(x_data, 3, axis=0) - for tgt, ctx in tvm.testing.enabled_targets(): - res = veval(f, x_data, ctx=ctx, target=tgt) + for tgt, dev in tvm.testing.enabled_targets(): + res = veval(f, x_data, device=dev, target=tgt) for i in range(3): tvm.testing.assert_allclose(res[i].asnumpy(), ref_res[i]) @@ -96,8 +96,8 @@ def test_split_no_fuse(): x_data = np.random.rand( 12, ).astype("float32") - for tgt, ctx in tvm.testing.enabled_targets(): - res = veval(f, x_data, ctx=ctx, target=tgt) + for tgt, dev in tvm.testing.enabled_targets(): + res = veval(f, x_data, device=dev, target=tgt) tvm.testing.assert_allclose(res.asnumpy(), np.split(x_data, 3, axis=0)[0]) @@ -176,8 +176,8 @@ def test_multiple_ifs(): out = relay.Let(v0, relay.Tuple([relay.const(0)]), out) fn = relay.Function([b], out) mod["main"] = fn - ctx = tvm.runtime.ndarray.context("llvm", 0) - vm = relay.create_executor(ctx=ctx, mod=mod, kind="vm") + dev = tvm.runtime.device("llvm", 0) + vm = relay.create_executor(device=dev, mod=mod, kind="vm") res = vmobj_to_list(vm.evaluate()(False)) assert res == [1, 0] @@ -214,8 +214,8 @@ def test_count_loop(): i_data = np.array(0, dtype="int32") iarg = relay.var("i", shape=[], dtype="int32") mod["main"] = relay.Function([iarg], sum_up(iarg)) - for tgt, ctx in tvm.testing.enabled_targets(): - result = veval(mod, i_data, ctx=ctx, target=tgt) + for tgt, dev in tvm.testing.enabled_targets(): + result = veval(mod, i_data, device=dev, target=tgt) tvm.testing.assert_allclose(result.asnumpy(), i_data) check_result([i_data], i_data, mod=mod) @@ -283,8 +283,8 @@ def test_list_constructor(): mod["main"] = f - for tgt, ctx in tvm.testing.enabled_targets(): - result = veval(mod, ctx=ctx, target=tgt) + for tgt, dev in tvm.testing.enabled_targets(): + result = veval(mod, device=dev, target=tgt) assert len(result) == 2 assert len(result[1]) == 2 @@ -361,8 +361,8 @@ def test_compose(): mod["main"] = f x_data = np.array(np.random.rand()).astype("float32") - for tgt, ctx in 
tvm.testing.enabled_targets(): - result = veval(mod, [x_data], ctx=ctx, target=tgt) + for tgt, dev in tvm.testing.enabled_targets(): + result = veval(mod, [x_data], device=dev, target=tgt) tvm.testing.assert_allclose(result.asnumpy(), x_data + 2.0) @@ -382,8 +382,8 @@ def test_list_hd(): mod["main"] = f - for tgt, ctx in tvm.testing.enabled_targets(): - result = veval(mod, ctx=ctx, target=tgt) + for tgt, dev in tvm.testing.enabled_targets(): + result = veval(mod, device=dev, target=tgt) tvm.testing.assert_allclose(result.asnumpy(), 3) @@ -399,8 +399,8 @@ def test_list_tl_empty_list(): mod["main"] = f - for tgt, ctx in tvm.testing.enabled_targets(): - result = veval(mod, ctx=ctx, target=tgt) + for tgt, dev in tvm.testing.enabled_targets(): + result = veval(mod, device=dev, target=tgt) @tvm.testing.uses_gpu @@ -419,8 +419,8 @@ def test_list_tl(): mod["main"] = f - for tgt, ctx in tvm.testing.enabled_targets(): - result = veval(mod, ctx=ctx, target=tgt) + for tgt, dev in tvm.testing.enabled_targets(): + result = veval(mod, device=dev, target=tgt) tvm.testing.assert_allclose(vmobj_to_list(result), np.array([2, 1])) @@ -441,8 +441,8 @@ def test_list_nth(): f = relay.Function([], nth(l, relay.const(i))) mod["main"] = f - for tgt, ctx in tvm.testing.enabled_targets(): - result = veval(mod, ctx=ctx, target=tgt) + for tgt, dev in tvm.testing.enabled_targets(): + result = veval(mod, device=dev, target=tgt) tvm.testing.assert_allclose(result.asnumpy(), expected[i]) @@ -467,8 +467,8 @@ def test_list_update(): f = relay.Function([], l) mod["main"] = f - for tgt, ctx in tvm.testing.enabled_targets(): - result = veval(mod, ctx=ctx, target=tgt) + for tgt, dev in tvm.testing.enabled_targets(): + result = veval(mod, device=dev, target=tgt) tvm.testing.assert_allclose(vmobj_to_list(result), np.array(expected)) @@ -491,8 +491,8 @@ def test_list_length(): f = relay.Function([], l) mod["main"] = f - for tgt, ctx in tvm.testing.enabled_targets(): - result = veval(mod, ctx=ctx, target=tgt) + for tgt, dev in tvm.testing.enabled_targets(): + result = veval(mod, device=dev, target=tgt) tvm.testing.assert_allclose(result.asnumpy(), 10) @@ -511,8 +511,8 @@ def test_list_map(): f = relay.Function([], map(add_one_func, l)) mod["main"] = f - for tgt, ctx in tvm.testing.enabled_targets(): - result = veval(mod, ctx=ctx, target=tgt) + for tgt, dev in tvm.testing.enabled_targets(): + result = veval(mod, device=dev, target=tgt) tvm.testing.assert_allclose(vmobj_to_list(result), np.array([3, 2])) @@ -531,8 +531,8 @@ def test_list_foldl(): l = cons(relay.const(1), cons(relay.const(2), cons(relay.const(3), nil()))) f = relay.Function([], foldl(rev_dup_func, nil(), l)) mod["main"] = f - for tgt, ctx in tvm.testing.enabled_targets(): - result = veval(mod, ctx=ctx, target=tgt) + for tgt, dev in tvm.testing.enabled_targets(): + result = veval(mod, device=dev, target=tgt) tvm.testing.assert_allclose(vmobj_to_list(result), np.array([3, 3, 2, 2, 1, 1])) @@ -551,8 +551,8 @@ def test_list_foldr(): l = cons(relay.const(1), cons(relay.const(2), cons(relay.const(3), nil()))) f = relay.Function([], foldr(identity_func, nil(), l)) mod["main"] = f - for tgt, ctx in tvm.testing.enabled_targets(): - result = veval(mod, ctx=ctx, target=tgt) + for tgt, dev in tvm.testing.enabled_targets(): + result = veval(mod, device=dev, target=tgt) tvm.testing.assert_allclose(vmobj_to_list(result), np.array([1, 2, 3])) @@ -567,8 +567,8 @@ def test_list_sum(): l = cons(relay.const(1), cons(relay.const(2), cons(relay.const(3), nil()))) f = relay.Function([], 
sum(l)) mod["main"] = f - for tgt, ctx in tvm.testing.enabled_targets(): - result = veval(mod, ctx=ctx, target=tgt) + for tgt, dev in tvm.testing.enabled_targets(): + result = veval(mod, device=dev, target=tgt) tvm.testing.assert_allclose(result.asnumpy(), 6) @@ -590,8 +590,8 @@ def test_list_filter(): ) f = relay.Function([], filter(greater_than_one, l)) mod["main"] = f - for tgt, ctx in tvm.testing.enabled_targets(): - result = veval(mod, ctx=ctx, target=tgt) + for tgt, dev in tvm.testing.enabled_targets(): + result = veval(mod, device=dev, target=tgt) tvm.testing.assert_allclose(vmobj_to_list(result), np.array([3, 5])) @@ -603,8 +603,8 @@ def test_closure(): ff = relay.Function([y], f) clo = ff(relay.const(1.0)) main = clo(relay.const(2.0)) - for tgt, ctx in tvm.testing.enabled_targets(): - res = veval(main, ctx=ctx, target=tgt) + for tgt, dev in tvm.testing.enabled_targets(): + res = veval(main, device=dev, target=tgt) tvm.testing.assert_allclose(res.asnumpy(), 3.0) @@ -769,8 +769,8 @@ def test_vm_reshape_tuple(x_shape=(1, 4, 2), y_shape=(1, 2, 10)): x_data = np.random.uniform(size=x_shape).astype("float32") y_data = np.random.uniform(size=y_shape).astype("float32") - for tgt, ctx in tvm.testing.enabled_targets(): - res = veval(f, (x_data, y_data), ctx=ctx, target=tgt) + for tgt, dev in tvm.testing.enabled_targets(): + res = veval(f, (x_data, y_data), device=dev, target=tgt) tvm.testing.assert_allclose(res.asnumpy(), np.reshape(x_data, (1, -1))) diff --git a/tests/python/relay/test_vm_serialization.py b/tests/python/relay/test_vm_serialization.py index b2a695dc5434f..36f97f815e6e1 100644 --- a/tests/python/relay/test_vm_serialization.py +++ b/tests/python/relay/test_vm_serialization.py @@ -43,29 +43,29 @@ def create_exec(f, target="llvm", params=None): return executable -def get_serialized_output(mod, *data, params=None, target="llvm", ctx=tvm.cpu()): +def get_serialized_output(mod, *data, params=None, target="llvm", device=tvm.cpu()): exe = create_exec(mod, target, params=params) code, lib = exe.save() des_exec = _vm.Executable.load_exec(code, lib) - des_vm = _vm.VirtualMachine(des_exec, ctx) + des_vm = _vm.VirtualMachine(des_exec, device) result = des_vm.run(*data) return result def run_network(mod, params, dtype="float32"): - def get_vm_output(mod, data, params, target, ctx, dtype="float32"): - ex = relay.create_executor("vm", mod=mod, ctx=ctx) + def get_vm_output(mod, data, params, target, device, dtype="float32"): + ex = relay.create_executor("vm", mod=mod, device=device) result = ex.evaluate()(data, **params) return result.asnumpy().astype(dtype) data_shape = [int(x) for x in mod["main"].checked_type.arg_types[0].shape] data = np.random.uniform(size=data_shape).astype(dtype) target = "llvm" - ctx = tvm.cpu(0) + dev = tvm.cpu(0) - tvm_out = get_vm_output(mod, tvm.nd.array(data.astype(dtype)), params, target, ctx, dtype) + tvm_out = get_vm_output(mod, tvm.nd.array(data.astype(dtype)), params, target, dev, dtype) vm_out = get_serialized_output( - mod, tvm.nd.array(data.astype(dtype)), params=params, target=target, ctx=ctx + mod, tvm.nd.array(data.astype(dtype)), params=params, target=target, device=dev ) tvm.testing.assert_allclose(vm_out.asnumpy().astype(dtype), tvm_out, rtol=1e-5, atol=1e-5) @@ -311,8 +311,8 @@ def test_dynamic_bcast(): x_data = np.random.uniform(size=(1, 2)).astype(dtype) y_data = np.random.uniform(size=(3, 2)).astype(dtype) res_np = np.add(x_data, y_data) - for target, ctx in testing.enabled_targets(): - res = get_serialized_output(mod, *(x_data, y_data), 
target=target, ctx=ctx) + for target, dev in testing.enabled_targets(): + res = get_serialized_output(mod, *(x_data, y_data), target=target, device=dev) tvm.testing.assert_allclose(res.asnumpy(), res_np) diff --git a/tests/python/topi/python/test_fifo_buffer.py b/tests/python/topi/python/test_fifo_buffer.py index 458fabf4e1f36..c2a4f8e7dd844 100644 --- a/tests/python/topi/python/test_fifo_buffer.py +++ b/tests/python/topi/python/test_fifo_buffer.py @@ -46,22 +46,22 @@ def get_ref_data(): # Get the test data buffer_np, data_np, out_np = get_ref_data() - def check_device(device, ctx): - print(" Running on target: {}".format(device)) + def check_device(target, dev): + print(" Running on target: {}".format(target)) - with tvm.target.Target(device): + with tvm.target.Target(target): out = topi.nn.fifo_buffer(data, buffer, axis=axis) - s = tvm.topi.testing.get_injective_schedule(device)([out]) + s = tvm.topi.testing.get_injective_schedule(target)([out]) - buffer_tvm = tvm.nd.array(buffer_np, ctx=ctx) - data_tvm = tvm.nd.array(data_np, ctx=ctx) - out_tvm = tvm.nd.empty(shape=buffer_shape, ctx=ctx, dtype=dtype) - f = tvm.build(s, [data, buffer, out], device, name="fifo") + buffer_tvm = tvm.nd.array(buffer_np, device=dev) + data_tvm = tvm.nd.array(data_np, device=dev) + out_tvm = tvm.nd.empty(shape=buffer_shape, device=dev, dtype=dtype) + f = tvm.build(s, [data, buffer, out], target, name="fifo") f(data_tvm, buffer_tvm, out_tvm) tvm.testing.assert_allclose(out_tvm.asnumpy(), out_np) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_device(target, dev) def verify_conv1d_integration(): @@ -120,49 +120,49 @@ def get_data(): # Get the test data inc_input_np, input_window_np, kernel_np, context_np, output_window_np = get_data() - def check_device(device, ctx): - print(" Running on target: {}".format(device)) + def check_device(target, dev): + print(" Running on target: {}".format(target)) - conv2d_nchw, schedule_conv2d_nchw = tvm.topi.testing.get_conv2d_nchw_implement(device) + conv2d_nchw, schedule_conv2d_nchw = tvm.topi.testing.get_conv2d_nchw_implement(target) - with tvm.target.Target(device): + with tvm.target.Target(target): out = topi.nn.fifo_buffer(inc_input, context, axis=buffer_axis) - s = tvm.topi.testing.get_injective_schedule(device)([out]) - update_context = tvm.build(s, [inc_input, context, out], device, name="update_context") + s = tvm.topi.testing.get_injective_schedule(target)([out]) + update_context = tvm.build(s, [inc_input, context, out], target, name="update_context") out = conv2d_nchw(context, kernel, stride, padding, dilate, dtype) s = schedule_conv2d_nchw([out]) - conv2d_inc = tvm.build(s, [context, kernel, out], device, name="conv2d_inc") + conv2d_inc = tvm.build(s, [context, kernel, out], target, name="conv2d_inc") out = topi.nn.fifo_buffer(inc_output, output_window, axis=buffer_axis) - s = tvm.topi.testing.get_injective_schedule(device)([out]) + s = tvm.topi.testing.get_injective_schedule(target)([out]) update_output_window = tvm.build( - s, [inc_output, output_window, out], device, name="update_output_window" + s, [inc_output, output_window, out], target, name="update_output_window" ) out = topi.nn.fifo_buffer(inc_input, input_window, axis=buffer_axis) - s = tvm.topi.testing.get_injective_schedule(device)([out]) + s = tvm.topi.testing.get_injective_schedule(target)([out]) update_input_window = tvm.build( - s, [inc_input, input_window, out], device, name="update_input_window" + s, 
[inc_input, input_window, out], target, name="update_input_window" ) out = conv2d_nchw(input_window, kernel, stride, padding, dilate, dtype) s = schedule_conv2d_nchw([out]) - conv2d = tvm.build(s, [input_window, kernel, out], device, name="conv2d") - - input_window_tvm = tvm.nd.array(input_window_np, ctx=ctx) - new_input_window_tvm = tvm.nd.empty(shape=input_window_shape, ctx=ctx, dtype=dtype) - kernel_tvm = tvm.nd.array(kernel_np, ctx=ctx) - context_tvm = tvm.nd.array(context_np, ctx=ctx) - new_context_tvm = tvm.nd.empty(shape=context_shape, ctx=ctx, dtype=dtype) - inc_output_tvm = tvm.nd.empty(shape=inc_output_shape, ctx=ctx, dtype=dtype) - output_window_tvm = tvm.nd.array(output_window_np, ctx=ctx) - new_output_window_tvm = tvm.nd.empty(shape=output_window_shape, ctx=ctx, dtype=dtype) - output_window_ref_tvm = tvm.nd.empty(shape=output_window_shape, ctx=ctx, dtype=dtype) + conv2d = tvm.build(s, [input_window, kernel, out], target, name="conv2d") + + input_window_tvm = tvm.nd.array(input_window_np, device=dev) + new_input_window_tvm = tvm.nd.empty(shape=input_window_shape, device=dev, dtype=dtype) + kernel_tvm = tvm.nd.array(kernel_np, device=dev) + context_tvm = tvm.nd.array(context_np, device=dev) + new_context_tvm = tvm.nd.empty(shape=context_shape, device=dev, dtype=dtype) + inc_output_tvm = tvm.nd.empty(shape=inc_output_shape, device=dev, dtype=dtype) + output_window_tvm = tvm.nd.array(output_window_np, device=dev) + new_output_window_tvm = tvm.nd.empty(shape=output_window_shape, device=dev, dtype=dtype) + output_window_ref_tvm = tvm.nd.empty(shape=output_window_shape, device=dev, dtype=dtype) for i in range(num_iteration): # Take i-th slice of inc_input_np - inc_input_tvm = tvm.nd.array(inc_input_np[i], ctx=ctx) + inc_input_tvm = tvm.nd.array(inc_input_np[i], device=dev) # Compute new output window incrementally, using the FIFO buffer op update_context(inc_input_tvm, context_tvm, new_context_tvm) @@ -181,8 +181,8 @@ def check_device(device, ctx): output_window_tvm.asnumpy(), output_window_ref_tvm.asnumpy() ) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_device(target, dev) @tvm.testing.uses_gpu diff --git a/tests/python/topi/python/test_topi_argwhere.py b/tests/python/topi/python/test_topi_argwhere.py index 69993d287b794..2b75dada3f1ea 100644 --- a/tests/python/topi/python/test_topi_argwhere.py +++ b/tests/python/topi/python/test_topi_argwhere.py @@ -40,27 +40,27 @@ def verify_argwhere(data_shape): out_shape = te.placeholder(shape=(out_shape, len(data_shape)), name="out_shape", dtype=dtype) condition = te.placeholder(shape=data_shape, name="condition", dtype=dtype) - def check_device(device, ctx): - ctx = tvm.context(device, 0) - if not ctx.exist or device not in _argwhere_compute: + def check_device(target): + dev = tvm.device(target, 0) + if not dev.exist or target not in _argwhere_compute: return - with tvm.target.Target(device): - out = _argwhere_compute[device](out_shape, condition) - s_func = tvm.topi.testing.dispatch(device, _argwhere_schedule) + with tvm.target.Target(target): + out = _argwhere_compute[target](out_shape, condition) + s_func = tvm.topi.testing.dispatch(target, _argwhere_schedule) sch = s_func(out) - func = tvm.build(sch, [out_shape, condition, out], device, name="argwhere") + func = tvm.build(sch, [out_shape, condition, out], target, name="argwhere") - args = [tvm.nd.array(np_shape, ctx)] - args.append(tvm.nd.array(np_data, ctx)) - 
args.append(tvm.nd.empty(out.shape, ctx=ctx, dtype=condition.dtype)) + args = [tvm.nd.array(np_shape, dev)] + args.append(tvm.nd.array(np_data, dev)) + args.append(tvm.nd.empty(out.shape, device=dev, dtype=condition.dtype)) func(*args) np.set_printoptions(threshold=np.inf) tvm.testing.assert_allclose(args[-1].asnumpy(), np.array(np_out)) - for target, ctx in tvm.testing.enabled_targets(): - check_device(target, ctx) + for target, _ in tvm.testing.enabled_targets(): + check_device(target) @tvm.testing.uses_gpu diff --git a/tests/python/topi/python/test_topi_batch_matmul.py b/tests/python/topi/python/test_topi_batch_matmul.py index 78f85d079f58d..05f2c3029bc9d 100644 --- a/tests/python/topi/python/test_topi_batch_matmul.py +++ b/tests/python/topi/python/test_topi_batch_matmul.py @@ -61,10 +61,10 @@ def get_ref_data(): # get the test data a_np, b_np, c_np = get_ref_data() - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): - fcompute, fschedule = tvm.topi.testing.dispatch(device, _batch_matmul_implement) + def check_device(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): + fcompute, fschedule = tvm.topi.testing.dispatch(target, _batch_matmul_implement) out = fcompute(x, y) if not dynamic: s = fschedule([out]) @@ -76,19 +76,19 @@ def check_device(device, ctx): if debug: print(tvm.lower(s, [x, y, out], simple_mode=True)) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(out_shape), dtype=dtype), ctx) - f = tvm.build(s, [x, y, out], device, name="dense") + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(out_shape), dtype=dtype), dev) + f = tvm.build(s, [x, y, out], target, name="dense") f(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) - for device, ctx in tvm.testing.enabled_targets(): - if dynamic and (device == "cuda" or device == "nvptx"): - print("Dynamic batch matmul test is skippped on %s" % device) + for target, dev in tvm.testing.enabled_targets(): + if dynamic and (target == "cuda" or target == "nvptx"): + print("Dynamic batch matmul test is skippped on %s" % target) continue - check_device(device, ctx) + check_device(target, dev) @tvm.testing.uses_gpu diff --git a/tests/python/topi/python/test_topi_batch_matmul_tensorcore.py b/tests/python/topi/python/test_topi_batch_matmul_tensorcore.py index 77df5be0a491e..9712aa561d512 100644 --- a/tests/python/topi/python/test_topi_batch_matmul_tensorcore.py +++ b/tests/python/topi/python/test_topi_batch_matmul_tensorcore.py @@ -47,15 +47,15 @@ def get_ref_data(): a_np, b_np, c_np = get_ref_data() def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) print("Running on target: %s" % device) with tvm.target.Target(device): fcompute, fschedule = tvm.topi.testing.dispatch(device, _batch_matmul_implement) out = fcompute(x, y) s = fschedule([out]) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=dtype), ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=dtype), dev) f = tvm.build(s, [x, y, out], device, name="dense") f(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-3) diff --git a/tests/python/topi/python/test_topi_batch_to_space_nd.py b/tests/python/topi/python/test_topi_batch_to_space_nd.py index 89d044fed9636..7a8a813a196b7 100644 --- 
a/tests/python/topi/python/test_topi_batch_to_space_nd.py +++ b/tests/python/topi/python/test_topi_batch_to_space_nd.py @@ -42,18 +42,18 @@ def verify_batch_to_space_nd(input_shape, block_shape, crop_begin_list, crop_end a_np, block_shape, crop_begin_list, crop_end_list ) - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.create(device): - s = tvm.topi.testing.get_injective_schedule(device)(B) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), ctx) - f = tvm.build(s, [A, B], device) + def check_device(target, dev): + print("Running on target: %s" % target) + with tvm.target.create(target): + s = tvm.topi.testing.get_injective_schedule(target)(B) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), dev) + f = tvm.build(s, [A, B], target) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3, atol=1e-3) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_device(target, dev) @tvm.testing.uses_gpu diff --git a/tests/python/topi/python/test_topi_bitserial_conv2d.py b/tests/python/topi/python/test_topi_bitserial_conv2d.py index b0bce44a03f93..4834b9069f9c5 100644 --- a/tests/python/topi/python/test_topi_bitserial_conv2d.py +++ b/tests/python/topi/python/test_topi_bitserial_conv2d.py @@ -72,10 +72,10 @@ def get_ref_data(): a_np, w_np, b_np = get_ref_data() - ctx = tvm.cpu(0) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) + dev = tvm.cpu(0) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev) func = tvm.build(s, [A, W, B], "llvm") func(a, w, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) @@ -125,10 +125,10 @@ def get_ref_data(): a_np, w_np, b_np = get_ref_data() - ctx = tvm.cpu(0) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) + dev = tvm.cpu(0) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev) func = tvm.build(s, [A, W, B], "llvm") func(a, w, b) diff --git a/tests/python/topi/python/test_topi_bitserial_conv2d_rasp.py b/tests/python/topi/python/test_topi_bitserial_conv2d_rasp.py index 1cd982db54507..2478e92625f7f 100644 --- a/tests/python/topi/python/test_topi_bitserial_conv2d_rasp.py +++ b/tests/python/topi/python/test_topi_bitserial_conv2d_rasp.py @@ -67,7 +67,7 @@ def verify_bitserial_conv2d_nhwc( matches = re.findall("vpadd", assembly) assert len(matches) > 0 - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if "arm" not in os.uname()[4]: print("Skipped running code, not an arm device") return @@ -89,9 +89,9 @@ def get_ref_data(): return a_np, w_np, b_np a_np, w_np, b_np = get_ref_data() - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev) func = tvm.build(s, [A, W, B], device) func(a, w, b) diff --git a/tests/python/topi/python/test_topi_bitserial_dense.py b/tests/python/topi/python/test_topi_bitserial_dense.py index a624b1b1fede6..1e68fddcede9c 100644 --- a/tests/python/topi/python/test_topi_bitserial_dense.py +++ 
b/tests/python/topi/python/test_topi_bitserial_dense.py @@ -68,10 +68,10 @@ def get_ref_data(a_shape, b_shape, input_dtype): b_shape = get_const_tuple(B.shape) a_np, b_np, c_np = get_ref_data(a_shape, b_shape, input_dtype) - ctx = tvm.cpu(0) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + dev = tvm.cpu(0) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) func = tvm.build(s, [A, B, C], target) func(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) diff --git a/tests/python/topi/python/test_topi_bnn.py b/tests/python/topi/python/test_topi_bnn.py index fbd9ac5d66c6d..710489e41d459 100644 --- a/tests/python/topi/python/test_topi_bnn.py +++ b/tests/python/topi/python/test_topi_bnn.py @@ -51,12 +51,12 @@ def get_ref_data(): a_np, b_np, c_np = get_ref_data() - ctx = tvm.cpu(0) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) - bnn_a = tvm.nd.array(np.zeros(get_const_tuple(bnn_A.shape), dtype=bnn_A.dtype), ctx) - bnn_b = tvm.nd.array(np.zeros(get_const_tuple(bnn_B.shape), dtype=bnn_B.dtype), ctx) - bnn_c = tvm.nd.array(np.zeros(get_const_tuple(bnn_C.shape), dtype=bnn_C.dtype), ctx) + dev = tvm.cpu(0) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) + bnn_a = tvm.nd.array(np.zeros(get_const_tuple(bnn_A.shape), dtype=bnn_A.dtype), dev) + bnn_b = tvm.nd.array(np.zeros(get_const_tuple(bnn_B.shape), dtype=bnn_B.dtype), dev) + bnn_c = tvm.nd.array(np.zeros(get_const_tuple(bnn_C.shape), dtype=bnn_C.dtype), dev) f1 = tvm.build(s1, [A, bnn_A], "llvm") f2 = tvm.build(s2, [B, bnn_B], "llvm") f3 = tvm.build(s3, [bnn_A1, bnn_B1, bnn_C], "llvm") diff --git a/tests/python/topi/python/test_topi_broadcast.py b/tests/python/topi/python/test_topi_broadcast.py index ada03ea5377bf..1abd2cfc5e502 100644 --- a/tests/python/topi/python/test_topi_broadcast.py +++ b/tests/python/topi/python/test_topi_broadcast.py @@ -28,25 +28,25 @@ def verify_broadcast_to_ele(in_shape, out_shape, fbcast): A = te.placeholder(shape=in_shape, name="A") B = fbcast(A, out_shape) - def check_device(device): - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) + def check_target(target): + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) return - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_broadcast_schedule(device)(B) - foo = tvm.build(s, [A, B], device, name="broadcast_to") + print("Running on target: %s" % target) + with tvm.target.Target(target): + s = tvm.topi.testing.get_broadcast_schedule(target)(B) + foo = tvm.build(s, [A, B], target, name="broadcast_to") data_npy = np.random.uniform(size=in_shape).astype(A.dtype) out_npy = np.broadcast_to(data_npy, out_shape) - data_nd = tvm.nd.array(data_npy, ctx) - out_nd = tvm.nd.array(np.empty(out_shape).astype(B.dtype), ctx) + data_nd = tvm.nd.array(data_npy, dev) + out_nd = tvm.nd.array(np.empty(out_shape).astype(B.dtype), dev) foo(data_nd, out_nd) tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy) - for target, ctx in tvm.testing.enabled_targets(): - check_device(target) - check_device("sdaccel") + for target, dev in tvm.testing.enabled_targets(): + check_target(target) + check_target("sdaccel") def verify_broadcast_binary_ele( @@ -76,7 +76,7 @@ def verify_broadcast_binary_ele( 
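The hunks in this file and the ones that follow apply one mechanical migration: the runtime handle formerly created with tvm.context and passed around as ctx is now created with tvm.device and named dev, and the ctx= keyword on tvm.nd constructors becomes device=. A minimal sketch of the pattern the migrated tests converge on, assuming a TVM build that already includes this rename (the kernel and shapes are illustrative, not taken from the patch):

import numpy as np
import tvm
import tvm.testing
from tvm import te

# Old style (pre-rename):  ctx = tvm.context("llvm", 0)
# New style (this series): dev = tvm.device("llvm", 0)
dev = tvm.device("llvm", 0)

n = 16
A = te.placeholder((n,), name="A")
B = te.compute((n,), lambda i: A[i] + 1.0, name="B")
s = te.create_schedule(B.op)
f = tvm.build(s, [A, B], "llvm")

a = tvm.nd.array(np.random.uniform(size=n).astype("float32"), dev)  # handle passed positionally
b = tvm.nd.empty((n,), dtype="float32", device=dev)                 # keyword was ctx=, now device=
f(a, b)
tvm.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1.0)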
assert isinstance(C, tvm.tir.PrimExpr) return - def gen_operand(shape, low, high, ctx): + def gen_operand(shape, low, high, dev): if shape is None: npy = float(np.random.uniform(low=low, high=high)) if dtype.startswith("int"): @@ -84,30 +84,30 @@ def gen_operand(shape, low, high, ctx): nd = npy else: npy = np.random.uniform(low=low, high=high, size=shape).astype(dtype) - nd = tvm.nd.array(npy, ctx) + nd = tvm.nd.array(npy, dev) return npy, nd - def check_device(device): - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) + def check_target(target): + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) return - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_broadcast_schedule(device)(C) - foo = tvm.build(s, [A, B, C], device, name="broadcast_binary" + "_" + ftopi.__name__) + print("Running on target: %s" % target) + with tvm.target.Target(target): + s = tvm.topi.testing.get_broadcast_schedule(target)(C) + foo = tvm.build(s, [A, B, C], target, name="broadcast_binary" + "_" + ftopi.__name__) - lhs_npy, lhs_nd = gen_operand(lhs_shape, lhs_min, lhs_max, ctx) - rhs_npy, rhs_nd = gen_operand(rhs_shape, rhs_min, rhs_max, ctx) + lhs_npy, lhs_nd = gen_operand(lhs_shape, lhs_min, lhs_max, dev) + rhs_npy, rhs_nd = gen_operand(rhs_shape, rhs_min, rhs_max, dev) out_npy = fnumpy(lhs_npy, rhs_npy) - out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(C.dtype), ctx) + out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(C.dtype), dev) foo(lhs_nd, rhs_nd, out_nd) tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy, rtol=1e-4, atol=1e-4) - for target, ctx in tvm.testing.enabled_targets(): - check_device(target) - check_device("sdaccel") + for target, dev in tvm.testing.enabled_targets(): + check_target(target) + check_target("sdaccel") @tvm.testing.uses_gpu @@ -304,22 +304,22 @@ def test_apply( assert isinstance(B, tvm.tir.PrimExpr) return - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_broadcast_schedule(device)(B) - foo = tvm.build(s, [A, B], device, name=name) + def check_target(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): + s = tvm.topi.testing.get_broadcast_schedule(target)(B) + foo = tvm.build(s, [A, B], target, name=name) data_npy = indata.astype(A.dtype) - data_nd = tvm.nd.array(data_npy, ctx) + data_nd = tvm.nd.array(data_npy, dev) out_npy = f_numpy(indata) - out_nd = tvm.nd.array(np.empty(data_npy.shape).astype(B.dtype), ctx) + out_nd = tvm.nd.array(np.empty(data_npy.shape).astype(B.dtype), dev) foo(data_nd, out_nd) tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) test_apply(topi.logical_not, "logical_not", np.logical_not, np.array([True, False, 0, 1])) test_apply(topi.logical_not, "logical_not", np.logical_not, np.array(np.arange(5) < 3)) @@ -342,22 +342,22 @@ def test_apply( assert isinstance(B, tvm.tir.PrimExpr) return - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_broadcast_schedule(device)(B) - foo = tvm.build(s, [A, B], device, name=name) + def check_target(target, dev): + print("Running on target: %s" % target) 
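Alongside the handle rename, these broadcast tests rename the per-target driver from check_device(device) to check_target(target), matching the (target, device) tuples yielded by tvm.testing.enabled_targets(). A sketch of that loop under the new naming, assuming an elementwise op so the generic injective schedule applies (the op itself is a placeholder, not from the patch):

import numpy as np
import tvm
import tvm.testing
import tvm.topi.testing
from tvm import te

n = 8
A = te.placeholder((n,), name="A")
B = te.compute((n,), lambda i: A[i] * 2.0, name="B")

def check_target(target, dev):
    # target is a string such as "llvm" or "cuda"; dev is the matching runtime handle
    print("Running on target: %s" % target)
    with tvm.target.Target(target):
        s = tvm.topi.testing.get_injective_schedule(target)(B)
    foo = tvm.build(s, [A, B], target)
    a = tvm.nd.array(np.ones(n, dtype="float32"), dev)
    b = tvm.nd.array(np.zeros(n, dtype="float32"), dev)
    foo(a, b)
    tvm.testing.assert_allclose(b.asnumpy(), a.asnumpy() * 2.0)

for target, dev in tvm.testing.enabled_targets():
    check_target(target, dev)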
+ with tvm.target.Target(target): + s = tvm.topi.testing.get_broadcast_schedule(target)(B) + foo = tvm.build(s, [A, B], target, name=name) data_npy = np.random.uniform(size=shape).astype(A.dtype) - data_nd = tvm.nd.array(data_npy, ctx) + data_nd = tvm.nd.array(data_npy, dev) out_npy = f_numpy(data_npy) - out_nd = tvm.nd.array(np.empty(data_npy.shape).astype(B.dtype), ctx) + out_nd = tvm.nd.array(np.empty(data_npy.shape).astype(B.dtype), dev) foo(data_nd, out_nd) tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) test_apply(topi.bitwise_not, "bitwise_not", np.bitwise_not, ()) test_apply(topi.bitwise_not, "bitwise_not", np.bitwise_not, (2, 1, 2)) @@ -381,22 +381,22 @@ def test_apply( assert isinstance(C, tvm.tir.PrimExpr) return - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_broadcast_schedule(device)(C) - foo = tvm.build(s, [A, B, C], device, name=name) + def check_target(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): + s = tvm.topi.testing.get_broadcast_schedule(target)(C) + foo = tvm.build(s, [A, B, C], target, name=name) - lhs_nd = tvm.nd.array(lhs, ctx) - rhs_nd = tvm.nd.array(rhs, ctx) + lhs_nd = tvm.nd.array(lhs, dev) + rhs_nd = tvm.nd.array(rhs, dev) out_npy = f_numpy(lhs, rhs) - out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(C.dtype), ctx) + out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(C.dtype), dev) foo(lhs_nd, rhs_nd, out_nd) tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy, rtol=1e-4, atol=1e-4) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) test_apply(topi.logical_and, "logical_and", np.logical_and, True, False) test_apply(topi.logical_and, "logical_and", np.logical_and, [True, False], [False, False]) diff --git a/tests/python/topi/python/test_topi_clip.py b/tests/python/topi/python/test_topi_clip.py index 704ffe7e6843e..b8d5321d40f41 100644 --- a/tests/python/topi/python/test_topi_clip.py +++ b/tests/python/topi/python/test_topi_clip.py @@ -39,19 +39,19 @@ def get_ref_data(): a_np, b_np = get_ref_data() - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_injective_schedule(device)(B) + def check_target(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): + s = tvm.topi.testing.get_injective_schedule(target)(B) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx) - f = tvm.build(s, [A, B], device, name="clip") + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), dev) + f = tvm.build(s, [A, B], target, name="clip") f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) @tvm.testing.uses_gpu diff --git a/tests/python/topi/python/test_topi_conv1d.py b/tests/python/topi/python/test_topi_conv1d.py index aad029ce3ce5d..4b1d712824843 100644 --- a/tests/python/topi/python/test_topi_conv1d.py +++ b/tests/python/topi/python/test_topi_conv1d.py @@ -77,25 +77,25 @@ def get_ref_data(layout): 
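A recurring detail across these conversions: after the rename, the target string flows only into compile-side calls (tvm.target.Target, schedule dispatch, tvm.build), while the dev handle flows only into runtime data placement. A small sketch separating the two roles (illustrative kernel; the names are not from the patch):

import numpy as np
import tvm
from tvm import te

target = "llvm"              # compile time: which code to generate
dev = tvm.device(target, 0)  # run time: where tensors live

n = 4
A = te.placeholder((n,), name="A")
B = te.compute((n,), lambda i: A[i] * A[i], name="B")
s = te.create_schedule(B.op)

with tvm.target.Target(target):           # target string drives code generation
    f = tvm.build(s, [A, B], target, name="square")

a = tvm.nd.array(np.arange(n, dtype="float32"), dev)  # device handle drives data placement
b = tvm.nd.array(np.zeros(n, dtype="float32"), dev)
f(a, b)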
a_np, w_np, b_np = get_ref_data(layout) - def check_device(device, ctx): + def check_target(target, dev): if layout == "NCW": - fcompute, fschedule = tvm.topi.testing.dispatch(device, _conv1d_ncw_implement) + fcompute, fschedule = tvm.topi.testing.dispatch(target, _conv1d_ncw_implement) else: - fcompute, fschedule = tvm.topi.testing.dispatch(device, _conv1d_nwc_implement) - with tvm.target.Target(device): + fcompute, fschedule = tvm.topi.testing.dispatch(target, _conv1d_nwc_implement) + with tvm.target.Target(target): B = fcompute(A, W, stride, padding, dilation, "float32") s = fschedule([B]) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), dev) - func = tvm.build(s, [A, W, B], device) + func = tvm.build(s, [A, W, B], target) func(a, w, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) @tvm.testing.uses_gpu diff --git a/tests/python/topi/python/test_topi_conv1d_transpose_ncw.py b/tests/python/topi/python/test_topi_conv1d_transpose_ncw.py index 2b8c486b8cd10..bfb60a9168d93 100644 --- a/tests/python/topi/python/test_topi_conv1d_transpose_ncw.py +++ b/tests/python/topi/python/test_topi_conv1d_transpose_ncw.py @@ -54,28 +54,28 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() - def check_device(device, ctx): - ctx = tvm.context(device, 0) - with tvm.target.Target(device): - fcompute, fschedule = tvm.topi.testing.dispatch(device, _conv1d_transpose_ncw_implement) + def check_target(target, dev): + dev = tvm.device(target, 0) + with tvm.target.Target(target): + fcompute, fschedule = tvm.topi.testing.dispatch(target, _conv1d_transpose_ncw_implement) B = fcompute(A, W, stride, padding, A.dtype, output_padding) C = topi.nn.relu(B) s1 = fschedule([B]) s2 = fschedule([C]) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) - func1 = tvm.build(s1, [A, W, B], device) - func2 = tvm.build(s2, [A, W, C], device) + func1 = tvm.build(s1, [A, W, B], target) + func2 = tvm.build(s2, [A, W, C], target) func1(a, w, b) func2(a, w, c) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) @tvm.testing.uses_gpu diff --git a/tests/python/topi/python/test_topi_conv2d_NCHWc.py b/tests/python/topi/python/test_topi_conv2d_NCHWc.py index b1955ef5fa3b3..09bd17ab2a72c 100644 --- a/tests/python/topi/python/test_topi_conv2d_NCHWc.py +++ b/tests/python/topi/python/test_topi_conv2d_NCHWc.py @@ -116,7 +116,7 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return @@ -138,10 
+138,10 @@ def check_device(device): C = topi.nn.relu(C) s = topi.x86.schedule_conv2d_NCHWc([C]) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: func = tvm.build( s, diff --git a/tests/python/topi/python/test_topi_conv2d_hwcn.py b/tests/python/topi/python/test_topi_conv2d_hwcn.py index bd88839c9c15f..74aa1bfe08808 100644 --- a/tests/python/topi/python/test_topi_conv2d_hwcn.py +++ b/tests/python/topi/python/test_topi_conv2d_hwcn.py @@ -58,30 +58,30 @@ def get_ref_data(): a_np, w_np, b_np, c1_np, c2_np, c3_np = get_ref_data() - def check_device(device): - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) + def check_target(target): + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) return - print("Running on target: %s" % device) - with tvm.target.Target(device): - fcompute, fschedule = tvm.topi.testing.dispatch(device, _conv2d_hwcn_implement) + print("Running on target: %s" % target) + with tvm.target.Target(target): + fcompute, fschedule = tvm.topi.testing.dispatch(target, _conv2d_hwcn_implement) t_conv = fcompute(A, W, stride, padding, dilation) t_bias = topi.add(t_conv, B) t_relu = topi.nn.relu(t_bias) s1 = fschedule([t_conv]) s2 = fschedule([t_bias]) s3 = fschedule([t_relu]) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(b_np, ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(b_np, dev) - conv_out = tvm.nd.array(np.zeros(get_const_tuple(t_conv.shape), dtype=t_conv.dtype), ctx) - bias_out = tvm.nd.array(np.zeros(get_const_tuple(t_bias.shape), dtype=t_bias.dtype), ctx) - relu_out = tvm.nd.array(np.zeros(get_const_tuple(t_relu.shape), dtype=t_relu.dtype), ctx) - func1 = tvm.build(s1, [A, W, t_conv], device) - func2 = tvm.build(s2, [A, W, B, t_bias], device) - func3 = tvm.build(s3, [A, W, B, t_relu], device) + conv_out = tvm.nd.array(np.zeros(get_const_tuple(t_conv.shape), dtype=t_conv.dtype), dev) + bias_out = tvm.nd.array(np.zeros(get_const_tuple(t_bias.shape), dtype=t_bias.dtype), dev) + relu_out = tvm.nd.array(np.zeros(get_const_tuple(t_relu.shape), dtype=t_relu.dtype), dev) + func1 = tvm.build(s1, [A, W, t_conv], target) + func2 = tvm.build(s2, [A, W, B, t_bias], target) + func3 = tvm.build(s3, [A, W, B, t_relu], target) func1(a, w, conv_out) func2(a, w, b, bias_out) func3(a, w, b, relu_out) @@ -89,8 +89,8 @@ def check_device(device): tvm.testing.assert_allclose(bias_out.asnumpy(), c2_np, rtol=1e-5) tvm.testing.assert_allclose(relu_out.asnumpy(), c3_np, rtol=1e-5) - for device in ["cuda", "opencl", "metal", "rocm", "vulkan", "nvptx"]: - check_device(device) + for target in ["cuda", "opencl", "metal", "rocm", "vulkan", "nvptx"]: + check_target(target) @tvm.testing.requires_gpu diff --git a/tests/python/topi/python/test_topi_conv2d_hwnc_tensorcore.py b/tests/python/topi/python/test_topi_conv2d_hwnc_tensorcore.py index 9d63175d2e844..bb11a56cdce35 100644 --- a/tests/python/topi/python/test_topi_conv2d_hwnc_tensorcore.py +++ b/tests/python/topi/python/test_topi_conv2d_hwnc_tensorcore.py @@ -103,28 +103,28 @@ def convert_int32_into_int4(a_int32): a_np = convert_int32_into_int4(a_np) w_np = 
convert_int32_into_int4(w_np) - def check_device(device): - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) + def check_target(target): + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) return - if not nvcc.have_tensorcore(ctx.compute_version): + if not nvcc.have_tensorcore(dev.compute_version): print("skip because gpu does not support Tensor Cores") return - print("Running on target: %s" % device) - with tvm.target.Target(device): - fcompute, fschedule = topi.testing.dispatch(device, _conv2d_hwnc_tensorcore_implement) + print("Running on target: %s" % target) + with tvm.target.Target(target): + fcompute, fschedule = topi.testing.dispatch(target, _conv2d_hwnc_tensorcore_implement) C = fcompute(A, W, stride, padding, dilation, dtype, "int32") s = fschedule([C]) - a = tvm.nd.array(a_np.transpose((1, 2, 0, 3)), ctx) - w = tvm.nd.array(w_np, ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np.transpose((1, 2, 0, 3)), dev) + w = tvm.nd.array(w_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) func = tvm.build( s, [A, W, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) @@ -133,7 +133,7 @@ def check_device(device): rtol = 1e-3 tvm.testing.assert_allclose(c.asnumpy().transpose((2, 0, 1, 3)), c_np, rtol=rtol) - check_device("cuda") + check_target("cuda") @tvm.testing.requires_tensorcore diff --git a/tests/python/topi/python/test_topi_conv2d_int8.py b/tests/python/topi/python/test_topi_conv2d_int8.py index a934e3ef2fd20..07f7895f47f7a 100644 --- a/tests/python/topi/python/test_topi_conv2d_int8.py +++ b/tests/python/topi/python/test_topi_conv2d_int8.py @@ -83,16 +83,16 @@ def compile_conv2d_NHWC_gemm_int8_arm( ] for device_tuple in devices: - device = device_tuple[0] + target = device_tuple[0] compute = device_tuple[1] schedule = device_tuple[2] - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) return - print("Compiling on arm AArch64 target: %s" % device) - with tvm.target.Target(device): + print("Compiling on arm AArch64 target: %s" % target) + with tvm.target.Target(target): assert is_aarch64_arm(), "AArch64 target not recognized" C = compute(A, W, (stride, stride), padding, (dilation, dilation), dtype) @@ -106,14 +106,14 @@ def compile_conv2d_NHWC_gemm_int8_arm( tvm.build( s, [A, W, bias, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func = tvm.build( s, [A, W, bias, C], - device, + target, name="relu_%dnnn_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) @@ -121,7 +121,7 @@ def compile_conv2d_NHWC_gemm_int8_arm( func = tvm.build( s, [A, W, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) @@ -175,13 +175,13 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() - def check_device(device): - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) 
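The tensor-core and int8 hunks gate each test on hardware capabilities, and after the rename those queries go through the dev handle rather than ctx. A sketch of the guard pattern used above, assuming a CUDA-enabled build:

import tvm
import tvm.testing
from tvm.contrib import nvcc

target = "cuda"
dev = tvm.device(target, 0)

if not tvm.testing.device_enabled(target):
    print("Skip because %s is not enabled" % target)
elif not nvcc.have_tensorcore(dev.compute_version):
    # dev.compute_version reports the CUDA compute capability, e.g. "7.5"
    print("skip because gpu does not support Tensor Cores")
else:
    print("Running on target: %s" % target)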
+ def check_target(target): + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) return - print("Running on target: %s" % device) - with tvm.target.Target(device): + print("Running on target: %s" % target) + with tvm.target.Target(target): C = topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved( A, W, (stride, stride), padding, (dilation, dilation), dtype ) @@ -191,22 +191,22 @@ def check_device(device): C = topi.nn.relu(C) s = topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved([C]) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: tvm.build( s, [A, W, bias, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func = tvm.build( s, [A, W, bias, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) @@ -215,14 +215,14 @@ def check_device(device): func = tvm.build( s, [A, W, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) - check_device("llvm") + check_target("llvm") oc_block_factor = 4 @@ -284,17 +284,17 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() - def check_device(device): - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) + def check_target(target): + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) return - if device == "cuda" and not tvm.contrib.nvcc.have_int8(ctx.compute_version): + if target == "cuda" and not tvm.contrib.nvcc.have_int8(dev.compute_version): print("Skip because int8 intrinsics are not available") return - print("Running on target: %s" % device) - with tvm.target.Target(device): + print("Running on target: %s" % target) + with tvm.target.Target(target): C = topi.cuda.conv2d_NCHWc_int8( A, W, (stride, stride), padding, (dilation, dilation), "NCHW", dtype ) @@ -304,22 +304,22 @@ def check_device(device): C = topi.nn.relu(C) s = topi.cuda.schedule_conv2d_NCHWc_int8([C]) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: tvm.build( s, [A, W, bias, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func = tvm.build( s, [A, W, bias, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) @@ -328,15 +328,15 @@ def check_device(device): func = tvm.build( s, [A, W, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func(a, w, c) 
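Every migrated test ends the same way: run the built function on device-resident arrays, copy the result back to host with asnumpy(), and compare against a NumPy reference with a tolerance. A condensed sketch of that verification step (the kernel is illustrative; the rtol value mirrors the tests above):

import numpy as np
import tvm
import tvm.testing
from tvm import te

n = 32
A = te.placeholder((n,), name="A")
B = te.compute((n,), lambda i: A[i] + A[i], name="B")
s = te.create_schedule(B.op)
func = tvm.build(s, [A, B], "llvm")

dev = tvm.device("llvm", 0)
a_np = np.random.uniform(size=n).astype("float32")
a = tvm.nd.array(a_np, dev)
b = tvm.nd.array(np.zeros(n, dtype="float32"), dev)
func(a, b)
# asnumpy() copies the device tensor back to the host for comparison
tvm.testing.assert_allclose(b.asnumpy(), a_np * 2, rtol=1e-5)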
tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) - for device in ["cuda"]: - check_device(device) + for target in ["cuda"]: + check_target(target) def verify_conv2d_nchw_int8( @@ -403,17 +403,17 @@ def verify_workload_padding(): tvm.testing.assert_allclose(ow_tile, out_width) - def check_device(device): - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) + def check_target(target): + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) return - if device == "cuda" and not tvm.contrib.nvcc.have_int8(ctx.compute_version): + if target == "cuda" and not tvm.contrib.nvcc.have_int8(dev.compute_version): print("Skip because int8 intrinsics are not available") return - print("Running on target: %s" % device) - with tvm.target.Target(device): + print("Running on target: %s" % target) + with tvm.target.Target(target): C = topi.cuda.conv2d_nchw_int8( A, W, (stride, stride), padding, (dilation, dilation), dtype ) @@ -423,22 +423,22 @@ def check_device(device): C = topi.nn.relu(C) s = topi.cuda.schedule_conv2d_nchw_int8([C]) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: tvm.build( s, [A, W, bias, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func = tvm.build( s, [A, W, bias, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) @@ -447,7 +447,7 @@ def check_device(device): func = tvm.build( s, [A, W, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) @@ -456,8 +456,8 @@ def check_device(device): verify_workload_padding() - for device in ["cuda"]: - check_device(device) + for target in ["cuda"]: + check_target(target) @tvm.testing.requires_cuda diff --git a/tests/python/topi/python/test_topi_conv2d_nchw.py b/tests/python/topi/python/test_topi_conv2d_nchw.py index 07ad45c971df3..5aff6e8076339 100644 --- a/tests/python/topi/python/test_topi_conv2d_nchw.py +++ b/tests/python/topi/python/test_topi_conv2d_nchw.py @@ -89,20 +89,20 @@ def verify_workload_padding(): tvm.testing.assert_allclose(ow_tile, out_width) - def check_device(device): - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) + def check_target(target): + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) return - print("Running on target: %s" % device) + print("Running on target: %s" % target) - if "cudnn" in device: + if "cudnn" in target: fcompute, fschedule = topi.cuda.conv2d_cudnn, topi.cuda.schedule_conv2d_cudnn else: - fcompute, fschedule = tvm.topi.testing.get_conv2d_nchw_implement(device) + fcompute, fschedule = tvm.topi.testing.get_conv2d_nchw_implement(target) - with tvm.target.Target(device): - if "cudnn" in device: + with tvm.target.Target(target): + if "cudnn" in target: C = fcompute( A, W, (stride, stride), padding, (dilation, dilation), 1, "NCHW", dtype ) @@ 
-114,19 +114,19 @@ def check_device(device): C = topi.nn.relu(C) s = fschedule([C]) - if "llvm" in device: + if "llvm" in target: verify_workload_padding() - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(b_np, ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(b_np, dev) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: func = tvm.build( s, [A, W, bias, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) @@ -135,19 +135,19 @@ def check_device(device): func = tvm.build( s, [A, W, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-4) - for device, ctx in tvm.testing.enabled_targets(): - with autotvm.tophub.context(device): # load tophub pre-tuned parameters - check_device(device) + for target, dev in tvm.testing.enabled_targets(): + with autotvm.tophub.context(target): # load tophub pre-tuned parameters + check_target(target) if use_cudnn: - check_device("cuda -model=unknown -libs=cudnn") + check_target("cuda -model=unknown -libs=cudnn") @tvm.testing.uses_gpu diff --git a/tests/python/topi/python/test_topi_conv2d_nhwc.py b/tests/python/topi/python/test_topi_conv2d_nhwc.py index 8c3b9e931eeac..98a9387e87773 100644 --- a/tests/python/topi/python/test_topi_conv2d_nhwc.py +++ b/tests/python/topi/python/test_topi_conv2d_nhwc.py @@ -67,10 +67,10 @@ def check_device(device): fcompute, fschedule = tvm.topi.testing.dispatch(device, _conv2d_nhwc_implement) B = fcompute(A, W, stride, padding, dilation, dtype) s = fschedule([B]) - ctx = tvm.context(device, 0) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) + dev = tvm.device(device, 0) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev) func = tvm.build(s, [A, W, B], device) func(a, w, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) diff --git a/tests/python/topi/python/test_topi_conv2d_nhwc_pack_int8.py b/tests/python/topi/python/test_topi_conv2d_nhwc_pack_int8.py index 66ce6ffe41f4d..a191f2eb9d370 100644 --- a/tests/python/topi/python/test_topi_conv2d_nhwc_pack_int8.py +++ b/tests/python/topi/python/test_topi_conv2d_nhwc_pack_int8.py @@ -52,7 +52,7 @@ def get_ref_data(): a_np, w_np, b_np = get_ref_data() def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return @@ -61,9 +61,9 @@ def check_device(device): with tvm.target.Target(device): B = topi.nn.conv2d(A, W, stride, padding, dilation, layout="NHWC", out_dtype="int32") s = topi.x86.schedule_conv2d_nhwc_pack_int8([B]) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev) func = tvm.build(s, [A, W, B], device) func(a, w, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) diff --git a/tests/python/topi/python/test_topi_conv2d_nhwc_tensorcore.py 
b/tests/python/topi/python/test_topi_conv2d_nhwc_tensorcore.py index eab73410dbe66..5761dccf48fc9 100644 --- a/tests/python/topi/python/test_topi_conv2d_nhwc_tensorcore.py +++ b/tests/python/topi/python/test_topi_conv2d_nhwc_tensorcore.py @@ -83,11 +83,11 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return - if not nvcc.have_tensorcore(ctx.compute_version): + if not nvcc.have_tensorcore(dev.compute_version): print("skip because gpu does not support Tensor Cores") return print("Running on target: %s" % device) @@ -102,10 +102,10 @@ def check_device(device): C = topi.nn.relu(C) s = fschedule([C]) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: func = tvm.build( s, diff --git a/tests/python/topi/python/test_topi_conv2d_nhwc_winograd.py b/tests/python/topi/python/test_topi_conv2d_nhwc_winograd.py index 436270173316a..cb1fd3d233fa7 100644 --- a/tests/python/topi/python/test_topi_conv2d_nhwc_winograd.py +++ b/tests/python/topi/python/test_topi_conv2d_nhwc_winograd.py @@ -91,7 +91,7 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) print("Running on target: %s" % device) with tvm.target.Target(device): if bgemm == "direct": @@ -109,10 +109,10 @@ def check_device(device): C = topi.nn.relu(C) s = fschedule([C]) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: func = tvm.build( s, diff --git a/tests/python/topi/python/test_topi_conv2d_transpose_nchw.py b/tests/python/topi/python/test_topi_conv2d_transpose_nchw.py index 5cc2c2eb6f5d4..95159a05d54f8 100644 --- a/tests/python/topi/python/test_topi_conv2d_transpose_nchw.py +++ b/tests/python/topi/python/test_topi_conv2d_transpose_nchw.py @@ -62,7 +62,7 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() - def check(fcompute, fschedule, device, ctx): + def check(fcompute, fschedule, target, dev): B = fcompute( A, W, @@ -74,36 +74,36 @@ def check(fcompute, fschedule, device, ctx): C = topi.nn.relu(B) s1 = fschedule([B]) s2 = fschedule([C]) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) - func1 = tvm.build(s1, [A, W, B], device) - func2 = tvm.build(s2, [A, W, C], device) + func1 = tvm.build(s1, [A, W, B], target) + func2 = tvm.build(s2, [A, W, C], target) func1(a, w, b) func2(a, w, c) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) - def 
check_generic(device, ctx): - print("Running generic on target: %s" % device) - with tvm.target.Target(device): + def check_generic(target, dev): + print("Running generic on target: %s" % target) + with tvm.target.Target(target): fcompute, fschedule = _conv2d_transpose_nchw_implement["generic"] - check(fcompute, fschedule, device, ctx) + check(fcompute, fschedule, target, dev) check_generic("llvm", tvm.cpu(0)) - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): + def check_target(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): fcompute, fschedule = tvm.topi.testing.dispatch( - device, _conv2d_transpose_nchw_implement + target, _conv2d_transpose_nchw_implement ) - check(fcompute, fschedule, device, ctx) + check(fcompute, fschedule, target, dev) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) @tvm.testing.uses_gpu diff --git a/tests/python/topi/python/test_topi_conv2d_winograd.py b/tests/python/topi/python/test_topi_conv2d_winograd.py index 34febfd9460a4..c91447f1096f8 100644 --- a/tests/python/topi/python/test_topi_conv2d_winograd.py +++ b/tests/python/topi/python/test_topi_conv2d_winograd.py @@ -84,7 +84,7 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return @@ -98,10 +98,10 @@ def check_device(device): C = topi.nn.relu(C) s = fschedule([C]) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: func = tvm.build( s, @@ -205,15 +205,15 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() target = "llvm" - ctx = tvm.context(target) + dev = tvm.device(target) C = topi.nn.conv2d_winograd_nhwc(A, W, stride, padding, dilation, dtype) s = te.create_schedule([C.op]) - a = tvm.nd.array(a_np, ctx=ctx) - w = tvm.nd.array(w_np, ctx=ctx) - b = tvm.nd.array(b_np, ctx=ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx=ctx) + a = tvm.nd.array(a_np, device=dev) + w = tvm.nd.array(w_np, device=dev) + b = tvm.nd.array(b_np, device=dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), device=dev) func = tvm.build(s, [A, W, C], target=target) func(a, w, c) diff --git a/tests/python/topi/python/test_topi_conv3d_ncdhw.py b/tests/python/topi/python/test_topi_conv3d_ncdhw.py index 094a71074fa0f..e28aea5bc4f60 100644 --- a/tests/python/topi/python/test_topi_conv3d_ncdhw.py +++ b/tests/python/topi/python/test_topi_conv3d_ncdhw.py @@ -81,10 +81,10 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() - def check_device(device, ctx): - print("Running on target: %s" % device) - fcompute, fschedule = tvm.topi.testing.dispatch(device, _conv3d_ncdhw_implement) - with tvm.target.Target(device): + def check_target(target, dev): + print("Running on target: %s" % target) + fcompute, fschedule = tvm.topi.testing.dispatch(target, _conv3d_ncdhw_implement) + with tvm.target.Target(target): C = fcompute( A, W, (stride, stride, stride), padding, (dilation, dilation, dilation), dtype 
) @@ -94,15 +94,15 @@ def check_device(device, ctx): C = topi.nn.relu(C) s = fschedule([C]) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: func = tvm.build( s, [A, W, bias, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) @@ -111,16 +111,16 @@ def check_device(device, ctx): func = tvm.build( s, [A, W, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-4) - for device, ctx in tvm.testing.enabled_targets(): - with autotvm.tophub.context(device): # load tophub pre-tuned parameters - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + with autotvm.tophub.context(target): # load tophub pre-tuned parameters + check_target(target, dev) @tvm.testing.uses_gpu diff --git a/tests/python/topi/python/test_topi_conv3d_ndhwc.py b/tests/python/topi/python/test_topi_conv3d_ndhwc.py index 2d2541af5979c..e5791c3bb4829 100644 --- a/tests/python/topi/python/test_topi_conv3d_ndhwc.py +++ b/tests/python/topi/python/test_topi_conv3d_ndhwc.py @@ -64,22 +64,22 @@ def get_ref_data(): a_np, w_np, b_np = get_ref_data() - def check_device(device, ctx): - print("Running on target: %s" % device) - fcompute, fschedule = tvm.topi.testing.dispatch(device, _conv3d_ndhwc_implement) - with tvm.target.Target(device): + def check_target(target, dev): + print("Running on target: %s" % target) + fcompute, fschedule = tvm.topi.testing.dispatch(target, _conv3d_ndhwc_implement) + with tvm.target.Target(target): B = fcompute(A, W, stride, padding, dilation, dtype) s = fschedule([B]) - ctx = tvm.context(device, 0) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) - func = tvm.build(s, [A, W, B], device) + dev = tvm.device(target, 0) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev) + func = tvm.build(s, [A, W, B], target) func(a, w, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) @tvm.testing.uses_gpu diff --git a/tests/python/topi/python/test_topi_conv3d_ndhwc_tensorcore.py b/tests/python/topi/python/test_topi_conv3d_ndhwc_tensorcore.py index 1e027aba4cd3d..8340ff27fddfe 100644 --- a/tests/python/topi/python/test_topi_conv3d_ndhwc_tensorcore.py +++ b/tests/python/topi/python/test_topi_conv3d_ndhwc_tensorcore.py @@ -85,7 +85,7 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) print("Running on target: %s" % device) with tvm.target.Target(device): fcompute, fschedule = tvm.topi.testing.dispatch( @@ -98,10 +98,10 @@ def check_device(device): C = topi.nn.relu(C) s = fschedule([C]) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), 
dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: func = tvm.build( s, diff --git a/tests/python/topi/python/test_topi_conv3d_transpose_ncdhw.py b/tests/python/topi/python/test_topi_conv3d_transpose_ncdhw.py index 2ac7ccc708ec2..fadbc8d484038 100644 --- a/tests/python/topi/python/test_topi_conv3d_transpose_ncdhw.py +++ b/tests/python/topi/python/test_topi_conv3d_transpose_ncdhw.py @@ -61,11 +61,11 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): + def check_target(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): fcompute, fschedule = tvm.topi.testing.dispatch( - device, _conv3d_transpose_ncdhw_implement + target, _conv3d_transpose_ncdhw_implement ) B = fcompute( A, @@ -78,20 +78,20 @@ def check_device(device, ctx): C = topi.nn.relu(B) s1 = fschedule([B]) s2 = fschedule([C]) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) - func1 = tvm.build(s1, [A, W, B], device) - func2 = tvm.build(s2, [A, W, C], device) + func1 = tvm.build(s1, [A, W, B], target) + func2 = tvm.build(s2, [A, W, C], target) func1(a, w, b) func2(a, w, c) tvm.testing.assert_allclose(b.asnumpy(), b_np, atol=1e-4, rtol=1e-4) tvm.testing.assert_allclose(c.asnumpy(), c_np, atol=1e-4, rtol=1e-4) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) @tvm.testing.uses_gpu diff --git a/tests/python/topi/python/test_topi_conv3d_winograd.py b/tests/python/topi/python/test_topi_conv3d_winograd.py index d00249ba4392d..650ead3ff009d 100644 --- a/tests/python/topi/python/test_topi_conv3d_winograd.py +++ b/tests/python/topi/python/test_topi_conv3d_winograd.py @@ -82,7 +82,7 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return @@ -98,10 +98,10 @@ def check_device(device): C = topi.nn.relu(C) s = fschedule([C]) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: func = tvm.build( s, diff --git a/tests/python/topi/python/test_topi_correlation.py b/tests/python/topi/python/test_topi_correlation.py index 4709fb7d68f92..33df5a1a00f45 100644 --- a/tests/python/topi/python/test_topi_correlation.py +++ b/tests/python/topi/python/test_topi_correlation.py @@ -65,25 +65,25 @@ def get_ref_data(): a_np, b_np, c_np = get_ref_data() - def check_device(device, ctx): - print("Running on target: %s" % device) - fcompute, fschedule = tvm.topi.testing.dispatch(device, _correlation_implement) - with 
tvm.target.Target(device): + def check_device(target, dev): + print("Running on target: %s" % target) + fcompute, fschedule = tvm.topi.testing.dispatch(target, _correlation_implement) + with tvm.target.Target(target): C = fcompute( A, B, kernel_size, max_displacement, stride1, stride2, pad_size, is_multiply ) s = fschedule([C]) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.empty(c_np.shape, dtype=dtype, ctx=ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.empty(c_np.shape, dtype=dtype, device=dev) - func = tvm.build(s, [A, B, C], device) + func = tvm.build(s, [A, B, C], target) func(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_device(target, dev) @tvm.testing.uses_gpu diff --git a/tests/python/topi/python/test_topi_deformable_conv2d.py b/tests/python/topi/python/test_topi_deformable_conv2d.py index cd6f33f14fd74..20df09f30d12e 100644 --- a/tests/python/topi/python/test_topi_deformable_conv2d.py +++ b/tests/python/topi/python/test_topi_deformable_conv2d.py @@ -93,7 +93,7 @@ def get_ref_data(): a_np, offset_np, w_np, c_np = get_ref_data() def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return @@ -103,10 +103,10 @@ def check_device(device): C = fcompute(A, Offset, W, stride, padding, dilation, deformable_groups, groups, dtype) s = fschedule([C]) - a = tvm.nd.array(a_np, ctx) - offset = tvm.nd.array(offset_np, ctx) - w = tvm.nd.array(w_np, ctx) - c = tvm.nd.empty(c_np.shape, dtype=c_np.dtype, ctx=ctx) + a = tvm.nd.array(a_np, dev) + offset = tvm.nd.array(offset_np, dev) + w = tvm.nd.array(w_np, dev) + c = tvm.nd.empty(c_np.shape, dtype=c_np.dtype, device=dev) func = tvm.build(s, [A, Offset, W, C], device) func(a, offset, w, c) @@ -173,7 +173,7 @@ def get_ref_data(): a_np, offset_np, w_np, c_np = get_ref_data() def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return @@ -183,10 +183,10 @@ def check_device(device): C = fcompute(A, Offset, W, stride, padding, dilation, deformable_groups, groups, dtype) s = fschedule([C]) - a = tvm.nd.array(a_np, ctx) - offset = tvm.nd.array(offset_np, ctx) - w = tvm.nd.array(w_np, ctx) - c = tvm.nd.empty(c_np.shape, dtype=c_np.dtype, ctx=ctx) + a = tvm.nd.array(a_np, dev) + offset = tvm.nd.array(offset_np, dev) + w = tvm.nd.array(w_np, dev) + c = tvm.nd.empty(c_np.shape, dtype=c_np.dtype, device=dev) func = tvm.build(s, [A, Offset, W, C], device) func(a, offset, w, c) diff --git a/tests/python/topi/python/test_topi_dense.py b/tests/python/topi/python/test_topi_dense.py index 95ebce43497bf..fa966b6f00e52 100644 --- a/tests/python/topi/python/test_topi_dense.py +++ b/tests/python/topi/python/test_topi_dense.py @@ -64,23 +64,23 @@ def get_ref_data(): # get the test data a_np, b_np, c_np, d_np = get_ref_data() - def check_device(device, ctx): + def check_device(device, dev): print("Running on target: %s" % device) for fcompute, fschedule in tvm.topi.testing.dispatch(device, _dense_implement): with tvm.target.Target(device): D = fcompute(A, B, C if use_bias else None) D = topi.nn.relu(D) s = fschedule([D]) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(c_np, ctx) - d 
= tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(c_np, dev) + d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), dev) f = tvm.build(s, [A, B, C, D], device, name="dense") f(a, b, c, d) tvm.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-5) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for device, dev in tvm.testing.enabled_targets(): + check_device(device, dev) def verify_dense_int8(batch, in_dim, out_dim, use_bias=True): @@ -106,8 +106,8 @@ def get_ref_data(): a_np, b_np, c_np, d_np = get_ref_data() def check_device(device): - ctx = tvm.context(device, 0) - if device == "cuda" and not tvm.contrib.nvcc.have_int8(ctx.compute_version): + dev = tvm.device(device, 0) + if device == "cuda" and not tvm.contrib.nvcc.have_int8(dev.compute_version): print("Skip because int8 intrinsics are not available") return @@ -116,10 +116,10 @@ def check_device(device): D = topi.cuda.dense_int8(A, B, C if use_bias else None, out_dtype) D = topi.nn.relu(D) s = topi.cuda.schedule_dense_int8([D]) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(c_np, ctx) - d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=out_dtype), ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(c_np, dev) + d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=out_dtype), dev) f = tvm.build(s, [A, B, C, D], device, name="dense") f(a, b, c, d) tvm.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-5) diff --git a/tests/python/topi/python/test_topi_dense_tensorcore.py b/tests/python/topi/python/test_topi_dense_tensorcore.py index 3ffdea50d660a..4ea3202569cb0 100644 --- a/tests/python/topi/python/test_topi_dense_tensorcore.py +++ b/tests/python/topi/python/test_topi_dense_tensorcore.py @@ -52,17 +52,17 @@ def get_ref_data(): a_np, b_np, c_np, d_np = get_ref_data() def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) print("Running on target: %s" % device) for fcompute, fschedule in tvm.topi.testing.dispatch(device, _dense_implement): with tvm.target.Target(device): D = fcompute(A, B, C if use_bias else None) D = topi.nn.relu(D) s = fschedule([D]) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(c_np, ctx) - d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(c_np, dev) + d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), dev) f = tvm.build(s, [A, B, C, D], device, name="dense") f(a, b, c, d) tvm.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-3) diff --git a/tests/python/topi/python/test_topi_depth_to_space.py b/tests/python/topi/python/test_topi_depth_to_space.py index cb16f9bf438f8..93731868e922b 100644 --- a/tests/python/topi/python/test_topi_depth_to_space.py +++ b/tests/python/topi/python/test_topi_depth_to_space.py @@ -51,18 +51,18 @@ def verify_depth_to_space( a_np = np.transpose(a_np, axes=[0, 2, 3, 1]) b_np = np.transpose(b_np, axes=[0, 2, 3, 1]) - def check_device(device, ctx): + def check_device(device, dev): print("Running on target: %s" % device) with tvm.target.Target(device): s = tvm.topi.testing.get_injective_schedule(device)(B) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), dev) f = tvm.build(s, [A, B], 
device)
         f(a, b)
         tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3, atol=1e-3)
 
-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for device, dev in tvm.testing.enabled_targets():
+        check_device(device, dev)
 
 
 @tvm.testing.uses_gpu
diff --git a/tests/python/topi/python/test_topi_depthwise_conv2d.py b/tests/python/topi/python/test_topi_depthwise_conv2d.py
index 804c486d27d72..91ee1ee02fe43 100644
--- a/tests/python/topi/python/test_topi_depthwise_conv2d.py
+++ b/tests/python/topi/python/test_topi_depthwise_conv2d.py
@@ -78,16 +78,16 @@ def compile_depthwise_NHWC_int8_arm(
     bias = te.placeholder((in_channel * depth_multiplier,), name="bias", dtype="int32")
     dtype = "int32"
 
-    device = "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu"
+    target = "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu"
     compute = topi.arm_cpu.compute_depthwise_conv2d_nhwc
     schedule = topi.arm_cpu.schedule_depthwise_conv2d_nhwc
 
-    if not tvm.testing.device_enabled(device):
-        print("Skip because %s is not enabled" % device)
+    if not tvm.testing.device_enabled(target):
+        print("Skip because %s is not enabled" % target)
         return
 
-    print("Compiling on arm AArch64 target: %s" % device)
-    with tvm.target.Target(device):
+    print("Compiling on arm AArch64 target: %s" % target)
+    with tvm.target.Target(target):
         assert topi.arm_cpu.arm_utils.is_aarch64_arm(), "AArch64 target not recognized"
 
         C = compute(A, W, (stride, stride), padding, (dilation, dilation), dtype)
@@ -102,7 +102,7 @@ def compile_depthwise_NHWC_int8_arm(
         func = tvm.build(
             s,
             ins_outs,
-            device,
+            target,
             name="depthwise_conv2d",
         )
 
@@ -133,17 +133,17 @@ def depthwise_conv2d_with_workload_nchw(
 
     dtype = "float32"
 
-    def check_device(device, ctx):
-        print("Running on target: %s" % device)
+    def check_target(target, dev):
+        print("Running on target: %s" % target)
 
-        impl_list = tvm.topi.testing.dispatch(device, _depthwise_conv2d_nchw_implement)[:]
-        if device == "llvm" and channel_multiplier == 1 and dilation == 1:
+        impl_list = tvm.topi.testing.dispatch(target, _depthwise_conv2d_nchw_implement)[:]
+        if target == "llvm" and channel_multiplier == 1 and dilation == 1:
             impl_list.append(
                 (topi.x86.depthwise_conv2d_nchw, topi.x86.schedule_depthwise_conv2d_nchw)
             )
 
         for fcompute, fschedule in impl_list:
-            with tvm.target.Target(device):
+            with tvm.target.Target(target):
                 # declare
                 DepthwiseConv2d = fcompute(
                     Input, Filter, (stride_h, stride_w), padding_args, dilation, dtype
@@ -155,9 +155,9 @@ def check_device(device, ctx):
                 s2 = fschedule(ScaleShift)
                 s3 = fschedule(Relu)
                 # build the kernels
-                f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device)
-                f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device)
-                f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device)
+                f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], target)
+                f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], target)
+                f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], target)
 
                 # Prepare pod type for test data closure
                 input_shape = get_const_tuple(Input.shape)
@@ -214,38 +214,38 @@ def verify_workload_padding():
                 )
 
                 # check if tile_ow candidates are the factors of the right output width.
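The hunks above and below repeat one mechanical migration: the local device string becomes target, the runtime handle ctx becomes dev, and tvm.context becomes tvm.device. A minimal sketch of the resulting test pattern, assuming only the post-rename API (tvm.device, tvm.testing.enabled_targets); the check_target helper and its toy compute below are illustrative, not taken from the patch:

    import numpy as np
    import tvm
    import tvm.testing
    from tvm import te

    def check_target(target, dev):
        # target is a compilation target string such as "llvm"; dev is the
        # runtime device it maps to, i.e. tvm.device(target, 0).
        A = te.placeholder((16,), name="A", dtype="float32")
        B = te.compute((16,), lambda i: A[i] + 1.0, name="B")
        s = te.create_schedule(B.op)
        f = tvm.build(s, [A, B], target)  # compile against the target string
        a = tvm.nd.array(np.zeros(16, dtype="float32"), dev)  # allocate on the device
        b = tvm.nd.array(np.zeros(16, dtype="float32"), dev)
        f(a, b)
        tvm.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1.0)

    for target, dev in tvm.testing.enabled_targets():
        check_target(target, dev)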
- with tvm.target.Target(device): + with tvm.target.Target(target): cfg = autotvm.get_config() _fallback_schedule(cfg, wkl) ow_tile = np.prod(cfg["tile_ow"].size) tvm.testing.assert_allclose(ow_tile, out_width) - if "llvm" in device: + if "llvm" in target: verify_workload_padding() - input_tvm = tvm.nd.array(input_np, ctx) - filter_tvm = tvm.nd.array(filter_np, ctx) - scale_tvm = tvm.nd.array(scale_np, ctx) - shift_tvm = tvm.nd.array(shift_np, ctx) + input_tvm = tvm.nd.array(input_np, dev) + filter_tvm = tvm.nd.array(filter_np, dev) + scale_tvm = tvm.nd.array(scale_np, dev) + shift_tvm = tvm.nd.array(shift_np, dev) depthwise_conv2d_tvm = tvm.nd.array( np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape), dtype=DepthwiseConv2d.dtype), - ctx, + dev, ) scale_shift_tvm = tvm.nd.array( - np.zeros(shape=get_const_tuple(ScaleShift.shape), dtype=ScaleShift.dtype), ctx + np.zeros(shape=get_const_tuple(ScaleShift.shape), dtype=ScaleShift.dtype), dev ) relu_tvm = tvm.nd.array( - np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx + np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), dev ) # launch kernel 1 (depthwise_conv2d) - timer_1 = f1.time_evaluator(f1.entry_name, ctx, number=1) + timer_1 = f1.time_evaluator(f1.entry_name, dev, number=1) tcost_1 = timer_1(input_tvm, filter_tvm, depthwise_conv2d_tvm).mean # launch kernel 2 (depthwise_conv2d + scale_shift) - timer_2 = f2.time_evaluator(f2.entry_name, ctx, number=1) + timer_2 = f2.time_evaluator(f2.entry_name, dev, number=1) tcost_2 = timer_2(input_tvm, filter_tvm, scale_tvm, shift_tvm, scale_shift_tvm).mean # launch kernel 3 (depthwise_conv2d + scale_shift + relu) - timer_3 = f3.time_evaluator(f3.entry_name, ctx, number=1) + timer_3 = f3.time_evaluator(f3.entry_name, dev, number=1) tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm, relu_tvm).mean tvm.testing.assert_allclose( depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5 @@ -253,9 +253,9 @@ def verify_workload_padding(): tvm.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5) tvm.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5) - for device, ctx in tvm.testing.enabled_targets(): - with autotvm.tophub.context(device): # load tophub pre-tuned parameters - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + with autotvm.tophub.context(target): # load tophub pre-tuned parameters + check_target(target, dev) def depthwise_conv2d_with_workload_nhwc( @@ -284,11 +284,11 @@ def depthwise_conv2d_with_workload_nhwc( dtype = "float32" - def check_device(device, ctx): - print("Running on target: %s" % device) + def check_target(target, dev): + print("Running on target: %s" % target) - fcompute, fschedule = tvm.topi.testing.dispatch(device, _depthwise_conv2d_nhwc_implement) - with tvm.target.Target(device): + fcompute, fschedule = tvm.topi.testing.dispatch(target, _depthwise_conv2d_nhwc_implement) + with tvm.target.Target(target): # declare DepthwiseConv2d = fcompute( Input, Filter, (stride_h, stride_w), padding_args, dilation, dtype @@ -300,9 +300,9 @@ def check_device(device, ctx): s2 = fschedule(ScaleShift) s3 = fschedule(Relu) # build the kernels - f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device) - f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device) - f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device) + f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], target) + f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], target) + 
f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], target) # Prepare pod type for test data closure input_shape = get_const_tuple(Input.shape) @@ -353,25 +353,25 @@ def get_ref_data(): ) = get_ref_data() # prepare data - input_tvm = tvm.nd.array(input_np, ctx) - filter_tvm = tvm.nd.array(filter_np, ctx) - scale_tvm = tvm.nd.array(scale_np, ctx) - shift_tvm = tvm.nd.array(shift_np, ctx) + input_tvm = tvm.nd.array(input_np, dev) + filter_tvm = tvm.nd.array(filter_np, dev) + scale_tvm = tvm.nd.array(scale_np, dev) + shift_tvm = tvm.nd.array(shift_np, dev) depthwise_conv2d_tvm = tvm.nd.array( - np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape), dtype=DepthwiseConv2d.dtype), ctx + np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape), dtype=DepthwiseConv2d.dtype), dev ) scale_shift_tvm = tvm.nd.array( - np.zeros(shape=get_const_tuple(ScaleShift.shape), dtype=ScaleShift.dtype), ctx + np.zeros(shape=get_const_tuple(ScaleShift.shape), dtype=ScaleShift.dtype), dev ) - relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx) + relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), dev) # launch kernel 1 (depthwise_conv2d) - timer_1 = f1.time_evaluator(f1.entry_name, ctx, number=1) + timer_1 = f1.time_evaluator(f1.entry_name, dev, number=1) tcost_1 = timer_1(input_tvm, filter_tvm, depthwise_conv2d_tvm).mean # launch kernel 2 (depthwise_conv2d + scale_shift) - timer_2 = f2.time_evaluator(f2.entry_name, ctx, number=1) + timer_2 = f2.time_evaluator(f2.entry_name, dev, number=1) tcost_2 = timer_2(input_tvm, filter_tvm, scale_tvm, shift_tvm, scale_shift_tvm).mean # launch kernel 3 (depthwise_conv2d + scale_shift + relu) - timer_3 = f3.time_evaluator(f3.entry_name, ctx, number=1) + timer_3 = f3.time_evaluator(f3.entry_name, dev, number=1) tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm, relu_tvm).mean relu_scipy = np.maximum(scale_shift_scipy, 0) tvm.testing.assert_allclose( @@ -380,9 +380,9 @@ def get_ref_data(): tvm.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5) tvm.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5) - for device, ctx in tvm.testing.enabled_targets(): - with autotvm.tophub.context(device): # load tophub pre-tuned parameters - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + with autotvm.tophub.context(target): # load tophub pre-tuned parameters + check_target(target, dev) def _transform_data(data, bn): @@ -444,13 +444,13 @@ def depthwise_conv2d_with_workload_NCHWc( out_layout = "NCHW%dc" % oc_block dtype = "float32" - def check_device(device): - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) + def check_target(target): + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) return - print("Running on target: %s" % device) - with tvm.target.Target(device): + print("Running on target: %s" % target) + with tvm.target.Target(target): # declare DepthwiseConv2d = topi.x86.depthwise_conv2d_NCHWc( Input, @@ -468,8 +468,8 @@ def check_device(device): s1 = topi.x86.schedule_depthwise_conv2d_NCHWc(DepthwiseConv2d) s2 = topi.x86.schedule_depthwise_conv2d_NCHWc(Relu) # build the kernels - f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device) - f2 = tvm.build(s2, [Input, Filter, Relu], device) + f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], target) + f2 = tvm.build(s2, 
[Input, Filter, Relu], target)
 
         # Prepare pod type for test data closure
         input_shape = (batch, in_channel, in_height, in_width)
@@ -498,13 +498,13 @@ def get_ref_data():
         # Get the test data
         (input_np, filter_np, depthwise_conv2d_scipy, relu_scipy) = get_ref_data()
 
-        input_tvm = tvm.nd.array(input_np, ctx)
-        filter_tvm = tvm.nd.array(filter_np, ctx)
+        input_tvm = tvm.nd.array(input_np, dev)
+        filter_tvm = tvm.nd.array(filter_np, dev)
 
         depthwise_conv2d_tvm = tvm.nd.array(
-            np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape), dtype=DepthwiseConv2d.dtype), ctx
+            np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape), dtype=DepthwiseConv2d.dtype), dev
         )
-        relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx)
+        relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), dev)
         # launch kernel 1 (depthwise_conv2d)
         f1(input_tvm, filter_tvm, depthwise_conv2d_tvm)
         # launch kernel 2 (depthwise_conv2d + relu)
@@ -515,9 +515,9 @@ def get_ref_data():
         tvm.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
 
     # test llvm only for now since the depthwise_conv2d_NCHWc implementation is missing in other backends.
-    for device in ["llvm"]:
-        with autotvm.tophub.context(device):  # load tophub pre-tuned parameters
-            check_device(device)
+    for target in ["llvm"]:
+        with autotvm.tophub.context(target):  # load tophub pre-tuned parameters
+            check_target(target)
 
 
 @tvm.testing.uses_gpu
@@ -556,7 +556,7 @@ def test_depthwise_conv2d():
     depthwise_conv2d_with_workload_NCHWc(1, 728, 32, 1, 3, 1, "SAME")
     depthwise_conv2d_with_workload_NCHWc(1, 728, 32, 1, 3, 1, "VALID")
 
-    # Test compilation on arm devices
+    # Test compilation on arm targets
    compile_depthwise_NHWC_int8_arm(1, 728, 32, 1, 3, 1, "SAME")
    compile_depthwise_NHWC_int8_arm(1, 728, 32, 1, 1, 1, "SAME", True)
 
diff --git a/tests/python/topi/python/test_topi_depthwise_conv2d_back_input.py b/tests/python/topi/python/test_topi_depthwise_conv2d_back_input.py
index 72ad1e29004a6..d8c8f0e195c48 100644
--- a/tests/python/topi/python/test_topi_depthwise_conv2d_back_input.py
+++ b/tests/python/topi/python/test_topi_depthwise_conv2d_back_input.py
@@ -59,7 +59,7 @@ def verify_depthwise_conv2d_back_input(
     schedule = schedule_depthwise_conv2d_backward_input_nhwc(In_grad)
 
     def check_device(device):
-        ctx = tvm.context(device, 0)
+        dev = tvm.device(device, 0)
         if not tvm.testing.device_enabled(device):
             print("Skip because %s is not enabled" % device)
             return
@@ -117,11 +117,11 @@ def get_ref_data():
 
         (out_grad_np, filter_np, in_grad_np) = get_ref_data()
 
-        out_grad_tvm = tvm.nd.array(out_grad_np, ctx)
-        filter_tvm = tvm.nd.array(filter_np, ctx)
-        in_grad_tvm = tvm.nd.array(np.zeros(shape=ishape, dtype=dtype), ctx)
+        out_grad_tvm = tvm.nd.array(out_grad_np, dev)
+        filter_tvm = tvm.nd.array(filter_np, dev)
+        in_grad_tvm = tvm.nd.array(np.zeros(shape=ishape, dtype=dtype), dev)
         # launch the kernel
-        timer = f.time_evaluator(f.entry_name, ctx, number=1)
+        timer = f.time_evaluator(f.entry_name, dev, number=1)
         tcost = timer(filter_tvm, out_grad_tvm, in_grad_tvm).mean
         tvm.testing.assert_allclose(in_grad_np, in_grad_tvm.asnumpy(), rtol=1e-5)
 
diff --git a/tests/python/topi/python/test_topi_depthwise_conv2d_back_weight.py b/tests/python/topi/python/test_topi_depthwise_conv2d_back_weight.py
index 53328113aa719..daf7b5c82d417 100644
--- a/tests/python/topi/python/test_topi_depthwise_conv2d_back_weight.py
+++ b/tests/python/topi/python/test_topi_depthwise_conv2d_back_weight.py
@@ -54,7 +54,7 @@ def verify_depthwise_conv2d_back_weight(
schedule = schedule_depthwise_conv2d_backward_weight_nhwc(Weight_grad) def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return @@ -101,11 +101,11 @@ def get_ref_data(): (out_grad_np, input_np, weight_grad_np) = get_ref_data() - out_grad_tvm = tvm.nd.array(out_grad_np, ctx) - input_tvm = tvm.nd.array(input_np, ctx) - weight_grad_tvm = tvm.nd.array(np.zeros(shape=fshape, dtype=dtype), ctx) + out_grad_tvm = tvm.nd.array(out_grad_np, dev) + input_tvm = tvm.nd.array(input_np, dev) + weight_grad_tvm = tvm.nd.array(np.zeros(shape=fshape, dtype=dtype), dev) # launch the kernel - timer = f.time_evaluator(f.entry_name, ctx, number=1) + timer = f.time_evaluator(f.entry_name, dev, number=1) tcost = timer(input_tvm, out_grad_tvm, weight_grad_tvm).mean tvm.testing.assert_allclose(weight_grad_np, weight_grad_tvm.asnumpy(), rtol=1e-4) diff --git a/tests/python/topi/python/test_topi_dilate.py b/tests/python/topi/python/test_topi_dilate.py index 27e71735c565c..c09bcc0deaa63 100644 --- a/tests/python/topi/python/test_topi_dilate.py +++ b/tests/python/topi/python/test_topi_dilate.py @@ -24,7 +24,7 @@ def test_dilate(): target = "llvm" - ctx = tvm.cpu(0) + dev = tvm.cpu(0) def _test_dilate(input_size, strides, dilation_value=None): Input = te.placeholder((input_size)) @@ -38,9 +38,9 @@ def _test_dilate(input_size, strides, dilation_value=None): output_np = tvm.topi.testing.dilate_python(input_np, strides) else: output_np = tvm.topi.testing.dilate_python(input_np, strides, dilation_value) - input_tvm = tvm.nd.array(input_np, ctx=ctx) + input_tvm = tvm.nd.array(input_np, device=dev) output_size = topi.utils.get_const_tuple(Output.shape) - output_tvm = tvm.nd.array(np.zeros(shape=output_size).astype(Output.dtype), ctx=ctx) + output_tvm = tvm.nd.array(np.zeros(shape=output_size).astype(Output.dtype), device=dev) f = tvm.build(schedule, [Input, Output], target) f(input_tvm, output_tvm) tvm.testing.assert_allclose(output_tvm.asnumpy(), output_np, rtol=1e-5) diff --git a/tests/python/topi/python/test_topi_einsum.py b/tests/python/topi/python/test_topi_einsum.py index 49e951398f404..35de9306deaf8 100644 --- a/tests/python/topi/python/test_topi_einsum.py +++ b/tests/python/topi/python/test_topi_einsum.py @@ -26,15 +26,15 @@ def with_tvm(lam, *args): """Take numpy arrays as args, convert them to TVM tensors and call `lam`. Result of lambda is converted back to numpy array and returned. 
""" - ctx = tvm.cpu(0) + dev = tvm.cpu(0) pls = [] # placeholders vals_nd = [] # initial values for i, arg in enumerate(args): pls.append(te.placeholder(arg.shape, name="pl" + str(i))) - vals_nd.append(tvm.nd.array(arg, ctx)) + vals_nd.append(tvm.nd.array(arg, dev)) out = lam(*pls) - out_nd = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=out.dtype), ctx) + out_nd = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=out.dtype), dev) s = te.create_schedule([out.op]) m = tvm.build(s, pls + [out], "llvm") m(*(vals_nd + [out_nd])) diff --git a/tests/python/topi/python/test_topi_group_conv2d.py b/tests/python/topi/python/test_topi_group_conv2d.py index 9c4da5c2c8499..7f4803b9a8cd8 100644 --- a/tests/python/topi/python/test_topi_group_conv2d.py +++ b/tests/python/topi/python/test_topi_group_conv2d.py @@ -89,15 +89,15 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() - def check_device(device): - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) + def check_target(target): + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) return - print("Running on target: %s" % device) - with tvm.target.Target(device): - fcompute, fschedule = tvm.topi.testing.dispatch(device, _group_conv2d_nchw_implement) + print("Running on target: %s" % target) + with tvm.target.Target(target): + fcompute, fschedule = tvm.topi.testing.dispatch(target, _group_conv2d_nchw_implement) C = fcompute(A, W, stride, padding, dilation, groups, dtype) if add_bias: C = topi.add(C, bias) @@ -105,15 +105,15 @@ def check_device(device): C = topi.nn.relu(C) s = fschedule([C]) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: func = tvm.build( s, [A, W, bias, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" % ( batch, @@ -132,7 +132,7 @@ def check_device(device): func = tvm.build( s, [A, W, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" % ( batch, @@ -149,8 +149,8 @@ def check_device(device): func(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) - for device in ["llvm", "cuda"]: - check_device(device) + for target in ["llvm", "cuda"]: + check_target(target) oc_block_factor = 4 @@ -213,17 +213,17 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() - def check_device(device): - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) + def check_target(target): + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) return - if device == "cuda" and not tvm.contrib.nvcc.have_int8(ctx.compute_version): + if target == "cuda" and not tvm.contrib.nvcc.have_int8(dev.compute_version): print("Skip because int8 intrinsics are not available") return - print("Running on target: %s" % device) - with tvm.target.Target(device): + print("Running on target: %s" % target) + with tvm.target.Target(target): C = topi.cuda.group_conv2d_NCHWc_int8(A, W, stride, padding, dilation, groups, dtype) if add_bias: C = topi.add(C, bias) @@ -231,15 +231,15 @@ def check_device(device): C = topi.nn.relu(C) s = 
topi.cuda.schedule_group_conv2d_NCHWc_int8([C]) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: func = tvm.build( s, [A, W, bias, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" % ( batch, @@ -258,7 +258,7 @@ def check_device(device): func = tvm.build( s, [A, W, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" % ( batch, @@ -275,8 +275,8 @@ def check_device(device): func(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) - for device in ["cuda"]: - check_device(device) + for target in ["cuda"]: + check_target(target) def verify_group_conv2d_nhwc( @@ -328,15 +328,15 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() - def check_device(device): - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) + def check_target(target): + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) return - print("Running on target: %s" % device) - with tvm.target.Target(device): - fcompute, fschedule = tvm.topi.testing.dispatch(device, _group_conv2d_nhwc_implement) + print("Running on target: %s" % target) + with tvm.target.Target(target): + fcompute, fschedule = tvm.topi.testing.dispatch(target, _group_conv2d_nhwc_implement) C = fcompute(A, W, stride, padding, dilation, groups, dtype) if add_bias: C = topi.add(C, bias) @@ -344,15 +344,15 @@ def check_device(device): C = topi.nn.relu(C) s = fschedule([C]) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: func = tvm.build( s, [A, W, bias, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" % ( batch, @@ -371,7 +371,7 @@ def check_device(device): func = tvm.build( s, [A, W, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" % ( batch, @@ -388,8 +388,8 @@ def check_device(device): func(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) - for device in ["llvm"]: - check_device(device) + for target in ["llvm"]: + check_target(target) @tvm.testing.uses_gpu diff --git a/tests/python/topi/python/test_topi_group_conv2d_NCHWc_int8.py b/tests/python/topi/python/test_topi_group_conv2d_NCHWc_int8.py index b6cef2e976620..e69a3094ac4de 100644 --- a/tests/python/topi/python/test_topi_group_conv2d_NCHWc_int8.py +++ b/tests/python/topi/python/test_topi_group_conv2d_NCHWc_int8.py @@ -113,8 +113,8 @@ def get_ref_data(): a_np, w_np, c_np = get_ref_data() def check_device(device): - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(ctx): + dev = tvm.device(device, 0) + if not tvm.testing.device_enabled(dev): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) @@ -131,9 +131,9 @@ def check_device(device): ) s = topi.x86.schedule_conv2d_NCHWc([C]) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + w 
= tvm.nd.array(w_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) func = tvm.build( s, [A, W, C], diff --git a/tests/python/topi/python/test_topi_image.py b/tests/python/topi/python/test_topi_image.py index c605df7037e40..b766e599c6794 100644 --- a/tests/python/topi/python/test_topi_image.py +++ b/tests/python/topi/python/test_topi_image.py @@ -66,19 +66,19 @@ def verify_resize( scale_w = out_width / in_width b_np = tvm.topi.testing.upsampling_python(a_np, (scale_h, scale_w), layout) - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_injective_schedule(device)(B) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), ctx) - f = tvm.build(s, [A, B], device) + def check_target(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): + s = tvm.topi.testing.get_injective_schedule(target)(B) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), dev) + f = tvm.build(s, [A, B], target) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3, atol=1e-3) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) @tvm.testing.uses_gpu @@ -153,19 +153,19 @@ def verify_resize3d( scale_w = out_width / in_width b_np = tvm.topi.testing.upsampling3d_python(a_np, (scale_d, scale_h, scale_w), layout) - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_injective_schedule(device)(B) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), ctx) - f = tvm.build(s, [A, B], device) + def check_target(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): + s = tvm.topi.testing.get_injective_schedule(target)(B) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), dev) + f = tvm.build(s, [A, B], target) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3, atol=1e-3) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) @tvm.testing.uses_gpu @@ -225,21 +225,21 @@ def verify_crop_and_resize( np_images, np_boxes, np_box_indices, np_crop_size, layout, method, extrapolation_value ) - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_injective_schedule(device)(out) - tvm_images = tvm.nd.array(np_images, ctx) - tvm_boxes = tvm.nd.array(np_boxes, ctx) - tvm_indices = tvm.nd.array(np_box_indices, ctx) - tvm_out = tvm.nd.array(np.zeros(out_shape, dtype="float32"), ctx) - f = tvm.build(s, [images, boxes, box_ind, out], device, name="crop_and_resize") + def check_target(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): + s = tvm.topi.testing.get_injective_schedule(target)(out) + tvm_images = tvm.nd.array(np_images, dev) + tvm_boxes = tvm.nd.array(np_boxes, dev) + tvm_indices = tvm.nd.array(np_box_indices, dev) + tvm_out = tvm.nd.array(np.zeros(out_shape, dtype="float32"), dev) + f = tvm.build(s, [images, boxes, box_ind, out], target, name="crop_and_resize") f(tvm_images, tvm_boxes, tvm_indices, tvm_out) tvm.testing.assert_allclose(tvm_out.asnumpy(), baseline_np, rtol=1e-3, atol=1e-3) - for device, ctx 
in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) boxes_1 = np.array([[0.2, 0.3, 0.7, 0.9]], dtype="float32") boxes_2 = np.array([[0.2, 0.3, 0.7, 0.9], [0, 0.1, 0.8, 1]], dtype="float32") @@ -272,19 +272,19 @@ def get_ref_data(): data_np, out_np = get_ref_data() - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_injective_schedule(device)(out) - tvm_data = tvm.nd.array(data_np, ctx) - tvm_out = tvm.nd.empty(out_np.shape, dtype, ctx) - f = tvm.build(s, [data, out], device) + def check_target(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): + s = tvm.topi.testing.get_injective_schedule(target)(out) + tvm_data = tvm.nd.array(data_np, dev) + tvm_out = tvm.nd.empty(out_np.shape, dtype, dev) + f = tvm.build(s, [data, out], target) f(tvm_data, tvm_out) tvm.testing.assert_allclose(tvm_out.asnumpy(), out_np, rtol=1e-5, atol=1e-5) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) verify_affine_grid(1, (16, 32)) verify_affine_grid(4, (16, 32)) @@ -308,20 +308,20 @@ def get_ref_data(): data_np, grid_np, out_np = get_ref_data() - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_injective_schedule(device)(out) - tvm_data = tvm.nd.array(data_np, ctx) - tvm_grid = tvm.nd.array(grid_np, ctx) - tvm_out = tvm.nd.empty(out_np.shape, dtype, ctx) - f = tvm.build(s, [data, grid, out], device) + def check_target(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): + s = tvm.topi.testing.get_injective_schedule(target)(out) + tvm_data = tvm.nd.array(data_np, dev) + tvm_grid = tvm.nd.array(grid_np, dev) + tvm_out = tvm.nd.empty(out_np.shape, dtype, dev) + f = tvm.build(s, [data, grid, out], target) f(tvm_data, tvm_grid, tvm_out) tvm.testing.assert_allclose(tvm_out.asnumpy(), out_np, rtol=1e-5, atol=1e-5) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) verify_grid_sample((4, 4, 16, 32), (4, 2, 8, 8)) verify_grid_sample((4, 4, 16, 32), (4, 2, 32, 32)) diff --git a/tests/python/topi/python/test_topi_lrn.py b/tests/python/topi/python/test_topi_lrn.py index 2789264799771..203680b147816 100644 --- a/tests/python/topi/python/test_topi_lrn.py +++ b/tests/python/topi/python/test_topi_lrn.py @@ -50,9 +50,9 @@ def check_device(device): with tvm.target.Target(device): s_func = tvm.topi.testing.dispatch(device, _lrn_schedule) s = s_func([B]) - ctx = tvm.context(device, 0) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx) + dev = tvm.device(device, 0) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), dev) f = tvm.build(s, [A, B], device) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) diff --git a/tests/python/topi/python/test_topi_math.py b/tests/python/topi/python/test_topi_math.py index 74575ddba1c38..e2d978190877e 100644 --- a/tests/python/topi/python/test_topi_math.py +++ b/tests/python/topi/python/test_topi_math.py @@ -58,18 +58,18 @@ def test_apply( a_np += ((np.abs(np.fmod(a_np, 1)) - 0.5) < 1e-6) * 1e-4 b_np = f_numpy(a_np) - def check_device(device, 
ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_injective_schedule(device)(B) - foo = tvm.build(s, [A, B], device, name=name) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros_like(b_np), ctx) + def check_target(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): + s = tvm.topi.testing.get_injective_schedule(target)(B) + foo = tvm.build(s, [A, B], target, name=name) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros_like(b_np), dev) foo(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5, atol=1e-5) - for target, ctx in tvm.testing.enabled_targets(): - check_device(target, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) def test_isnan( low, @@ -94,18 +94,18 @@ def test_isnan( a_np += ((np.abs(np.fmod(a_np, 1)) - 0.5) < 1e-6) * 1e-5 b_np = np.isnan(a_np) - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_injective_schedule(device)(B) - foo = tvm.build(s, [A, B], device, name="isnan") - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros_like(b_np), ctx) + def check_target(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): + s = tvm.topi.testing.get_injective_schedule(target)(B) + foo = tvm.build(s, [A, B], target, name="isnan") + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros_like(b_np), dev) foo(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5, atol=1e-5) - for target, ctx in tvm.testing.enabled_targets(): - check_device(target, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) def test_infiniteness_ops(topi_op, ref_op, name): for dtype in ["float32", "float64", "int32", "int16"]: @@ -125,17 +125,17 @@ def test_infiniteness_ops(topi_op, ref_op, name): ] = np.nan b_np = ref_op(a_np) - def check_device(device, ctx): - with tvm.target.Target(device): - s = tvm.topi.testing.get_injective_schedule(device)(B) - foo = tvm.build(s, [A, B], device, name=name) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros_like(b_np), ctx) + def check_target(target, dev): + with tvm.target.Target(target): + s = tvm.topi.testing.get_injective_schedule(target)(B) + foo = tvm.build(s, [A, B], target, name=name) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros_like(b_np), dev) foo(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5, atol=1e-5) - for target, ctx in tvm.testing.enabled_targets(): - check_device(target, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) test_apply(topi.floor, "floor", np.floor, -100, 100) test_apply(topi.ceil, "ceil", np.ceil, -100, 100) @@ -177,13 +177,13 @@ def verify(from_dtype, to_dtype, low=-100, high=100): a_np = a_np - a_np[2, 3] b_np = a_np.astype(to_dtype) - for device, ctx in tvm.testing.enabled_targets(): - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_injective_schedule(device)(B) - foo = tvm.build(s, [A, B], device) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.empty(shape=shape, dtype=to_dtype, ctx=ctx) + for target, dev in tvm.testing.enabled_targets(): + print("Running on target: %s" % target) + with tvm.target.Target(target): + s = tvm.topi.testing.get_injective_schedule(target)(B) + foo = tvm.build(s, [A, B], target) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.empty(shape=shape, dtype=to_dtype, device=dev) foo(a, b) 
tvm.testing.assert_allclose(b.asnumpy(), b_np) @@ -205,21 +205,21 @@ def test_apply(func, name, f_numpy, low, high, step, dtype="float32"): B = func(A) assert tuple(B.shape) == tuple(A.shape) - def check_device(device): - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) + def check_target(target): + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) return - with tvm.target.Target(device): + with tvm.target.Target(target): s = topi.generic.schedule_injective(B) - func = tvm.build(s, [A, B], device, name=name) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros_like(b_np), ctx) + func = tvm.build(s, [A, B], target, name=name) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros_like(b_np), dev) func(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5, atol=1e-5) - check_device("llvm") - check_device("llvm -device=arm-cpu") + check_target("llvm") + check_target("llvm -device=arm-cpu") test_apply(topi.fast_exp, "fast_exp", np.exp, low=-88, high=88, step=0.01) test_apply(topi.fast_erf, "fast_erf", scipy.special.erf, low=-10, high=10, step=0.01) diff --git a/tests/python/topi/python/test_topi_matmul.py b/tests/python/topi/python/test_topi_matmul.py index 26ba6f8142b48..b0d71b315add0 100644 --- a/tests/python/topi/python/test_topi_matmul.py +++ b/tests/python/topi/python/test_topi_matmul.py @@ -26,15 +26,15 @@ def with_tvm(lam, *args): """Take numpy arrays as args, convert them to TVM tensors and call `lam`. Result of lambda is converted back to numpy array and returned. """ - ctx = tvm.cpu(0) + dev = tvm.cpu(0) pls = [] # placeholders vals_nd = [] # initial values for i, arg in enumerate(args): pls.append(te.placeholder(arg.shape, name="pl" + str(i))) - vals_nd.append(tvm.nd.array(arg, ctx)) + vals_nd.append(tvm.nd.array(arg, dev)) out = lam(*pls) - out_nd = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=out.dtype), ctx) + out_nd = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=out.dtype), dev) s = te.create_schedule([out.op]) m = tvm.build(s, pls + [out], "llvm") m(*(vals_nd + [out_nd])) diff --git a/tests/python/topi/python/test_topi_pooling.py b/tests/python/topi/python/test_topi_pooling.py index 6f62b8ad969b8..1451d18e42dda 100644 --- a/tests/python/topi/python/test_topi_pooling.py +++ b/tests/python/topi/python/test_topi_pooling.py @@ -106,20 +106,20 @@ def verify_pool(n, ic, ih, kh, sh, padding, pool_type, ceil_mode, count_include_ ) b_np = np.maximum(b_np, 0.0) - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): - s_func = tvm.topi.testing.dispatch(device, _pool_schedule) + def check_target(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): + s_func = tvm.topi.testing.dispatch(target, _pool_schedule) s = s_func(B, layout) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx) - f = tvm.build(s, [A, B], device) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), dev) + f = tvm.build(s, [A, B], target) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=2e-5, atol=1e-5) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) def verify_pool_grad( @@ -181,21 +181,21 @@ def verify_pool_grad( if add_relu: 
pool_grad_np = np.maximum(pool_grad_np, 0.0) - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): - s_func = tvm.topi.testing.dispatch(device, _pool_grad_schedule) + def check_target(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): + s_func = tvm.topi.testing.dispatch(target, _pool_grad_schedule) s = s_func(PoolGrad) - a = tvm.nd.array(a_np, ctx) - out_grad = tvm.nd.array(out_grad_np, ctx) - pool_grad = tvm.nd.array(np.zeros(get_const_tuple(PoolGrad.shape), dtype=dtype), ctx) - f = tvm.build(s, [A, OutGrad, PoolGrad], device) + a = tvm.nd.array(a_np, dev) + out_grad = tvm.nd.array(out_grad_np, dev) + pool_grad = tvm.nd.array(np.zeros(get_const_tuple(PoolGrad.shape), dtype=dtype), dev) + f = tvm.build(s, [A, OutGrad, PoolGrad], target) f(a, out_grad, pool_grad) tvm.testing.assert_allclose(pool_grad.asnumpy(), pool_grad_np, rtol=1e-5) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) @tvm.testing.uses_gpu @@ -256,22 +256,22 @@ def verify_global_pool(dshape, pool_type, layout="NCHW"): b_np = np.max(a_np, axis=axis, keepdims=True) b_np = np.maximum(b_np, 0.0) - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): - s_func = tvm.topi.testing.dispatch(device, _adaptive_pool_schedule) - if device == "cuda": + def check_target(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): + s_func = tvm.topi.testing.dispatch(target, _adaptive_pool_schedule) + if target == "cuda": s = s_func(B, layout) else: s = s_func(B) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) - f = tvm.build(s, [A, B], device) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev) + f = tvm.build(s, [A, B], target) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) @tvm.testing.uses_gpu @@ -300,22 +300,22 @@ def verify_adaptive_pool(dshape, out_size, pool_type, layout="NCHW", dtype="floa assert len(out_size) == 3 out = topi.nn.adaptive_pool3d(data, out_size, pool_type, layout) - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): - s_func = tvm.topi.testing.dispatch(device, _adaptive_pool_schedule) - if device == "cuda": + def check_target(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): + s_func = tvm.topi.testing.dispatch(target, _adaptive_pool_schedule) + if target == "cuda": s = s_func(out, layout) else: s = s_func(out) - a = tvm.nd.array(np_data, ctx) - b = tvm.nd.array(np.zeros(get_const_tuple(oshape), dtype=out.dtype), ctx) - f = tvm.build(s, [data, out], device) + a = tvm.nd.array(np_data, dev) + b = tvm.nd.array(np.zeros(get_const_tuple(oshape), dtype=out.dtype), dev) + f = tvm.build(s, [data, out], target) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), np_out, rtol=4e-5, atol=1e-6) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) @tvm.testing.uses_gpu @@ -367,20 +367,20 @@ def verify_pool3d( input_np, kernel, stride, padding, output_shape, 
pool_type, count_include_pad, ceil_mode
     )
 
-    def check_device(device, ctx):
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s_func = tvm.topi.testing.dispatch(device, _pool_schedule)
+    def check_target(target, dev):
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s_func = tvm.topi.testing.dispatch(target, _pool_schedule)
             s = s_func(B, layout)
 
-        a = tvm.nd.array(input_np, ctx)
-        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx)
-        f = tvm.build(s, [A, B], device)
+        a = tvm.nd.array(input_np, dev)
+        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), dev)
+        f = tvm.build(s, [A, B], target)
         f(a, b)
         tvm.testing.assert_allclose(b.asnumpy(), ref_np, rtol=1e-5)
 
-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_target(target, dev)
 
 
 @tvm.testing.uses_gpu
@@ -428,20 +428,20 @@ def verify_pool1d(
         input_np, kernel, stride, padding, output_shape, pool_type, count_include_pad, ceil_mode
     )
 
-    def check_device(device, ctx):
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s_func = tvm.topi.testing.dispatch(device, _pool_schedule)
+    def check_target(target, dev):
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s_func = tvm.topi.testing.dispatch(target, _pool_schedule)
             s = s_func(B, layout)
 
-        a = tvm.nd.array(input_np, ctx)
-        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx)
-        f = tvm.build(s, [A, B], device)
+        a = tvm.nd.array(input_np, dev)
+        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), dev)
+        f = tvm.build(s, [A, B], target)
         f(a, b)
         tvm.testing.assert_allclose(b.asnumpy(), ref_np, rtol=1e-5)
 
-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_target(target, dev)
 
 
 @tvm.testing.uses_gpu
diff --git a/tests/python/topi/python/test_topi_prng.py b/tests/python/topi/python/test_topi_prng.py
index 102e93f3b245c..4ad3a80c6a9e7 100644
--- a/tests/python/topi/python/test_topi_prng.py
+++ b/tests/python/topi/python/test_topi_prng.py
@@ -21,7 +21,7 @@
 import numpy as np
 
 
-def threefry_split(target, ctx, gen):
+def threefry_split(target, dev, gen):
     gen_placeholder = tvm.te.placeholder(gen.shape, name="gen", dtype="uint64")
     left_placeholder, right_placeholder = tvm.topi.random.threefry_split(gen_placeholder)
     s = tvm.topi.generic.schedule_extern([left_placeholder, right_placeholder])
@@ -32,7 +32,7 @@ def threefry_split(target, ctx, gen):
     return left.asnumpy(), right.asnumpy()
 
 
-def threefry_generate(target, ctx, gen, size):
+def threefry_generate(target, dev, gen, size):
     gen_placeholder = tvm.te.placeholder(gen.shape, name="gen", dtype="uint64")
     left_placeholder, right_placeholder = tvm.topi.random.threefry_generate(gen_placeholder, size)
     s = tvm.topi.generic.schedule_extern([left_placeholder, right_placeholder])
@@ -44,10 +44,10 @@ def threefry_generate(target, ctx, gen, size):
 
 
 @tvm.testing.parametrize_targets
-def test_threefry_split(target, ctx):
+def test_threefry_split(target, dev):
     # test that results of split do not equal each other or the input
     gen = tvm.relay.random.threefry_key(0).data.asnumpy()
-    a, b = threefry_split(target, ctx, gen)
+    a, b = threefry_split(target, dev, gen)
     assert (a != b).any() and (
         a != gen
     ).any(), "Splitting a gen should result in different output gens"
@@ -57,37 +57,37 @@ def test_threefry_split(target, ctx):
 
     # test enough splits to go over path length
     for i in range(129):
-        a, b = threefry_split(target, ctx, b)
+        a, b = threefry_split(target, dev, b)
     assert (a[0:4] == b[0:4]).all(), "State part of split should be the same"
     assert (b[0:4] != np.zeros(4, dtype="uint64")).any()
 
     # check that split then generate does not generate the same for both sides
-    a, a_rands = threefry_generate(target, ctx, a, (100,))
-    b, b_rands = threefry_generate(target, ctx, b, (100,))
+    a, a_rands = threefry_generate(target, dev, a, (100,))
+    b, b_rands = threefry_generate(target, dev, b, (100,))
     assert (
         a_rands != b_rands
     ).all(), "Numbers generated from different initial states should be different"
 
     # check repeatability
-    _, rands1 = threefry_generate(target, ctx, a, (100,))
-    _, rands2 = threefry_generate(target, ctx, a, (100,))
+    _, rands1 = threefry_generate(target, dev, a, (100,))
+    _, rands2 = threefry_generate(target, dev, a, (100,))
     assert (
         rands1 == rands2
     ).all(), "Numbers generated from the same initial state should be the same"
 
-    a1, b1 = threefry_split(target, ctx, a)
-    a2, b2 = threefry_split(target, ctx, a)
+    a1, b1 = threefry_split(target, dev, a)
+    a2, b2 = threefry_split(target, dev, a)
     assert (a1 == a2).all() and (
         b1 == b2
     ).all(), "Split called on the same input should return the same result"
 
 
 @tvm.testing.parametrize_targets
-def test_threefry_generate(target, ctx):
+def test_threefry_generate(target, dev):
     gen = tvm.relay.random.threefry_key(0).data.asnumpy()
 
     # check that we can generate some data
-    a, rands = threefry_generate(target, ctx, gen, (2048,))
+    a, rands = threefry_generate(target, dev, gen, (2048,))
     assert (
         rands.shape[0] == 2048 and len(rands.shape) == 1
     ), "Output shape should match requested shape"
@@ -99,26 +99,26 @@ def test_threefry_generate(target, ctx):
     gen = np.array(
         [0, 0, 0, 0, 0, 0, 0, 2 ** 64 - 2, 1 << 63, 0], dtype="uint64"
     )  # make counter large
-    a, rands = threefry_generate(target, ctx, gen, (2048,))
+    a, rands = threefry_generate(target, dev, gen, (2048,))
     assert gen[4] != a[4], "Overflow of counter should trigger path change"
     assert a[7] == 2048, "Overflow of counter should still update counter"
 
     # check generate with path at length limit
     gen = np.array([0, 0, 0, 0, 0, 0, 0, 2 ** 64 - 2, 0, 0], dtype="uint64")  # make counter large
-    a, rands = threefry_generate(target, ctx, gen, (2048,))
+    a, rands = threefry_generate(target, dev, gen, (2048,))
     assert (
         gen[0:4] != a[0:4]
     ).any(), "Overflowing counter with no space left in path should change state"
 
 
 @tvm.testing.parametrize_targets
-def test_threefry_wrapping(target, ctx):
+def test_threefry_wrapping(target, dev):
     assert tvm.topi.random.threefry_test_wrapping(
-        target, ctx
+        target, dev
     ), f"{target} does not support wrapping unsigned integer arithmetic"
 
 
 if __name__ == "__main__":
-    test_threefry_split(tvm.target.Target("llvm"), tvm.context("cpu"))
-    test_threefry_generate(tvm.target.Target("llvm"), tvm.context("cpu"))
-    test_threefry_wrapping(tvm.target.Target("llvm"), tvm.context("cpu"))
+    test_threefry_split(tvm.target.Target("llvm"), tvm.device("cpu"))
+    test_threefry_generate(tvm.target.Target("llvm"), tvm.device("cpu"))
+    test_threefry_wrapping(tvm.target.Target("llvm"), tvm.device("cpu"))
diff --git a/tests/python/topi/python/test_topi_qnn.py b/tests/python/topi/python/test_topi_qnn.py
index 386f77335f1a7..1f49a68aa5194 100644
--- a/tests/python/topi/python/test_topi_qnn.py
+++ b/tests/python/topi/python/test_topi_qnn.py
@@ -37,13 +37,13 @@ def verify_simulated_quantize(data_shape, out_dtype, channels, axis):
     z_np =
np.random.uniform(low=-10, high=10, size=channels).astype("int32") q_np = np.zeros(shape=data_shape, dtype="float32") - def check_device(device, ctx): + def check_target(target, dev): # Wrap the numpy arrays in nd arrays. - a = tvm.nd.array(a_np, ctx) - d = tvm.nd.array(d_np, ctx) - s = tvm.nd.array(s_np, ctx) - z = tvm.nd.array(z_np, ctx) - q = tvm.nd.array(q_np, ctx) + a = tvm.nd.array(a_np, dev) + d = tvm.nd.array(d_np, dev) + s = tvm.nd.array(s_np, dev) + z = tvm.nd.array(z_np, dev) + q = tvm.nd.array(q_np, dev) # Construct equivalent relay graph. per_channel = channels[0] != 1 @@ -56,19 +56,19 @@ def check_device(device, ctx): z_var = relay.const(z_np[0]) real_q_op = relay.qnn.op.quantize(a_var, s_var, z_var, axis=axis, out_dtype=out_dtype) with tvm.transform.PassContext(opt_level=3): - lib = relay.build(tvm.IRModule.from_expr(real_q_op), target=device) + lib = relay.build(tvm.IRModule.from_expr(real_q_op), target=target) # Get real qnn quantize output. - m = graph_runtime.GraphModule(lib["default"](ctx)) + m = graph_runtime.GraphModule(lib["default"](dev)) m.set_input("a", a_np) m.run() real_q_out = m.get_output(0) # Compile the simulated quantize function. - with tvm.target.Target(device): - sched = tvm.topi.testing.get_injective_schedule(device)(SIM_Q) - func = tvm.build(sched, [A, D, S, Z, SIM_Q], device, name="sim_quantize") + with tvm.target.Target(target): + sched = tvm.topi.testing.get_injective_schedule(target)(SIM_Q) + func = tvm.build(sched, [A, D, S, Z, SIM_Q], target, name="sim_quantize") func(a, d, s, z, q) # Check correctness against the true qnn output. @@ -76,8 +76,8 @@ def check_device(device, ctx): # Allow some rounding errors due to GPU fp32 arithmetic. assert np.sum(mismatch) <= 3 - for target, ctx in tvm.testing.enabled_targets(): - check_device(target, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) def test_simulated_quantize(): @@ -104,13 +104,13 @@ def verify_simulated_dequantize(data_shape, in_dtype, channels, axis): z_np = np.random.uniform(low=-10, high=10, size=channels).astype("int32") dq_np = np.zeros(shape=data_shape, dtype="float32") - def check_device(device, ctx): + def check_target(target, dev): # Wrap the numpy arrays in nd arrays. - a = tvm.nd.array(a_np_f, ctx) - d = tvm.nd.array(d_np, ctx) - s = tvm.nd.array(s_np, ctx) - z = tvm.nd.array(z_np, ctx) - dq = tvm.nd.array(dq_np, ctx) + a = tvm.nd.array(a_np_f, dev) + d = tvm.nd.array(d_np, dev) + s = tvm.nd.array(s_np, dev) + z = tvm.nd.array(z_np, dev) + dq = tvm.nd.array(dq_np, dev) # Construct equivalent relay graph. per_channel = channels[0] != 1 @@ -123,19 +123,19 @@ def check_device(device, ctx): z_var = relay.const(z_np[0]) real_dq_op = relay.qnn.op.dequantize(a_var, s_var, z_var, axis=axis) with tvm.transform.PassContext(opt_level=3): - lib = relay.build(tvm.IRModule.from_expr(real_dq_op), target=device) + lib = relay.build(tvm.IRModule.from_expr(real_dq_op), target=target) # Get real qnn quantize output. - m = graph_runtime.GraphModule(lib["default"](ctx)) + m = graph_runtime.GraphModule(lib["default"](dev)) m.set_input("a", a_np) m.run() real_dq_out = m.get_output(0) # Compile the simulated quantize function. 
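For reference while reading these qnn hunks: the device-handle changes in this series reduce to the short sketch below. The old spellings in the comments assume the pre-rename API and are shown only for contrast.

    import numpy as np
    import tvm

    a_np = np.zeros((2, 2), dtype="float32")

    # Old API (before this series):
    #   ctx = tvm.context("llvm", 0)
    #   a = tvm.nd.array(a_np, ctx)
    #   q = tvm.nd.empty(a_np.shape, dtype="float32", ctx=ctx)
    # New API:
    dev = tvm.device("llvm", 0)                                # renamed constructor
    a = tvm.nd.array(a_np, dev)                                # positional handle unchanged
    q = tvm.nd.empty(a_np.shape, dtype="float32", device=dev)  # keyword renamed to device=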
- with tvm.target.Target(device): - sched = tvm.topi.testing.get_injective_schedule(device)(SIM_DQ) - func = tvm.build(sched, [A, D, S, Z, SIM_DQ], device, name="sim_quantize") + with tvm.target.Target(target): + sched = tvm.topi.testing.get_injective_schedule(target)(SIM_DQ) + func = tvm.build(sched, [A, D, S, Z, SIM_DQ], target, name="sim_quantize") func(a, d, s, z, dq) # Check correctness against the true qnn output. @@ -143,8 +143,8 @@ def check_device(device, ctx): dq.asnumpy(), real_dq_out.asnumpy().astype("float32"), rtol=1e-5 ) - for target, ctx in tvm.testing.enabled_targets(): - check_device(target, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) def test_simulated_dequantize(): diff --git a/tests/python/topi/python/test_topi_reduce.py b/tests/python/topi/python/test_topi_reduce.py index 9ddcb0d3884b8..c6de8d7c7f4df 100644 --- a/tests/python/topi/python/test_topi_reduce.py +++ b/tests/python/topi/python/test_topi_reduce.py @@ -69,7 +69,7 @@ def verify_reduce_map_ele(in_shape, axis, keepdims, type="sum", dtype="float32") else: raise NotImplementedError - def check_device(device, ctx): + def check_device(device, dev): print("Running on target: %s" % device) with tvm.target.Target(device): s = tvm.topi.testing.get_reduce_schedule(device)(B) @@ -98,8 +98,8 @@ def check_device(device, ctx): out_npy = _my_npy_argmin(in_npy_map, axis=axis, keepdims=keepdims) else: raise NotImplementedError - data_tvm = tvm.nd.array(in_npy, ctx=ctx) - out_tvm = tvm.nd.empty(shape=out_npy.shape, ctx=ctx, dtype=out_dtype) + data_tvm = tvm.nd.array(in_npy, device=dev) + out_tvm = tvm.nd.empty(shape=out_npy.shape, device=dev, dtype=out_dtype) for _ in range(1): foo(data_tvm, out_tvm) if type == "argmax" or type == "argmin": @@ -119,8 +119,8 @@ def check_device(device, ctx): else: tvm.testing.assert_allclose(out_tvm.asnumpy(), out_npy, 1e-3, 1e-3) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for device, dev in tvm.testing.enabled_targets(): + check_device(device, dev) @tvm.testing.uses_gpu @@ -163,7 +163,7 @@ def test_complex_reduce(): C = topi.add(B, B) D = topi.multiply(B, B) E = topi.add(C, D) - for device, ctx in tvm.testing.enabled_targets(): + for device, dev in tvm.testing.enabled_targets(): print("Running on target: %s" % device) with tvm.target.Target(device): s = tvm.topi.testing.get_reduce_schedule(device)(E) @@ -171,8 +171,8 @@ def test_complex_reduce(): in_npy = np.random.uniform(-1, 1, size=in_shape).astype(dtype) sum_npy = in_npy.sum(axis=axis, keepdims=keepdims) out_npy = sum_npy * 2 + sum_npy * sum_npy - data_tvm = tvm.nd.array(in_npy, ctx=ctx) - out_tvm = tvm.nd.empty(shape=out_npy.shape, ctx=ctx, dtype=dtype) + data_tvm = tvm.nd.array(in_npy, device=dev) + out_tvm = tvm.nd.empty(shape=out_npy.shape, device=dev, dtype=dtype) foo(data_tvm, out_tvm) tvm.testing.assert_allclose(out_tvm.asnumpy(), out_npy, 1e-3, 1e-3) diff --git a/tests/python/topi/python/test_topi_relu.py b/tests/python/topi/python/test_topi_relu.py index 7c45acae05700..9acf98d8259f6 100644 --- a/tests/python/topi/python/test_topi_relu.py +++ b/tests/python/topi/python/test_topi_relu.py @@ -34,22 +34,22 @@ def verify_relu(m, n, dtype="float32"): a_np = np.random.uniform(low=-1.0, high=1.0, size=get_const_tuple(A.shape)).astype(A.dtype) b_np = a_np * (a_np > 0) - def check_device(device, ctx): - if dtype == "float16" and device == "cuda" and not have_fp16(tvm.gpu(0).compute_version): - print("Skip because %s does not have fp16 support" % device) + def 
+    def check_target(target, dev):
+        if dtype == "float16" and target == "cuda" and not have_fp16(tvm.gpu(0).compute_version):
+            print("Skip because %s does not have fp16 support" % target)
             return
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_elemwise_schedule(device)(B)
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_elemwise_schedule(target)(B)
 
-        a = tvm.nd.array(a_np, ctx)
-        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
-        foo = tvm.build(s, [A, B], device, name="relu")
+        a = tvm.nd.array(a_np, dev)
+        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev)
+        foo = tvm.build(s, [A, B], target, name="relu")
         foo(a, b)
         tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_target(target, dev)
 
 
 def verify_leaky_relu(m, alpha):
@@ -59,9 +59,9 @@ def verify_leaky_relu(m, alpha):
     a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype)
     b_np = a_np * (a_np > 0) + a_np * (a_np < 0) * alpha
 
-    ctx = tvm.cpu(0)
-    a = tvm.nd.array(a_np, ctx)
-    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
+    dev = tvm.cpu(0)
+    a = tvm.nd.array(a_np, dev)
+    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev)
     foo = tvm.build(s, [A, B], "llvm", name="leaky_relu")
     foo(a, b)
     tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
@@ -79,11 +79,11 @@ def _prelu_numpy(x, W):
     B = topi.nn.prelu(X, W, axis)
     s = te.create_schedule([B.op])
 
-    ctx = tvm.cpu(0)
-    x_tvm = tvm.nd.array(x_np, ctx)
-    w_tvm = tvm.nd.array(w_np, ctx)
+    dev = tvm.cpu(0)
+    x_tvm = tvm.nd.array(x_np, dev)
+    w_tvm = tvm.nd.array(w_np, dev)
 
-    b = tvm.nd.array(np.zeros(get_const_tuple(X.shape), dtype=B.dtype), ctx)
+    b = tvm.nd.array(np.zeros(get_const_tuple(X.shape), dtype=B.dtype), dev)
     foo = tvm.build(s, [X, W, B], "llvm", name="prelu")
     foo(x_tvm, w_tvm, b)
     out_np = _prelu_numpy(x_np, w_np)
diff --git a/tests/python/topi/python/test_topi_reorg.py b/tests/python/topi/python/test_topi_reorg.py
index 93464d9bef038..e26a05287e05d 100644
--- a/tests/python/topi/python/test_topi_reorg.py
+++ b/tests/python/topi/python/test_topi_reorg.py
@@ -48,7 +48,7 @@ def get_ref_data_reorg():
 
     def check_device(device):
        """Check whether the device is enabled or not"""
-        ctx = tvm.context(device, 0)
+        dev = tvm.device(device, 0)
         if not tvm.testing.device_enabled(device):
             print("Skip because %s is not enabled" % device)
             return
@@ -56,8 +56,8 @@ def check_device(device):
         with tvm.target.Target(device):
             s_func = tvm.topi.testing.dispatch(device, _reorg_schedule)
             s = s_func([B])
-        a = tvm.nd.array(a_np, ctx)
-        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
+        a = tvm.nd.array(a_np, dev)
+        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev)
         func = tvm.build(s, [A, B], device)
         func(a, b)
         tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
diff --git a/tests/python/topi/python/test_topi_scan.py b/tests/python/topi/python/test_topi_scan.py
index 020fde51659f1..cd77a1ccfbceb 100644
--- a/tests/python/topi/python/test_topi_scan.py
+++ b/tests/python/topi/python/test_topi_scan.py
@@ -59,7 +59,7 @@ def get_implementations(name, axis, dtype, exclusive):
 
 
 def _run_tests(
-    ctx,
+    dev,
     target,
     op_name: str = "cumsum",
     gt_func: Callable[..., np.array] = np.cumsum,
@@ -67,7 +67,7 @@ def check_scan(np_ref, data, axis=None, dtype=None, exclusive=False):
         implementations = get_implementations(op_name, axis, dtype, exclusive)
         fcompute, fschedule = tvm.topi.testing.dispatch(target, implementations)
-        tvm.topi.testing.compare_numpy_tvm([data], np_ref, target, ctx, fcompute, fschedule)
+        tvm.topi.testing.compare_numpy_tvm([data], np_ref, target, dev, fcompute, fschedule)
 
     data = np.array([2, 3, 0])
     check_scan(gt_func(data), data)
@@ -121,24 +121,24 @@ def check_scan(np_ref, data, axis=None, dtype=None, exclusive=False):
 
 
 @tvm.testing.parametrize_targets
-def test_cumsum(ctx, target):
-    _run_tests(ctx, target, op_name="cumsum", gt_func=np.cumsum)
+def test_cumsum(dev, target):
+    _run_tests(dev, target, op_name="cumsum", gt_func=np.cumsum)
 
 
 @tvm.testing.parametrize_targets
-def test_cumprod(ctx, target):
-    _run_tests(ctx, target, op_name="cumprod", gt_func=np.cumprod)
+def test_cumprod(dev, target):
+    _run_tests(dev, target, op_name="cumprod", gt_func=np.cumprod)
 
 
 if __name__ == "__main__":
-    test_cumsum(tvm.context("cpu"), tvm.target.Target("llvm"))
-    test_cumsum(tvm.context("cuda"), tvm.target.Target("cuda"))
-    test_cumsum(tvm.context("nvptx"), tvm.target.Target("nvptx"))
-    test_cumsum(tvm.context("vulkan"), tvm.target.Target("vulkan"))
-    test_cumsum(tvm.context("metal"), tvm.target.Target("metal"))
-
-    test_cumprod(tvm.context("cpu"), tvm.target.Target("llvm"))
-    test_cumprod(tvm.context("cuda"), tvm.target.Target("cuda"))
-    test_cumprod(tvm.context("nvptx"), tvm.target.Target("nvptx"))
-    test_cumprod(tvm.context("vulkan"), tvm.target.Target("vulkan"))
-    test_cumprod(tvm.context("metal"), tvm.target.Target("metal"))
+    test_cumsum(tvm.device("cpu"), tvm.target.Target("llvm"))
+    test_cumsum(tvm.device("cuda"), tvm.target.Target("cuda"))
+    test_cumsum(tvm.device("nvptx"), tvm.target.Target("nvptx"))
+    test_cumsum(tvm.device("vulkan"), tvm.target.Target("vulkan"))
+    test_cumsum(tvm.device("metal"), tvm.target.Target("metal"))
+
+    test_cumprod(tvm.device("cpu"), tvm.target.Target("llvm"))
+    test_cumprod(tvm.device("cuda"), tvm.target.Target("cuda"))
+    test_cumprod(tvm.device("nvptx"), tvm.target.Target("nvptx"))
+    test_cumprod(tvm.device("vulkan"), tvm.target.Target("vulkan"))
+    test_cumprod(tvm.device("metal"), tvm.target.Target("metal"))
diff --git a/tests/python/topi/python/test_topi_scatter.py b/tests/python/topi/python/test_topi_scatter.py
index 2e701e2903d96..ad73bb51f2d36 100644
--- a/tests/python/topi/python/test_topi_scatter.py
+++ b/tests/python/topi/python/test_topi_scatter.py
@@ -22,7 +22,7 @@
 
 
 @tvm.testing.parametrize_targets
-def test_scatter_nd(ctx, target):
+def test_scatter_nd(dev, target):
     def check_scatter_nd(data, indices, shape, out):
         implementations = {
             "generic": (lambda x, y: topi.scatter_nd(x, y, shape), topi.generic.schedule_extern),
@@ -30,7 +30,7 @@ def check_scatter_nd(data, indices, shape, out):
             "cpu": (lambda x, y: topi.x86.scatter_nd(x, y, shape), topi.generic.schedule_extern),
         }
         fcompute, fschedule = tvm.topi.testing.dispatch(target, implementations)
-        tvm.topi.testing.compare_numpy_tvm([data, indices], out, target, ctx, fcompute, fschedule)
+        tvm.topi.testing.compare_numpy_tvm([data, indices], out, target, dev, fcompute, fschedule)
 
     data = np.array([2, 3, 0])
     indices = np.array([[1, 1, 0], [0, 1, 0]])
@@ -64,4 +64,4 @@ def check_scatter_nd(data, indices, shape, out):
 
 
 if __name__ == "__main__":
-    test_scatter_nd(tvm.context("cpu"), tvm.target.Target("llvm"))
+    test_scatter_nd(tvm.device("cpu"), tvm.target.Target("llvm"))
diff --git a/tests/python/topi/python/test_topi_softmax.py b/tests/python/topi/python/test_topi_softmax.py
index 66c44f937c5ed..84fa0d24e4345 100644
--- a/tests/python/topi/python/test_topi_softmax.py
+++ b/tests/python/topi/python/test_topi_softmax.py
@@ -34,15 +34,15 @@
 }
 
 
-def check_device(A, B, a_np, b_np, device, ctx, name):
-    print("Running on target: %s" % device)
-    with tvm.target.Target(device):
-        s_func = tvm.topi.testing.dispatch(device, _softmax_schedule)
+def check_target(A, B, a_np, b_np, target, dev, name):
+    print("Running on target: %s" % target)
+    with tvm.target.Target(target):
+        s_func = tvm.topi.testing.dispatch(target, _softmax_schedule)
         s = s_func(B)
 
-    a = tvm.nd.array(a_np, ctx)
-    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
-    f = tvm.build(s, [A, B], device, name=name)
+    a = tvm.nd.array(a_np, dev)
+    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev)
+    f = tvm.build(s, [A, B], target, name=name)
     f(a, b)
     tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
@@ -57,8 +57,8 @@ def verify_softmax(m, n, dtype="float32"):
     a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype)
     b_np = tvm.topi.testing.softmax_python(a_np)
 
-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(A, B, a_np, b_np, device, ctx, "softmax")
+    for target, dev in tvm.testing.enabled_targets():
+        check_target(A, B, a_np, b_np, target, dev, "softmax")
 
 
 def verify_softmax_4d(shape, dtype="float32"):
@@ -70,8 +70,8 @@ def verify_softmax_4d(shape, dtype="float32"):
     b_np = tvm.topi.testing.softmax_python(a_np.transpose(0, 2, 3, 1).reshape(h * w, c))
     b_np = b_np.reshape(1, h, w, c).transpose(0, 3, 1, 2)
 
-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(A, B, a_np, b_np, device, ctx, "softmax")
+    for target, dev in tvm.testing.enabled_targets():
+        check_target(A, B, a_np, b_np, target, dev, "softmax")
 
 
 @tvm.testing.uses_gpu
@@ -91,8 +91,8 @@ def verify_log_softmax(m, n, dtype="float32"):
     a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype)
     b_np = tvm.topi.testing.log_softmax_python(a_np)
 
-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(A, B, a_np, b_np, device, ctx, "log_softmax")
+    for target, dev in tvm.testing.enabled_targets():
+        check_target(A, B, a_np, b_np, target, dev, "log_softmax")
 
 
 @tvm.testing.uses_gpu
diff --git a/tests/python/topi/python/test_topi_sort.py b/tests/python/topi/python/test_topi_sort.py
index 85a35488ab220..c52dc8d3929ac 100644
--- a/tests/python/topi/python/test_topi_sort.py
+++ b/tests/python/topi/python/test_topi_sort.py
@@ -58,25 +58,25 @@ def verify_sort(axis, is_ascend):
     else:
         np_sort = np_sort[:, : dshape[axis]]
 
-    def check_device(device):
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
+    def check_target(target):
+        if not tvm.testing.device_enabled(target):
+            print("Skip because %s is not enabled" % target)
             return
-        ctx = tvm.context(device, 0)
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            fcompute, fschedule = tvm.topi.testing.dispatch(device, _sort_implement)
+        dev = tvm.device(target, 0)
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            fcompute, fschedule = tvm.topi.testing.dispatch(target, _sort_implement)
             out = fcompute(data, axis=axis, is_ascend=is_ascend)
             s = fschedule(out)
 
-        tvm_data = tvm.nd.array(np_data, ctx)
-        tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data_dtype), ctx)
-        f = tvm.build(s, [data, out], device)
+        tvm_data = tvm.nd.array(np_data, dev)
+        tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data_dtype), dev)
+        f = tvm.build(s, [data, out], target)
         f(tvm_data, tvm_out)
         tvm.testing.assert_allclose(tvm_out.asnumpy(), np_sort, rtol=1e0)
 
-    for device in ["llvm", "cuda", "opencl", "vulkan", "nvptx"]:
-        check_device(device)
+    for target in ["llvm", "cuda", "opencl", "vulkan", "nvptx"]:
+        check_target(target)
 
 
 def verify_argsort(axis, is_ascend):
@@ -98,25 +98,25 @@ def verify_argsort(axis, is_ascend):
     else:
         np_indices = np_indices[:, : dshape[axis]]
 
-    def check_device(device):
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
+    def check_target(target):
+        if not tvm.testing.device_enabled(target):
+            print("Skip because %s is not enabled" % target)
             return
-        ctx = tvm.context(device, 0)
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            fcompute, fschedule = tvm.topi.testing.dispatch(device, _argsort_implement)
+        dev = tvm.device(target, 0)
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            fcompute, fschedule = tvm.topi.testing.dispatch(target, _argsort_implement)
             out = fcompute(data, axis=axis, is_ascend=is_ascend)
             s = fschedule(out)
 
-        tvm_data = tvm.nd.array(np_data, ctx)
-        tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data_dtype), ctx)
-        f = tvm.build(s, [data, out], device)
+        tvm_data = tvm.nd.array(np_data, dev)
+        tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data_dtype), dev)
+        f = tvm.build(s, [data, out], target)
         f(tvm_data, tvm_out)
         tvm.testing.assert_allclose(tvm_out.asnumpy(), np_indices.astype(data_dtype), rtol=1e0)
 
-    for device in ["llvm", "cuda", "opencl", "vulkan", "nvptx"]:
-        check_device(device)
+    for target in ["llvm", "cuda", "opencl", "vulkan", "nvptx"]:
+        check_target(target)
 
 
 def verify_topk(k, axis, ret_type, is_ascend, dtype):
@@ -142,22 +142,22 @@ def verify_topk(k, axis, ret_type, is_ascend, dtype):
             np_values[i, :] = np_data[i, np_indices[i, :]]
     np_indices = np_indices.astype(dtype)
 
-    def check_device(device):
-        ctx = tvm.context(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
+    def check_target(target):
+        dev = tvm.device(target, 0)
+        if not tvm.testing.device_enabled(target):
+            print("Skip because %s is not enabled" % target)
             return
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            fcompute, fschedule = tvm.topi.testing.dispatch(device, _topk_implement)
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            fcompute, fschedule = tvm.topi.testing.dispatch(target, _topk_implement)
             outs = fcompute(data, k, axis, ret_type, is_ascend, dtype)
             outs = outs if isinstance(outs, list) else [outs]
             s = fschedule(outs)
-        tvm_data = tvm.nd.array(np_data, ctx)
+        tvm_data = tvm.nd.array(np_data, dev)
         tvm_res = []
         for t in outs:
-            tvm_res.append(tvm.nd.empty(t.shape, dtype=t.dtype, ctx=ctx))
-        f = tvm.build(s, [data] + outs, device)
+            tvm_res.append(tvm.nd.empty(t.shape, dtype=t.dtype, device=dev))
+        f = tvm.build(s, [data] + outs, target)
         f(tvm_data, *tvm_res)
         if ret_type == "both":
             tvm.testing.assert_allclose(tvm_res[0].asnumpy(), np_values)
@@ -167,8 +167,8 @@ def check_device(device):
         else:
             tvm.testing.assert_allclose(tvm_res[0].asnumpy(), np_indices)
 
-    for device in ["llvm", "cuda", "opencl", "vulkan", "nvptx"]:
-        check_device(device)
+    for target in ["llvm", "cuda", "opencl", "vulkan", "nvptx"]:
+        check_target(target)
 
 
 @tvm.testing.uses_gpu
diff --git a/tests/python/topi/python/test_topi_space_to_batch_nd.py b/tests/python/topi/python/test_topi_space_to_batch_nd.py
index 6f969f3910020..21654dd9f0847 100644
--- a/tests/python/topi/python/test_topi_space_to_batch_nd.py
+++ b/tests/python/topi/python/test_topi_space_to_batch_nd.py
@@ -42,18 +42,18 @@ def verify_space_to_batch_nd(input_shape, block_shape, pad_before, pad_after, pa
         a_np, block_shape, pad_before, pad_after, pad_value
     )
 
-    def check_device(device, ctx):
-        print("Running on target: %s" % device)
-        with tvm.target.create(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(B)
-        a = tvm.nd.array(a_np, ctx)
-        b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), ctx)
-        f = tvm.build(s, [A, B], device)
+    def check_target(target, dev):
+        print("Running on target: %s" % target)
+        with tvm.target.create(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(B)
+        a = tvm.nd.array(a_np, dev)
+        b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), dev)
+        f = tvm.build(s, [A, B], target)
         f(a, b)
         tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3, atol=1e-3)
 
-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_target(target, dev)
 
 
 @tvm.testing.uses_gpu
diff --git a/tests/python/topi/python/test_topi_space_to_depth.py b/tests/python/topi/python/test_topi_space_to_depth.py
index 397018688eb4f..7fd49dc363cbd 100644
--- a/tests/python/topi/python/test_topi_space_to_depth.py
+++ b/tests/python/topi/python/test_topi_space_to_depth.py
@@ -49,18 +49,18 @@ def verify_space_to_depth(block_size, batch, in_channel, in_height, in_width, la
         a_np = np.transpose(a_np, axes=[0, 2, 3, 1])
         b_np = np.transpose(b_np, axes=[0, 2, 3, 1])
 
-    def check_device(device, ctx):
+    def check_device(device, dev):
         print("Running on target: %s" % device)
         with tvm.target.Target(device):
             s = tvm.topi.testing.get_injective_schedule(device)(B)
-        a = tvm.nd.array(a_np, ctx)
-        b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), ctx)
+        a = tvm.nd.array(a_np, dev)
+        b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), dev)
         f = tvm.build(s, [A, B], device)
         f(a, b)
         tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3, atol=1e-3)
 
-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for device, dev in tvm.testing.enabled_targets():
+        check_device(device, dev)
 
 
 @tvm.testing.uses_gpu
diff --git a/tests/python/topi/python/test_topi_sparse.py b/tests/python/topi/python/test_topi_sparse.py
index d84bd1530587f..500384b23f2a5 100644
--- a/tests/python/topi/python/test_topi_sparse.py
+++ b/tests/python/topi/python/test_topi_sparse.py
@@ -59,17 +59,17 @@ def get_ref_data():
     a_np, b_np, c_np, d_np = get_ref_data()
 
     def check_device(device):
-        ctx = tvm.context(device, 0)
+        dev = tvm.device(device, 0)
         if not tvm.testing.device_enabled(device):
             print("Skip because %s is not enabled" % device)
             return
         print("Running on target: %s" % device)
-        a = tvmsp.array(a_np, ctx)
+        a = tvmsp.array(a_np, dev)
         _nr, _nc, _n = a.shape[0], a.shape[1], a.data.shape[0]
         assert a.shape[0] == a.indptr.shape[0] - 1
-        b = tvm.nd.array(b_np, ctx)
-        c = tvm.nd.array(c_np, ctx)
-        d = tvm.nd.array(np.zeros((_nr, 1), dtype=dtype), ctx)
+        b = tvm.nd.array(b_np, dev)
+        c = tvm.nd.array(c_np, dev)
+        d = tvm.nd.array(np.zeros((_nr, 1), dtype=dtype), dev)
         assert a.data.dtype == A.data.dtype
         assert a.indices.dtype == A.indices.dtype
         assert a.indptr.dtype == A.indptr.dtype
@@ -105,17 +105,17 @@ def get_ref_data():
     a_np, b_np, c_np, d_np = get_ref_data()
 
     def check_device(device):
-        ctx = tvm.context(device, 0)
+        dev = tvm.device(device, 0)
         if not tvm.testing.device_enabled(device):
             print("Skip because %s is not enabled" % device)
             return
         print("Running on target: %s" % device)
-        a = tvmsp.array(a_np, ctx)
+        a = tvmsp.array(a_np, dev)
         _nr, _nc, _n = a.shape[0], a.shape[1], a.data.shape[0]
         assert a.shape[0] == a.indptr.shape[0] - 1
-        b = tvm.nd.array(b_np, ctx)
-        c = tvm.nd.array(c_np, ctx)
-        d = tvm.nd.array(np.zeros((_nr, out_dim), dtype=dtype), ctx)
+        b = tvm.nd.array(b_np, dev)
+        c = tvm.nd.array(c_np, dev)
+        d = tvm.nd.array(np.zeros((_nr, out_dim), dtype=dtype), dev)
         f = tvm.build(s, [nr, A.data, A.indices, A.indptr, B, C, D], device, name="csrmm")
 
         f(_nr, a.data, a.indices, a.indptr, b, c, d)
@@ -152,15 +152,15 @@ def get_ref_data():
     a_np, b_np, c_np, d_np = get_ref_data()
 
     def check_device(device):
-        ctx = tvm.context(device, 0)
+        dev = tvm.device(device, 0)
         if not tvm.testing.device_enabled(device):
             print("Skip because %s is not enabled" % device)
             return
         print("Running on target: %s" % device)
-        a = tvmsp.array(a_np, ctx)
-        b = tvm.nd.array(b_np, ctx)
-        c = tvm.nd.array(c_np, ctx)
-        d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), ctx)
+        a = tvmsp.array(a_np, dev)
+        b = tvm.nd.array(b_np, dev)
+        c = tvm.nd.array(c_np, dev)
+        d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), dev)
         f = tvm.build(s, [A.data, A.indices, A.indptr, B, C, D], device, name="dense")
         f(a.data, a.indices, a.indptr, b, c, d)
         tvm.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-4, atol=1e-4)
@@ -195,15 +195,15 @@ def get_ref_data():
     a_np, b_np, c_np, d_np = get_ref_data()
 
     def check_device(device):
-        ctx = tvm.context(device, 0)
+        dev = tvm.device(device, 0)
         if not tvm.testing.device_enabled(device):
             print("Skip because %s is not enabled" % device)
             return
         print("Running on target: %s" % device)
-        a = tvm.nd.array(a_np, ctx)
-        b = tvmsp.array(b_np, ctx)
-        c = tvm.nd.array(c_np, ctx)
-        d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), ctx)
+        a = tvm.nd.array(a_np, dev)
+        b = tvmsp.array(b_np, dev)
+        c = tvm.nd.array(c_np, dev)
+        d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), dev)
         f = tvm.build(s, [A, B.data, B.indices, B.indptr, C, D], device, name="dense")
         f(a, b.data, b.indices, b.indptr, c, d)
         tvm.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-4, atol=1e-4)
@@ -355,7 +355,7 @@ def random_bsr_matrix(M, N, BS_R, BS_C, density, dtype):
     return s
 
 
-def verify_sparse_dense_bsr(M, N, K, BS_R, BS_C, density, use_relu, ctx, target):
+def verify_sparse_dense_bsr(M, N, K, BS_R, BS_C, density, use_relu, device, target):
     X_np = np.random.randn(M, K).astype("float32")
     W_sp_np = random_bsr_matrix(N, K, BS_R, BS_C, density=density, dtype="float32")
     W_np = W_sp_np.todense()
@@ -375,22 +375,22 @@ def verify_sparse_dense_bsr(M, N, K, BS_R, BS_C, density, use_relu, ctx, target)
             Y = topi.nn.relu(Y)
         s = fschedule([Y])
         func = tvm.build(s, [X, W_data, W_indices, W_indptr, Y])
-        Y_tvm = tvm.nd.array(np.zeros(Y_np.shape, dtype=Y_np.dtype), ctx=ctx)
+        Y_tvm = tvm.nd.array(np.zeros(Y_np.shape, dtype=Y_np.dtype), device=device)
         func(
-            tvm.nd.array(X_np, ctx=ctx),
-            tvm.nd.array(W_sp_np.data, ctx=ctx),
-            tvm.nd.array(W_sp_np.indices, ctx=ctx),
-            tvm.nd.array(W_sp_np.indptr, ctx=ctx),
+            tvm.nd.array(X_np, device=device),
+            tvm.nd.array(W_sp_np.data, device=device),
+            tvm.nd.array(W_sp_np.indices, device=device),
+            tvm.nd.array(W_sp_np.indptr, device=device),
             Y_tvm,
         )
         tvm.testing.assert_allclose(Y_tvm.asnumpy(), Y_np, atol=1e-4, rtol=1e-4)
 
 
 @tvm.testing.parametrize_targets("llvm", "cuda")
-def test_sparse_dense_bsr_relu(ctx, target):
+def test_sparse_dense_bsr_relu(dev, target):
     M, N, K, BS_R, BS_C, density = 1, 64, 128, 8, 16, 0.9
-    verify_sparse_dense_bsr(M, N, K, BS_R, BS_C, density, True, ctx, target)
-    verify_sparse_dense_bsr(M, N, K, BS_R, BS_C, density, False, ctx, target)
+    verify_sparse_dense_bsr(M, N, K, BS_R, BS_C, density, True, dev, target)
+    verify_sparse_dense_bsr(M, N, K, BS_R, BS_C, density, False, dev, target)
 
 
 def test_sparse_dense_bsr_reverse():
@@ -439,7 +439,7 @@ def test_sparse_dense_bsr_randomized():
         X = te.placeholder(shape=X_np.shape, dtype=str(X_np.dtype))
 
         def check_device(device):
-            ctx = tvm.context(device, 0)
+            dev = tvm.device(device, 0)
             if not tvm.testing.device_enabled(device):
                 print("Skip because %s is not enabled" % device)
                 return
@@ -449,12 +449,12 @@ def check_device(device):
             Y = fcompute(X, W_data, W_indices, W_indptr)
             s = fschedule([Y])
             func = tvm.build(s, [X, W_data, W_indices, W_indptr, Y])
-            Y_tvm = tvm.nd.array(np.zeros(Y_np.shape, dtype=Y_np.dtype), ctx=ctx)
+            Y_tvm = tvm.nd.array(np.zeros(Y_np.shape, dtype=Y_np.dtype), device=dev)
             func(
-                tvm.nd.array(X_np, ctx=ctx),
-                tvm.nd.array(W_sp_np.data, ctx=ctx),
-                tvm.nd.array(W_sp_np.indices, ctx=ctx),
-                tvm.nd.array(W_sp_np.indptr, ctx=ctx),
+                tvm.nd.array(X_np, device=dev),
+                tvm.nd.array(W_sp_np.data, device=dev),
+                tvm.nd.array(W_sp_np.indices, device=dev),
+                tvm.nd.array(W_sp_np.indptr, device=dev),
                 Y_tvm,
             )
             tvm.testing.assert_allclose(Y_tvm.asnumpy(), Y_np, atol=1e-5, rtol=1e-5)
@@ -484,16 +484,16 @@ def test_sparse_dense_padded_cuda():
     )
     X = te.placeholder(shape=X_np.shape, dtype=str(X_np.dtype))
     with tvm.target.Target("cuda"):
-        ctx = tvm.context("gpu")
+        dev = tvm.device("gpu")
         Y = topi.cuda.sparse_dense_padded(X, W_data, W_indices, W_indptr)
         s = topi.cuda.schedule_sparse_dense_padded([Y])
         func = tvm.build(s, [X, W_data, W_indices, W_indptr, Y])
-        Y_tvm = tvm.nd.array(np.zeros(Y_np.shape, dtype=Y_np.dtype), ctx=ctx)
+        Y_tvm = tvm.nd.array(np.zeros(Y_np.shape, dtype=Y_np.dtype), device=dev)
        func(
-            tvm.nd.array(X_np, ctx=ctx),
-            tvm.nd.array(W_sp_np_padded.data, ctx=ctx),
-            tvm.nd.array(W_sp_np_padded.indices, ctx=ctx),
-            tvm.nd.array(W_sp_np_padded.indptr, ctx=ctx),
+            tvm.nd.array(X_np, device=dev),
+            tvm.nd.array(W_sp_np_padded.data, device=dev),
+            tvm.nd.array(W_sp_np_padded.indices, device=dev),
+            tvm.nd.array(W_sp_np_padded.indptr, device=dev),
             Y_tvm,
         )
         tvm.testing.assert_allclose(Y_tvm.asnumpy(), Y_np, atol=1e-5, rtol=1e-5)
diff --git a/tests/python/topi/python/test_topi_tensor.py b/tests/python/topi/python/test_topi_tensor.py
index d384767e17a98..d395c0c4e62fa 100644
--- a/tests/python/topi/python/test_topi_tensor.py
+++ b/tests/python/topi/python/test_topi_tensor.py
@@ -41,21 +41,21 @@ def get_ref_data():
 
     np_nd = get_ref_data()
 
-    def check_device(device):
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
+    def check_target(target):
+        if not tvm.testing.device_enabled(target):
+            print("Skip because %s is not enabled" % target)
             return
-        ctx = tvm.context(device, 0)
-        out = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx)
-        f = tvm.build(s, tvm_placeholders + [esum], device, name="elemwise_sum")
-        tvm_nd = [tvm.nd.array(nd, ctx) for nd in np_nd] + [out]
+        dev = tvm.device(target, 0)
+        out = tvm.nd.array(np.zeros(shape, dtype=dtype), dev)
+        f = tvm.build(s, tvm_placeholders + [esum], target, name="elemwise_sum")
+        tvm_nd = [tvm.nd.array(nd, dev) for nd in np_nd] + [out]
         f(*tvm_nd)
 
         np_out = np.sum(np.array(np_nd), axis=0)
         tvm.testing.assert_allclose(out.asnumpy(), np_out, rtol=1e-5)
 
-    for device in ["llvm"]:
-        check_device(device)
+    for target in ["llvm"]:
+        check_target(target)
 
 
 def verify_full(shape, dtype, fill_value):
@@ -71,47 +71,47 @@ def get_ref_data():
 
     np_nd = get_ref_data()
 
-    def check_device(device):
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
+    def check_target(target):
+        if not tvm.testing.device_enabled(target):
+            print("Skip because %s is not enabled" % target)
             return
-        ctx = tvm.context(device, 0)
-        out = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx)
-        f = tvm.build(s1, [A, B], device, name="full_like")
-        f(tvm.nd.array(np.zeros(shape, dtype), ctx), out)
+        dev = tvm.device(target, 0)
+        out = tvm.nd.array(np.zeros(shape, dtype=dtype), dev)
+        f = tvm.build(s1, [A, B], target, name="full_like")
+        f(tvm.nd.array(np.zeros(shape, dtype), dev), out)
         tvm.testing.assert_allclose(out.asnumpy(), np_nd, rtol=1e-5)
 
-        f = tvm.build(s2, [C], device, name="full")
+        f = tvm.build(s2, [C], target, name="full")
         f(out)
         tvm.testing.assert_allclose(out.asnumpy(), np_nd, rtol=1e-5)
 
-    for device in ["llvm"]:
-        check_device(device)
+    for target in ["llvm"]:
+        check_target(target)
 
 
 def verify_vectorization(n, m, dtype):
-    def check_device(device):
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
+    def check_target(target):
+        if not tvm.testing.device_enabled(target):
+            print("Skip because %s is not enabled" % target)
             return
-        if dtype == "float16" and device == "cuda" and not have_fp16(tvm.gpu(0).compute_version):
+        if dtype == "float16" and target == "cuda" and not have_fp16(tvm.gpu(0).compute_version):
             print("Skip because gpu does not have fp16 support")
             return
-        with tvm.target.Target(device):
-            ctx = tvm.context(device, 0)
+        with tvm.target.Target(target):
+            dev = tvm.device(target, 0)
             A = te.placeholder((n, m), name="A", dtype=dtype)
             B = te.compute((n, m), lambda i, j: A[i, j] + tvm.tir.const(1, A.dtype), name="B")
-            S = tvm.topi.testing.get_elemwise_schedule(device)(B)
+            S = tvm.topi.testing.get_elemwise_schedule(target)(B)
 
-            fun = tvm.build(S, [A, B], device)
-            np_A = tvm.nd.empty((n, m), A.dtype, ctx).copyfrom(np.random.uniform(size=(n, m)))
-            np_B = tvm.nd.empty((n, m), B.dtype, ctx)
+            fun = tvm.build(S, [A, B], target)
+            np_A = tvm.nd.empty((n, m), A.dtype, dev).copyfrom(np.random.uniform(size=(n, m)))
+            np_B = tvm.nd.empty((n, m), B.dtype, dev)
             fun(np_A, np_B)
         tvm.testing.assert_allclose(np_B.asnumpy(), np_A.asnumpy() + 1, rtol=1e-5)
 
-    for device in ["cuda"]:
-        check_device(device)
+    for target in ["cuda"]:
+        check_target(target)
 
 
 @tvm.testing.requires_gpu
diff --git a/tests/python/topi/python/test_topi_transform.py b/tests/python/topi/python/test_topi_transform.py
index e0018ba0c0d32..16f9f13f05b09 100644
--- a/tests/python/topi/python/test_topi_transform.py
+++ b/tests/python/topi/python/test_topi_transform.py
@@ -30,105 +30,105 @@ def verify_expand_dims(in_shape, out_shape, axis, num_newaxis):
     A = te.placeholder(shape=in_shape, name="A")
     B = topi.expand_dims(A, axis, num_newaxis)
 
-    def check_device(device, ctx):
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_broadcast_schedule(device)(B)
-            foo = tvm.build(s, [A, B], device, name="expand_dims")
+    def check_device(target, dev):
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_broadcast_schedule(target)(B)
+            foo = tvm.build(s, [A, B], target, name="expand_dims")
         data_npy = np.random.uniform(size=in_shape).astype(A.dtype)
         out_npy = data_npy.reshape(out_shape)
-        data_nd = tvm.nd.array(data_npy, ctx)
-        out_nd = tvm.nd.array(np.empty(out_shape).astype(B.dtype), ctx)
+        data_nd = tvm.nd.array(data_npy, dev)
+        out_nd = tvm.nd.array(np.empty(out_shape).astype(B.dtype), dev)
         foo(data_nd, out_nd)
         tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_device(target, dev)
 
 
 def verify_reinterpret(in_shape, in_dtype, out_dtype, generator):
     A = te.placeholder(shape=in_shape, name="A", dtype=in_dtype)
     B = topi.reinterpret(A, out_dtype)
 
-    def check_device(device, ctx):
-        if in_dtype == "float16" and device == "cuda" and not have_fp16(ctx.compute_version):
-            print("Skip because %s does not have fp16 support" % device)
+    def check_device(target, dev):
+        if in_dtype == "float16" and target == "cuda" and not have_fp16(dev.compute_version):
+            print("Skip because %s does not have fp16 support" % target)
             return
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_elemwise_schedule(device)(B)
-            foo = tvm.build(s, [A, B], device, name="reinterpret")
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_elemwise_schedule(target)(B)
+            foo = tvm.build(s, [A, B], target, name="reinterpret")
         data_npy = generator(in_shape).astype(in_dtype)
         out_npy = data_npy.view(B.dtype)
-        data_nd = tvm.nd.array(data_npy, ctx)
-        out_nd = tvm.nd.array(np.empty(in_shape).astype(B.dtype), ctx)
+        data_nd = tvm.nd.array(data_npy, dev)
+        out_nd = tvm.nd.array(np.empty(in_shape).astype(B.dtype), dev)
         foo(data_nd, out_nd)
         np.testing.assert_equal(out_nd.asnumpy(), out_npy)
 
-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_device(target, dev)
 
 
 def verify_transpose(in_shape, axes):
     A = te.placeholder(shape=in_shape, name="A")
     B = topi.transpose(A, axes)
 
-    def check_device(device, ctx):
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(B)
-            foo = tvm.build(s, [A, B], device, name="transpose")
+    def check_device(target, dev):
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(B)
+            foo = tvm.build(s, [A, B], target, name="transpose")
         data_npy = np.arange(np.prod(in_shape)).reshape(in_shape).astype(A.dtype)
         out_npy = data_npy.transpose(axes)
-        data_nd = tvm.nd.array(data_npy, ctx)
-        out_nd = tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=B.dtype)
+        data_nd = tvm.nd.array(data_npy, dev)
+        out_nd = tvm.nd.empty(out_npy.shape, device=dev, dtype=B.dtype)
         foo(data_nd, out_nd)
         tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_device(target, dev)
 
 
 def verify_reshape(src_shape, dst_shape):
     A = te.placeholder(shape=src_shape, name="A")
     B = topi.reshape(A, dst_shape)
 
-    def check_device(device, ctx):
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(B)
-            foo = tvm.build(s, [A, B], device, name="reshape")
+    def check_device(target, dev):
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(B)
+            foo = tvm.build(s, [A, B], target, name="reshape")
         data_npy = np.random.normal(size=src_shape).astype(A.dtype)
         out_npy = np.reshape(data_npy, newshape=dst_shape)
-        data_nd = tvm.nd.array(data_npy, ctx)
-        out_nd = tvm.nd.empty(dst_shape, ctx=ctx, dtype=B.dtype)
+        data_nd = tvm.nd.array(data_npy, dev)
+        out_nd = tvm.nd.empty(dst_shape, device=dev, dtype=B.dtype)
         foo(data_nd, out_nd)
         tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_device(target, dev)
 
 
 def verify_squeeze(src_shape, axis):
     A = te.placeholder(shape=src_shape, name="A")
     B = topi.squeeze(A, axis=axis)
 
-    def check_device(device, ctx):
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(B)
+    def check_device(target, dev):
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(B)
 
-        foo = tvm.build(s, [A, B], device, name="squeeze")
+        foo = tvm.build(s, [A, B], target, name="squeeze")
         data_npy = np.random.normal(size=src_shape).astype(A.dtype)
         out_npy = np.squeeze(data_npy, axis=axis)
-        data_nd = tvm.nd.array(data_npy, ctx)
+        data_nd = tvm.nd.array(data_npy, dev)
         out_nd_shape = out_npy.shape
-        out_nd = tvm.nd.empty(out_nd_shape, ctx=ctx, dtype=B.dtype)
+        out_nd = tvm.nd.empty(out_nd_shape, device=dev, dtype=B.dtype)
         foo(data_nd, out_nd)
         tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_device(target, dev)
 
 
 def verify_concatenate(shapes, axis):
@@ -149,21 +149,21 @@ def get_concat_schedule(target):
         tensor_l.append(te.placeholder(shape, name="A" + str(i)))
     out_tensor = topi.concatenate(a_tuple=tensor_l, axis=axis)
 
-    def check_device(device, ctx):
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = get_concat_schedule(device)(out_tensor)
+    def check_device(target, dev):
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = get_concat_schedule(target)(out_tensor)
 
-        foo = tvm.build(s, tensor_l + [out_tensor], device, name="concatenate")
+        foo = tvm.build(s, tensor_l + [out_tensor], target, name="concatenate")
         data_npys = [np.random.normal(size=shape).astype(tensor_l[0].dtype) for shape in shapes]
         out_npy = np.concatenate(data_npys, axis=axis)
-        data_nds = [tvm.nd.array(data_npy, ctx) for data_npy in data_npys]
-        out_nd = tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=out_tensor.dtype)
+        data_nds = [tvm.nd.array(data_npy, dev) for data_npy in data_npys]
+        out_nd = tvm.nd.empty(out_npy.shape, device=dev, dtype=out_tensor.dtype)
         foo(*(data_nds + [out_nd]))
         tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_device(target, dev)
 
 
 def verify_stack(shapes, axis):
@@ -172,45 +172,45 @@ def verify_stack(shapes, axis):
         tensor_l.append(te.placeholder(shape, name="A" + str(i)))
     out_tensor = topi.stack(tensor_l, axis)
 
-    def check_device(device, ctx):
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_broadcast_schedule(device)(out_tensor)
+    def check_device(target, dev):
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_broadcast_schedule(target)(out_tensor)
 
-        foo = tvm.build(s, tensor_l + [out_tensor], device, name="stack")
+        foo = tvm.build(s, tensor_l + [out_tensor], target, name="stack")
         data_npys = [np.random.normal(size=shape).astype(tensor_l[0].dtype) for shape in shapes]
         out_npy = np.stack(data_npys, axis=axis)
-        data_nds = [tvm.nd.array(data_npy, ctx) for data_npy in data_npys]
-        out_nd = tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=out_tensor.dtype)
+        data_nds = [tvm.nd.array(data_npy, dev) for data_npy in data_npys]
+        out_nd = tvm.nd.empty(out_npy.shape, device=dev, dtype=out_tensor.dtype)
         foo(*(data_nds + [out_nd]))
         tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_device(target, dev)
 
 
 def verify_split(src_shape, indices_or_sections, axis):
     A = te.placeholder(shape=src_shape, name="A")
     tensor_l = topi.split(A, indices_or_sections, axis=axis)
 
-    def check_device(device, ctx):
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(tensor_l)
+    def check_device(target, dev):
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(tensor_l)
 
-        foo = tvm.build(s, [A] + list(tensor_l), device, name="split")
+        foo = tvm.build(s, [A] + list(tensor_l), target, name="split")
         data_npy = np.random.normal(size=src_shape).astype(A.dtype)
         out_npys = np.split(data_npy, indices_or_sections, axis=axis)
-        data_nd = tvm.nd.array(data_npy, ctx)
+        data_nd = tvm.nd.array(data_npy, dev)
         out_nds = [
-            tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=tensor_l[0].dtype) for out_npy in out_npys
+            tvm.nd.empty(out_npy.shape, device=dev, dtype=tensor_l[0].dtype) for out_npy in out_npys
         ]
         foo(*([data_nd] + out_nds))
         for out_nd, out_npy in zip(out_nds, out_npys):
             tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_device(target, dev)
 
 
 def verify_expand_like(in_shape, out_shape, axis):
@@ -219,13 +219,13 @@ def verify_expand_like(in_shape, out_shape, axis):
     C = topi.expand_like(A, B, axis)
     s = te.create_schedule([C.op])
 
-    def check_device(device):
-        print("Running on target: %s" % device)
+    def check_device(target):
+        print("Running on target: %s" % target)
 
-        ctx = tvm.context(device, 0)
-        f = tvm.build(s, [A, B, C], device, name="expand_like")
+        dev = tvm.device(target, 0)
+        f = tvm.build(s, [A, B, C], target, name="expand_like")
         input = np.random.uniform(size=in_shape).astype(A.dtype)
-        tvm_input = tvm.nd.array(input, ctx)
+        tvm_input = tvm.nd.array(input, dev)
 
         odim = len(out_shape)
         real_axis = [x if x >= 0 else x + odim for x in axis]
@@ -236,38 +236,38 @@ def check_device(device):
             input = np.concatenate([input] * out_shape[x], axis=x).astype(A.dtype)
         assert input.shape == out_shape
 
-        tvm_shape_like = tvm.nd.array(np.zeros(out_shape).astype(B.dtype), ctx)
-        out = tvm.nd.array(np.zeros(out_shape).astype(A.dtype), ctx)
+        tvm_shape_like = tvm.nd.array(np.zeros(out_shape).astype(B.dtype), dev)
+        out = tvm.nd.array(np.zeros(out_shape).astype(A.dtype), dev)
         f(tvm_input, tvm_shape_like, out)
         tvm.testing.assert_allclose(out.asnumpy(), input)
 
-    for device in ["llvm"]:
-        check_device(device)
+    for target in ["llvm"]:
+        check_device(target)
 
 
 def verify_flip(in_shape, axis):
     A = te.placeholder(shape=in_shape, name="A")
     B = topi.flip(A, axis) + 1
 
-    def check_device(device):
-        ctx = tvm.context(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
+    def check_device(target):
+        dev = tvm.device(target, 0)
+        if not tvm.testing.device_enabled(target):
+            print("Skip because %s is not enabled" % target)
            return
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(B)
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(B)
 
-        foo = tvm.build(s, [A, B], device, name="reverse")
+        foo = tvm.build(s, [A, B], target, name="reverse")
         x_np = np.random.uniform(size=in_shape).astype(A.dtype)
         out_npy = np.flip(x_np, axis) + 1
-        data_nd = tvm.nd.array(x_np, ctx)
-        out_nd = tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=A.dtype)
+        data_nd = tvm.nd.array(x_np, dev)
+        out_nd = tvm.nd.empty(out_npy.shape, device=dev, dtype=A.dtype)
         foo(data_nd, out_nd)
         tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device in ["llvm", "cuda", "opencl", "sdaccel", "aocl_sw_emu"]:
-        check_device(device)
+    for target in ["llvm", "cuda", "opencl", "sdaccel", "aocl_sw_emu"]:
+        check_device(target)
 
 
 @tvm.testing.uses_gpu
@@ -278,21 +278,21 @@ def verify_reverse_sequence(in_data, seq_lengths, batch_axis, seq_axis, ref_res)
     B = te.placeholder(shape=seq_lengths.shape, name="B", dtype=str(seq_lengths.dtype))
     C = topi.reverse_sequence(A, B, seq_axis, batch_axis)
 
-    def check_device(device, ctx):
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(C)
+    def check_device(target, dev):
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(C)
 
-        foo = tvm.build(s, [A, B, C], device, name="reverse_sequence")
+        foo = tvm.build(s, [A, B, C], target, name="reverse_sequence")
 
-        data_nd = tvm.nd.array(in_data, ctx)
-        seq_lengths_nd = tvm.nd.array(seq_lengths, ctx)
-        out_nd = tvm.nd.empty(in_data.shape, ctx=ctx, dtype=A.dtype)
+        data_nd = tvm.nd.array(in_data, dev)
+        seq_lengths_nd = tvm.nd.array(seq_lengths, dev)
+        out_nd = tvm.nd.empty(in_data.shape, device=dev, dtype=A.dtype)
         foo(data_nd, seq_lengths_nd, out_nd)
         tvm.testing.assert_allclose(out_nd.asnumpy(), ref_res)
 
-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_device(target, dev)
 
     indata = np.array(np.arange(0, 16)).reshape([4, 4]).astype("int32")
     result = [[0, 5, 10, 15], [4, 1, 6, 11], [8, 9, 2, 7], [12, 13, 14, 3]]
@@ -366,16 +366,16 @@ def verify_take(src_shape, indices_src, axis=None, mode="clip"):
     else:
         out_tensor = topi.take(a=A, indices=indices, axis=axis, mode=mode)
 
-    def check_device(device):
-        ctx = tvm.context(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
+    def check_device(target):
+        dev = tvm.device(target, 0)
+        if not tvm.testing.device_enabled(target):
+            print("Skip because %s is not enabled" % target)
             return
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(out_tensor)
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(out_tensor)
 
-        foo = tvm.build(s, [A] + [indices] + [out_tensor], device, name="take")
+        foo = tvm.build(s, [A] + [indices] + [out_tensor], target, name="take")
         shape_size = 1
         for i in range(len(src_shape)):
             shape_size = shape_size * src_shape[i]
@@ -387,14 +387,14 @@ def check_device(device):
         else:
             np_mode = "raise" if mode == "fast" else mode
             out_npys = np.take(data_npy, indices_src, axis=axis, mode=np_mode)
-        data_nd = tvm.nd.array(data_npy, ctx)
-        indices_nd = tvm.nd.array(indices_src, ctx)
-        out_nd = tvm.nd.empty(out_npys.shape, ctx=ctx, dtype=src_dtype)
+        data_nd = tvm.nd.array(data_npy, dev)
+        indices_nd = tvm.nd.array(indices_src, dev)
+        out_nd = tvm.nd.empty(out_npys.shape, device=dev, dtype=src_dtype)
         foo(data_nd, indices_nd, out_nd)
         tvm.testing.assert_allclose(out_nd.asnumpy(), out_npys)
 
-    for device in ["llvm", "opencl", "sdaccel", "aocl_sw_emu"]:
-        check_device(device)
+    for target in ["llvm", "opencl", "sdaccel", "aocl_sw_emu"]:
+        check_device(target)
 
 
 def verify_strided_slice(in_shape, begin, end, strides=None):
@@ -402,25 +402,25 @@ def verify_strided_slice(in_shape, begin, end, strides=None):
     A = te.placeholder(shape=in_shape, name="A")
     strides = [1, 1, 1] if strides is None else strides
     B = topi.strided_slice(A, begin, end, strides) + 1
 
-    def check_device(device):
-        ctx = tvm.context(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
+    def check_device(target):
+        dev = tvm.device(target, 0)
+        if not tvm.testing.device_enabled(target):
+            print("Skip because %s is not enabled" % target)
             return
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(B)
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(B)
 
-        foo = tvm.build(s, [A, B], device, name="stride_slice")
+        foo = tvm.build(s, [A, B], target, name="stride_slice")
         x_np = np.random.uniform(size=in_shape).astype(A.dtype)
         out_npy = tvm.topi.testing.strided_slice_python(x_np, begin, end, strides) + 1
-        data_nd = tvm.nd.array(x_np, ctx)
-        out_nd = tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=A.dtype)
+        data_nd = tvm.nd.array(x_np, dev)
+        out_nd = tvm.nd.empty(out_npy.shape, device=dev, dtype=A.dtype)
         foo(data_nd, out_nd)
         tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device in ["llvm", "opencl", "sdaccel", "aocl_sw_emu"]:
-        check_device(device)
+    for target in ["llvm", "opencl", "sdaccel", "aocl_sw_emu"]:
+        check_device(target)
 
 
 def verify_dynamic_strided_slice(in_shape, begin, end, strides=None):
@@ -431,28 +431,28 @@ def verify_dynamic_strided_slice(in_shape, begin, end, strides=None):
     strides = [1, 1, 1] if strides is None else strides
     B = topi.strided_slice(A, Begin, End, Strides) + 1
 
-    def check_device(device):
-        ctx = tvm.context(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
+    def check_device(target):
+        dev = tvm.device(target, 0)
+        if not tvm.testing.device_enabled(target):
+            print("Skip because %s is not enabled" % target)
             return
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(B)
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(B)
 
-        foo = tvm.build(s, [A, Begin, End, Strides, B], device, name="stride_slice")
+        foo = tvm.build(s, [A, Begin, End, Strides, B], target, name="stride_slice")
         x_np = np.random.uniform(size=in_shape).astype(A.dtype)
         out_npy = tvm.topi.testing.strided_slice_python(x_np, begin, end, strides) + 1
-        data_nd = tvm.nd.array(x_np, ctx)
-        out_nd = tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=A.dtype)
-        begin_nd = tvm.nd.array(np.array(begin).astype("int64"), ctx)
-        end_nd = tvm.nd.array(np.array(end).astype("int64"), ctx)
-        strides_nd = tvm.nd.array(np.array(strides).astype("int64"), ctx)
+        data_nd = tvm.nd.array(x_np, dev)
+        out_nd = tvm.nd.empty(out_npy.shape, device=dev, dtype=A.dtype)
+        begin_nd = tvm.nd.array(np.array(begin).astype("int64"), dev)
+        end_nd = tvm.nd.array(np.array(end).astype("int64"), dev)
+        strides_nd = tvm.nd.array(np.array(strides).astype("int64"), dev)
         foo(data_nd, begin_nd, end_nd, strides_nd, out_nd)
         tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device in ["llvm", "opencl", "sdaccel", "aocl_sw_emu"]:
-        check_device(device)
+    for target in ["llvm", "opencl", "sdaccel", "aocl_sw_emu"]:
+        check_device(target)
 
 
 def verify_strided_set(in_shape, v_shape, begin, end, strides=None):
@@ -466,39 +466,39 @@ def verify_strided_set(in_shape, v_shape, begin, end, strides=None):
     else:
         B = topi.strided_set(A, V, b, e) + 1
 
-    def check_device(device):
-        ctx = tvm.context(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
+    def check_device(target):
+        dev = tvm.device(target, 0)
+        if not tvm.testing.device_enabled(target):
+            print("Skip because %s is not enabled" % target)
            return
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(B)
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(B)
 
         if strides is not None:
-            foo = tvm.build(s, [A, V, b, e, st, B], device, name="stride_set")
+            foo = tvm.build(s, [A, V, b, e, st, B], target, name="stride_set")
             s_np = np.asarray(strides).astype("int32")
-            s_nd = tvm.nd.array(s_np, ctx)
+            s_nd = tvm.nd.array(s_np, dev)
         else:
-            foo = tvm.build(s, [A, V, b, e, B], device, name="stride_set")
+            foo = tvm.build(s, [A, V, b, e, B], target, name="stride_set")
         x_np = np.random.uniform(size=in_shape).astype(A.dtype)
         v_np = np.random.uniform(size=v_shape).astype(V.dtype)
         b_np = np.asarray(begin).astype("int32")
         e_np = np.asarray(end).astype("int32")
         out_npy = tvm.topi.testing.strided_set_python(x_np, v_np, begin, end, strides) + 1
-        data_nd = tvm.nd.array(x_np, ctx)
-        v_nd = tvm.nd.array(v_np, ctx)
-        b_nd = tvm.nd.array(b_np, ctx)
-        e_nd = tvm.nd.array(e_np, ctx)
-        out_nd = tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=A.dtype)
+        data_nd = tvm.nd.array(x_np, dev)
+        v_nd = tvm.nd.array(v_np, dev)
+        b_nd = tvm.nd.array(b_np, dev)
+        e_nd = tvm.nd.array(e_np, dev)
+        out_nd = tvm.nd.empty(out_npy.shape, device=dev, dtype=A.dtype)
         if strides is not None:
             foo(data_nd, v_nd, b_nd, e_nd, s_nd, out_nd)
         else:
             foo(data_nd, v_nd, b_nd, e_nd, out_nd)
         tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device in ["llvm", "opencl", "sdaccel", "aocl_sw_emu"]:
-        check_device(device)
+    for target in ["llvm", "opencl", "sdaccel", "aocl_sw_emu"]:
+        check_device(target)
 
 
 def verify_gather(data, axis, indices):
@@ -509,22 +509,22 @@ def verify_gather(data, axis, indices):
     var_indices = te.placeholder(shape=indices.shape, dtype=indices.dtype.name, name="indices")
     out_tensor = topi.gather(var_data, axis, var_indices)
 
-    def check_device(device, ctx):
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(out_tensor)
+    def check_device(target, dev):
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(out_tensor)
 
-        func = tvm.build(s, [var_data, var_indices, out_tensor], device, name="gather")
+        func = tvm.build(s, [var_data, var_indices, out_tensor], target, name="gather")
         out_npys = tvm.topi.testing.gather_python(data, axis, indices)
-        data_nd = tvm.nd.array(data, ctx)
-        indices_nd = tvm.nd.array(indices, ctx)
-        out_nd = tvm.nd.empty(out_npys.shape, ctx=ctx, dtype=data.dtype.name)
+        data_nd = tvm.nd.array(data, dev)
+        indices_nd = tvm.nd.array(indices, dev)
+        out_nd = tvm.nd.empty(out_npys.shape, device=dev, dtype=data.dtype.name)
         func(data_nd, indices_nd, out_nd)
         tvm.testing.assert_allclose(out_nd.asnumpy(), out_npys)
 
-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_device(target, dev)
 
 
 def verify_gather_nd(src_shape, indices_src, indices_dtype):
@@ -534,26 +534,26 @@ def verify_gather_nd(src_shape, indices_src, indices_dtype):
     indices = te.placeholder(shape=indices_src.shape, dtype=indices_dtype, name="indices")
     out_tensor = topi.gather_nd(a=A, indices=indices)
 
-    def check_device(device, ctx):
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(out_tensor)
+    def check_device(target, dev):
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(out_tensor)
 
-        func = tvm.build(s, [A, indices, out_tensor], device, name="take")
+        func = tvm.build(s, [A, indices, out_tensor], target, name="take")
         shape_size = 1
         for i in range(len(src_shape)):
             shape_size = shape_size * src_shape[i]
         data_npy = np.arange(shape_size, dtype=src_dtype).reshape((src_shape))
         out_npys = tvm.topi.testing.gather_nd_python(data_npy, indices_src)
-        data_nd = tvm.nd.array(data_npy, ctx)
-        indices_nd = tvm.nd.array(indices_src, ctx)
-        out_nd = tvm.nd.empty(out_npys.shape, ctx=ctx, dtype=src_dtype)
+        data_nd = tvm.nd.array(data_npy, dev)
+        indices_nd = tvm.nd.array(indices_src, dev)
+        out_nd = tvm.nd.empty(out_npys.shape, device=dev, dtype=src_dtype)
         func(data_nd, indices_nd, out_nd)
         tvm.testing.assert_allclose(out_nd.asnumpy(), out_npys)
 
-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_device(target, dev)
 
 
 def verify_arange(start, stop, step):
@@ -570,57 +570,57 @@ def verify_arange(start, stop, step):
         A = topi.arange(start, stop, step)
         a_np = np.arange(start, stop, step)
 
-    def check_device(device, ctx):
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(A)
-        f = tvm.build(s, [A], device, name="arange")
-        a_nd = tvm.nd.empty(a_np.shape, dtype="float32", ctx=ctx)
+    def check_device(target, dev):
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(A)
+        f = tvm.build(s, [A], target, name="arange")
+        a_nd = tvm.nd.empty(a_np.shape, dtype="float32", device=dev)
         f(a_nd)
         tvm.testing.assert_allclose(a_nd.asnumpy(), a_np)
 
-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_device(target, dev)
 
 
 def verify_repeat(in_shape, repeats, axis):
     A = te.placeholder(shape=in_shape, name="A")
     B = topi.repeat(A, repeats, axis)
 
-    def check_device(device, ctx):
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_broadcast_schedule(device)(B)
-        foo = tvm.build(s, [A, B], device, name="repeat")
+    def check_device(target, dev):
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_broadcast_schedule(target)(B)
+        foo = tvm.build(s, [A, B], target, name="repeat")
         data_npy = np.random.uniform(size=in_shape).astype(A.dtype)
         out_npy = np.repeat(data_npy, repeats, axis)
-        data_nd = tvm.nd.array(data_npy, ctx)
-        out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), ctx)
+        data_nd = tvm.nd.array(data_npy, dev)
+        out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), dev)
         foo(data_nd, out_nd)
         tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_device(target, dev)
 
 
 def verify_tile(in_shape, reps):
     A = te.placeholder(shape=in_shape, name="A")
     B = topi.tile(A, reps)
 
-    def check_device(device, ctx):
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_broadcast_schedule(device)(B)
-        foo = tvm.build(s, [A, B], device, name="tile")
+    def check_device(target, dev):
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_broadcast_schedule(target)(B)
+        foo = tvm.build(s, [A, B], target, name="tile")
         data_npy = np.random.uniform(size=in_shape).astype(A.dtype)
         out_npy = np.tile(data_npy, reps)
-        data_nd = tvm.nd.array(data_npy, ctx)
-        out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), ctx)
+        data_nd = tvm.nd.array(data_npy, dev)
+        out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), dev)
         foo(data_nd, out_nd)
         tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_device(target, dev)
 
 
 def verify_where(in_shape):
@@ -630,24 +630,24 @@ def verify_where(in_shape):
     B = te.placeholder(shape=in_shape, name="B")
     C = topi.where(Cond, A, B)
 
-    def check_device(device, ctx):
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_broadcast_schedule(device)(C)
-        f = tvm.build(s, [Cond, A, B, C], device, name="where")
+    def check_device(target, dev):
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_broadcast_schedule(target)(C)
+        f = tvm.build(s, [Cond, A, B, C], target, name="where")
         cond_npy = np.random.uniform(low=-1, high=1, size=in_shape).astype(dtype)
         x_npy = np.random.uniform(size=in_shape).astype(dtype)
         y_npy = np.random.uniform(size=in_shape).astype(dtype)
         out_npy = np.where(cond_npy, x_npy, y_npy)
-        cond_nd = tvm.nd.array(cond_npy, ctx)
-        x_nd = tvm.nd.array(x_npy, ctx)
-        y_nd = tvm.nd.array(y_npy, ctx)
-        out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(C.dtype), ctx)
+        cond_nd = tvm.nd.array(cond_npy, dev)
+        x_nd = tvm.nd.array(x_npy, dev)
+        y_nd = tvm.nd.array(y_npy, dev)
+        out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(C.dtype), dev)
         f(cond_nd, x_nd, y_nd, out_nd)
         tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_device(target, dev)
 
 
 def verify_one_hot(indices_shape, depth, on_value, off_value, axis, dtype):
@@ -658,21 +658,21 @@ def verify_one_hot(indices_shape, depth, on_value, off_value, axis, dtype):
         indices, on_value_const, off_value_const, depth, axis, dtype
     )
 
-    def check_device(device, ctx):
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(one_hot_result)
-        fn = tvm.build(s, [indices, one_hot_result], device, name="one_hot")
+    def check_device(target, dev):
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(one_hot_result)
+        fn = tvm.build(s, [indices, one_hot_result], target, name="one_hot")
         indices_npy = np.random.randint(0, depth, size=indices_shape).astype(indices.dtype)
         out_npy = tvm.topi.testing.one_hot(indices_npy, on_value, off_value, depth, axis, dtype)
-        indices_nd = tvm.nd.array(indices_npy, ctx)
-        out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(one_hot_result.dtype), ctx)
+        indices_nd = tvm.nd.array(indices_npy, dev)
+        out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(one_hot_result.dtype), dev)
         fn(indices_nd, out_nd)
         out_topi = out_nd.asnumpy()
         tvm.testing.assert_allclose(out_topi, out_npy)
 
-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_device(target, dev)
 
 
 def verify_unravel_index(indices, shape, dtype):
@@ -687,21 +687,21 @@ def verify_unravel_index(indices, shape, dtype):
     Y = te.placeholder(shape=y_data.shape, dtype=dtype, name="Y")
     Z = topi.unravel_index(X, Y)
 
-    def check_device(device, ctx):
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(Z)
-        foo = tvm.build(s, [X, Y, Z], device, name="unravel_index")
+    def check_device(target, dev):
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(Z)
+        foo = tvm.build(s, [X, Y, Z], target, name="unravel_index")
         out_npy = np.unravel_index(x_data, y_data)
-        datax_nd = tvm.nd.array(x_data, ctx)
-        datay_nd = tvm.nd.array(y_data, ctx)
-        out_nd = tvm.nd.empty(dst_shape, ctx=ctx, dtype=Z.dtype)
+        datax_nd = tvm.nd.array(x_data, dev)
+        datay_nd = tvm.nd.array(y_data, dev)
+        out_nd = tvm.nd.empty(dst_shape, device=dev, dtype=Z.dtype)
         foo(datax_nd, datay_nd, out_nd)
         tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_device(target, dev)
 
 
 def verify_sparse_to_dense(sparse_indices, sparse_values, default_value, output_shape, xpected):
@@ -724,27 +724,27 @@ def verify_sparse_to_dense(sparse_indices, sparse_values, default_value, output_
         args = [A, B, C]
     D = topi.sparse_to_dense(A, output_shape, B, C)
 
-    def check_device(device, ctx):
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(D)
+    def check_device(target, dev):
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(D)
 
-        foo = tvm.build(s, args + [D], device, name="sparse_to_dense")
+        foo = tvm.build(s, args + [D], target, name="sparse_to_dense")
 
-        sparse_indices_nd = tvm.nd.array(sparse_indices_data, ctx)
-        sparse_values_nd = tvm.nd.array(sparse_values_data, ctx)
-        out_nd = tvm.nd.empty(output_shape_data, ctx=ctx, dtype=B.dtype)
+        sparse_indices_nd = tvm.nd.array(sparse_indices_data, dev)
+        sparse_values_nd = tvm.nd.array(sparse_values_data, dev)
+        out_nd = tvm.nd.empty(output_shape_data, device=dev, dtype=B.dtype)
 
         if default_value is None:
             foo(sparse_indices_nd, sparse_values_nd, out_nd)
         else:
-            default_value_nd = tvm.nd.array(default_value_data, ctx)
+            default_value_nd = tvm.nd.array(default_value_data, dev)
             foo(sparse_indices_nd, sparse_values_nd, default_value_nd, out_nd)
 
         tvm.testing.assert_allclose(out_nd.asnumpy(), np.array(xpected))
 
-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_device(target, dev)
 
 
 def verify_matrix_set_diag(input_shape, diagonal_shape, dtype, k=0, align="RIGHT_LEFT"):
@@ -752,24 +752,24 @@ def verify_matrix_set_diag(input_shape, diagonal_shape, dtype, k=0, align="RIGHT
     diagonal = te.placeholder(shape=diagonal_shape, name="diagonal", dtype=dtype)
     matrix_set_diag_result = topi.transform.matrix_set_diag(input, diagonal, k, align)
 
-    def check_device(device, ctx):
-        ctx = tvm.context(device, 0)
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(matrix_set_diag_result)
-        fn = tvm.build(s, [input, diagonal, matrix_set_diag_result], device, name="matrix_set_diag")
+    def check_device(target, dev):
+        dev = tvm.device(target, 0)
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(matrix_set_diag_result)
+        fn = tvm.build(s, [input, diagonal, matrix_set_diag_result], target, name="matrix_set_diag")
         input_npy = np.random.randint(-100, 100, size=input_shape).astype(dtype)
         diagonal_npy = np.random.randint(-100, 100, size=diagonal_shape).astype(dtype)
         out_npy = tvm.topi.testing.matrix_set_diag(input_npy, diagonal_npy, k, align)
-        input_nd = tvm.nd.array(input_npy, ctx)
-        diagonal_nd = tvm.nd.array(diagonal_npy, ctx)
-        out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(matrix_set_diag_result.dtype), ctx)
+        input_nd = tvm.nd.array(input_npy, dev)
+        diagonal_nd = tvm.nd.array(diagonal_npy, dev)
+        out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(matrix_set_diag_result.dtype), dev)
         fn(input_nd, diagonal_nd, out_nd)
         out_topi = out_nd.asnumpy()
         tvm.testing.assert_allclose(out_topi, out_npy)
 
-    for target, ctx in tvm.testing.enabled_targets():
-        check_device(target, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_device(target, dev)
 
 
 def verify_adv_index(data_shape, index_shapes):
@@ -785,27 +785,27 @@ def verify_adv_index(data_shape, index_shapes):
     np_out = np_data[tuple(np_indices)]
     out = topi.adv_index(data, indices)
 
-    def check_device(device, ctx):
-        ctx = tvm.context(device, 0)
-        if not ctx.exist:
-            print("Skip because %s is not enabled" % device)
+    def check_device(target, dev):
+        dev = tvm.device(target, 0)
+        if not dev.exist:
+            print("Skip because %s is not enabled" % target)
             return
-        print("Running on target: %s" % device)
-        with tvm.target.create(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(out)
+        print("Running on target: %s" % target)
+        with tvm.target.create(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(out)
 
-        func = tvm.build(s, [data] + indices + [out], device, name="adv_index")
+        func = tvm.build(s, [data] + indices + [out], target, name="adv_index")
 
-        nd_list = [tvm.nd.array(np_data, ctx)]
+        nd_list = [tvm.nd.array(np_data, dev)]
         for np_index in np_indices:
-            nd_list.append(tvm.nd.array(np_index, ctx))
-        nd_list.append(tvm.nd.empty(out.shape, ctx=ctx, dtype=data.dtype))
+            nd_list.append(tvm.nd.array(np_index, dev))
+        nd_list.append(tvm.nd.empty(out.shape, device=dev, dtype=data.dtype))
 
         func(*nd_list)
         tvm.testing.assert_allclose(nd_list[-1].asnumpy(), np.array(np_out))
 
-    for target, ctx in tvm.testing.enabled_targets():
-        check_device(target, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_device(target, dev)
 
 
 @tvm.testing.uses_gpu
@@ -896,14 +896,14 @@ def test_squeeze():
     A = te.placeholder((2,), "float32", "A")
     E = topi.squeeze(A)
     C = te.compute((1,), lambda i: E[(2 * A[0] - 1).astype("int32")])
-    for device in ["cuda", "opencl"]:
-        ctx = tvm.context(device, 0)
-        if tvm.testing.device_enabled(device):
-            with tvm.target.Target(device):
-                s = tvm.topi.testing.get_injective_schedule(device)(C)
+    for target in ["cuda", "opencl"]:
+        dev = tvm.device(target, 0)
+        if tvm.testing.device_enabled(target):
+            with tvm.target.Target(target):
+                s = tvm.topi.testing.get_injective_schedule(target)(C)
             func = tvm.build(s, [A, C])
-            a = tvm.nd.array(np.array((1, 2)).astype("float32"), ctx=ctx)
-            c = tvm.nd.empty((1,), dtype="float32", ctx=ctx)
+            a = tvm.nd.array(np.array((1, 2)).astype("float32"), device=dev)
+            c = tvm.nd.empty((1,), dtype="float32", device=dev)
             func(a, c)
             assert c.asnumpy()[0] == 2
 
@@ -1040,18 +1040,18 @@ def test_layout_transform():
     output = np.reshape(output, newshape=(1, 8, 8, 2, 16))
     output = np.transpose(output, axes=(0, 3, 1, 2, 4))
 
-    def check_device(device, ctx):
-        tvm_input = tvm.nd.array(input, ctx)
-        tvm_output = tvm.nd.empty(output.shape, ctx=ctx, dtype=B.dtype)
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(B)
-        f = tvm.build(s, [A, B], device, name="layout_transform")
+    def check_device(target, dev):
+        tvm_input = tvm.nd.array(input, dev)
+        tvm_output = tvm.nd.empty(output.shape, device=dev, dtype=B.dtype)
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(B)
+        f = tvm.build(s, [A, B], target, name="layout_transform")
         f(tvm_input, tvm_output)
         tvm.testing.assert_allclose(tvm_output.asnumpy(), output)
 
-    for backend, ctx in tvm.testing.enabled_targets():
-        check_device(backend, ctx)
+    for backend, dev in tvm.testing.enabled_targets():
+        check_device(backend, dev)
 
 
 @tvm.testing.uses_gpu
@@ -1064,18 +1064,18 @@ def test_shape():
     input = np.random.uniform(size=in_shape).astype(A.dtype)
     output = np.asarray(in_shape).astype(dtype)
 
-    def check_device(device, ctx):
-        tvm_input = tvm.nd.array(input, ctx)
-        tvm_output = tvm.nd.empty(output.shape, ctx=ctx, dtype=dtype)
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(B)
-        f = tvm.build(s, [A, B], device, name="shape")
+    def check_device(target, dev):
+        tvm_input = tvm.nd.array(input, dev)
+        tvm_output = tvm.nd.empty(output.shape, device=dev, dtype=dtype)
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(B)
+        f = tvm.build(s, [A, B], target, name="shape")
         f(tvm_input, tvm_output)
         tvm.testing.assert_allclose(tvm_output.asnumpy(), output)
 
-    for backend, ctx in tvm.testing.enabled_targets():
-        check_device(backend, ctx)
+    for backend, dev in 
tvm.testing.enabled_targets(): + check_device(backend, dev) @tvm.testing.uses_gpu @@ -1092,19 +1092,19 @@ def test_sequence_mask(): B_data = np.random.randint(1, max_length, (batch_size,)).astype(np.int32) C_gt_data = tvm.topi.testing.sequence_mask(A_data, B_data, mask_value, axis) - def check_device(device, ctx): - tvm_A = tvm.nd.array(A_data, ctx) - tvm_B = tvm.nd.array(B_data, ctx) - tvm_C = tvm.nd.empty(in_shape, ctx=ctx, dtype="float32") - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_injective_schedule(device)(C) - f = tvm.build(s, [A, B, C], device, name="SequenceMask") + def check_device(target, dev): + tvm_A = tvm.nd.array(A_data, dev) + tvm_B = tvm.nd.array(B_data, dev) + tvm_C = tvm.nd.empty(in_shape, device=dev, dtype="float32") + print("Running on target: %s" % target) + with tvm.target.Target(target): + s = tvm.topi.testing.get_injective_schedule(target)(C) + f = tvm.build(s, [A, B, C], target, name="SequenceMask") f(tvm_A, tvm_B, tvm_C) tvm.testing.assert_allclose(tvm_C.asnumpy(), C_gt_data) - for backend, ctx in tvm.testing.enabled_targets(): - check_device(backend, ctx) + for backend, dev in tvm.testing.enabled_targets(): + check_device(backend, dev) @tvm.testing.uses_gpu @@ -1117,28 +1117,28 @@ def test_ndarray_size(): input = np.random.uniform(size=in_shape).astype(A.dtype) output = np.asarray(np.size(input)).astype(dtype) - def check_device(device, ctx): - tvm_input = tvm.nd.array(input, ctx=ctx) - tvm_output = tvm.nd.empty((), ctx=ctx, dtype=B.dtype) - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_injective_schedule(device)(B) - f = tvm.build(s, [A, B], device, name="ndarray_size") + def check_device(target, dev): + tvm_input = tvm.nd.array(input, device=dev) + tvm_output = tvm.nd.empty((), device=dev, dtype=B.dtype) + print("Running on target: %s" % target) + with tvm.target.Target(target): + s = tvm.topi.testing.get_injective_schedule(target)(B) + f = tvm.build(s, [A, B], target, name="ndarray_size") f(tvm_input, tvm_output) tvm.testing.assert_allclose(tvm_output.asnumpy(), output) - for backend, ctx in tvm.testing.enabled_targets(): - check_device(backend, ctx) + for backend, dev in tvm.testing.enabled_targets(): + check_device(backend, dev) @tvm.testing.uses_gpu def test_where_fusion(): """integration test that where and zeros should be properly inlined""" - def check_device(device, ctx): - with tvm.target.Target(device): - print("Running on target: %s" % device) - conv2d_compute, conv2d_schedule = tvm.topi.testing.get_conv2d_nchw_implement(device) + def check_device(target, dev): + with tvm.target.Target(target): + print("Running on target: %s" % target) + conv2d_compute, conv2d_schedule = tvm.topi.testing.get_conv2d_nchw_implement(target) data = te.placeholder((2, 1, 2, 4), "int8", "data") w = te.placeholder((3, 1, 2, 2), "int8", "w") conv1 = conv2d_compute(data, w, 1, 0, 1, "int32") @@ -1152,8 +1152,8 @@ def check_device(device, ctx): s = conv2d_schedule(outs) tvm.build(s, [data, w, add], target=backend) - for backend, ctx in tvm.testing.enabled_targets(): - check_device(backend, ctx) + for backend, dev in tvm.testing.enabled_targets(): + check_device(backend, dev) @tvm.testing.uses_gpu diff --git a/tests/python/topi/python/test_topi_unique.py b/tests/python/topi/python/test_topi_unique.py index d7ee74282922a..a97b95029862d 100644 --- a/tests/python/topi/python/test_topi_unique.py +++ b/tests/python/topi/python/test_topi_unique.py @@ -22,7 +22,7 @@ 
@tvm.testing.parametrize_targets -def test_unique(ctx, target): +def test_unique(dev, target): def calc_numpy_unique(data, is_sorted=False): uniq, index, inverse, counts = np.unique( data, return_index=True, return_inverse=True, return_counts=True @@ -56,10 +56,10 @@ def check_unique(data, is_sorted=False): ), } fcompute, fschedule = tvm.topi.testing.dispatch(target, implementations) - tvm_data = tvm.nd.array(data, ctx=ctx) - tvm_unique = tvm.nd.array(np.zeros(data.shape).astype(data.dtype), ctx=ctx) - tvm_indices = tvm.nd.array(np.zeros(data.shape).astype("int32"), ctx=ctx) - tvm_num_unique = tvm.nd.array(np.zeros([1]).astype("int32"), ctx=ctx) + tvm_data = tvm.nd.array(data, device=dev) + tvm_unique = tvm.nd.array(np.zeros(data.shape).astype(data.dtype), device=dev) + tvm_indices = tvm.nd.array(np.zeros(data.shape).astype("int32"), device=dev) + tvm_num_unique = tvm.nd.array(np.zeros([1]).astype("int32"), device=dev) # without counts with tvm.target.Target(target): @@ -76,7 +76,7 @@ def check_unique(data, is_sorted=False): np.testing.assert_allclose(tvm_indices.asnumpy(), np_indices, atol=1e-5, rtol=1e-5) # with counts - tvm_counts = tvm.nd.array(np.zeros(data.shape).astype("int32"), ctx=ctx) + tvm_counts = tvm.nd.array(np.zeros(data.shape).astype("int32"), device=dev) with tvm.target.Target(target): te_input = tvm.te.placeholder(shape=data.shape, dtype=str(data.dtype)) outs = fcompute(te_input, True) @@ -106,6 +106,6 @@ def check_unique(data, is_sorted=False): if __name__ == "__main__": - test_unique(tvm.context("cpu"), tvm.target.Target("llvm")) - test_unique(tvm.context("cuda"), tvm.target.Target("cuda")) - test_unique(tvm.context("nvptx"), tvm.target.Target("nvptx")) + test_unique(tvm.device("cpu"), tvm.target.Target("llvm")) + test_unique(tvm.device("cuda"), tvm.target.Target("cuda")) + test_unique(tvm.device("nvptx"), tvm.target.Target("nvptx")) diff --git a/tests/python/topi/python/test_topi_upsampling.py b/tests/python/topi/python/test_topi_upsampling.py index 0408220bfd65e..5900437608206 100644 --- a/tests/python/topi/python/test_topi_upsampling.py +++ b/tests/python/topi/python/test_topi_upsampling.py @@ -84,19 +84,19 @@ def verify_upsampling( else: b_np = tvm.topi.testing.upsampling_python(a_np, (scale_h, scale_w), layout) - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_injective_schedule(device)(B) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), ctx) - f = tvm.build(s, [A, B], device) + def check_target(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): + s = tvm.topi.testing.get_injective_schedule(target)(B) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), dev) + f = tvm.build(s, [A, B], target) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5, atol=1e-5) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) @tvm.testing.uses_gpu @@ -228,19 +228,19 @@ def verify_upsampling3d( else: b_np = tvm.topi.testing.upsampling3d_python(a_np, (scale_d, scale_h, scale_w), layout) - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_injective_schedule(device)(B) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), ctx) - f = tvm.build(s, [A, B], device) + def 
check_target(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): + s = tvm.topi.testing.get_injective_schedule(target)(B) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), dev) + f = tvm.build(s, [A, B], target) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5, atol=1e-5) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) @tvm.testing.uses_gpu diff --git a/tests/python/topi/python/test_topi_vision.py b/tests/python/topi/python/test_topi_vision.py index 2fdf3cf4b170f..7f8712c55fd19 100644 --- a/tests/python/topi/python/test_topi_vision.py +++ b/tests/python/topi/python/test_topi_vision.py @@ -89,31 +89,31 @@ def verify_get_valid_counts(dshape, score_threshold, id_index, score_index): np_out2[i, j, k] = -1.0 np_out3[i, j] = -1 - def check_device(device): - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) + def check_device(target): + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) return - print("Running on target: %s" % device) - with tvm.target.Target(device): - fcompute, fschedule = tvm.topi.testing.dispatch(device, _get_valid_counts_implement) + print("Running on target: %s" % target) + with tvm.target.Target(target): + fcompute, fschedule = tvm.topi.testing.dispatch(target, _get_valid_counts_implement) data = te.placeholder(dshape, name="data", dtype=dtype) outs = fcompute(data, score_threshold, id_index, score_index) s = fschedule(outs) - tvm_input_data = tvm.nd.array(np_data, ctx) - tvm_out1 = tvm.nd.array(np.zeros(np_out1.shape, dtype="int32"), ctx) - tvm_out2 = tvm.nd.array(np.zeros(np_out2.shape, dtype=dtype), ctx) - tvm_out3 = tvm.nd.array(np.zeros(np_out3.shape, dtype="int32"), ctx) + tvm_input_data = tvm.nd.array(np_data, dev) + tvm_out1 = tvm.nd.array(np.zeros(np_out1.shape, dtype="int32"), dev) + tvm_out2 = tvm.nd.array(np.zeros(np_out2.shape, dtype=dtype), dev) + tvm_out3 = tvm.nd.array(np.zeros(np_out3.shape, dtype="int32"), dev) - f = tvm.build(s, [data, outs[0], outs[1], outs[2]], device) + f = tvm.build(s, [data, outs[0], outs[1], outs[2]], target) f(tvm_input_data, tvm_out1, tvm_out2, tvm_out3) tvm.testing.assert_allclose(tvm_out1.asnumpy(), np_out1, rtol=1e-3) tvm.testing.assert_allclose(tvm_out2.asnumpy(), np_out2, rtol=1e-3) tvm.testing.assert_allclose(tvm_out3.asnumpy(), np_out3, rtol=1e-3) - for device in ["llvm", "cuda", "opencl", "vulkan"]: - check_device(device) + for target in ["llvm", "cuda", "opencl", "vulkan"]: + check_device(target) @tvm.testing.uses_gpu @@ -146,14 +146,14 @@ def verify_non_max_suppression( valid_count = te.placeholder((batch,), dtype="int32", name="valid_count") indices = te.placeholder((batch, num_anchors), dtype="int32", name="indices") - def check_device(device): - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) + def check_device(target): + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) return - print("Running on target: %s" % device) - with tvm.target.Target(device): - fcompute, fschedule = tvm.topi.testing.dispatch(device, _nms_implement) + print("Running on target: %s" % target) + with tvm.target.Target(target): + fcompute, fschedule = 
tvm.topi.testing.dispatch(target, _nms_implement) out = fcompute( data, valid_count, @@ -183,22 +183,22 @@ def check_device(device): s = fschedule(out) indices_s = fschedule(indices_out) - tvm_data = tvm.nd.array(np_data, ctx) - tvm_valid_count = tvm.nd.array(np_valid_count, ctx) - tvm_indices = tvm.nd.array(np_indices, ctx) + tvm_data = tvm.nd.array(np_data, dev) + tvm_valid_count = tvm.nd.array(np_valid_count, dev) + tvm_indices = tvm.nd.array(np_indices, dev) - tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx) - f = tvm.build(s, [data, valid_count, indices, out], device) + tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), dev) + f = tvm.build(s, [data, valid_count, indices, out], target) f(tvm_data, tvm_valid_count, tvm_indices, tvm_out) tvm.testing.assert_allclose(tvm_out.asnumpy(), np_result, rtol=1e-4) - tvm_indices_out = tvm.nd.array(np.zeros(indices_dshape, dtype="int32"), ctx) - f = tvm.build(indices_s, [data, valid_count, indices, indices_out[0]], device) + tvm_indices_out = tvm.nd.array(np.zeros(indices_dshape, dtype="int32"), dev) + f = tvm.build(indices_s, [data, valid_count, indices, indices_out[0]], target) f(tvm_data, tvm_valid_count, tvm_indices, tvm_indices_out) tvm.testing.assert_allclose(tvm_indices_out.asnumpy(), np_indices_result, rtol=1e-4) - for device in ["llvm", "cuda", "opencl", "nvptx"]: - check_device(device) + for target in ["llvm", "cuda", "opencl", "nvptx"]: + check_device(target) @tvm.testing.uses_gpu @@ -339,26 +339,26 @@ def verify_multibox_prior( if clip: np_out = np.clip(np_out, 0, 1) - def check_device(device): - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) + def check_device(target): + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) return - print("Running on target: %s" % device) + print("Running on target: %s" % target) - fcompute, fschedule = tvm.topi.testing.dispatch(device, _multibox_prior_implement) - with tvm.target.Target(device): + fcompute, fschedule = tvm.topi.testing.dispatch(target, _multibox_prior_implement) + with tvm.target.Target(target): out = fcompute(data, sizes, ratios, steps, offsets, clip) s = fschedule(out) - tvm_input_data = tvm.nd.array(input_data, ctx) - tvm_out = tvm.nd.array(np.zeros(oshape, dtype=dtype), ctx) - f = tvm.build(s, [data, out], device) + tvm_input_data = tvm.nd.array(input_data, dev) + tvm_out = tvm.nd.array(np.zeros(oshape, dtype=dtype), dev) + f = tvm.build(s, [data, out], target) f(tvm_input_data, tvm_out) tvm.testing.assert_allclose(tvm_out.asnumpy(), np_out, rtol=1e-3) - for device in ["llvm", "opencl", "cuda"]: - check_device(device) + for target in ["llvm", "opencl", "cuda"]: + check_device(target) @tvm.testing.uses_gpu @@ -394,28 +394,28 @@ def test_multibox_detection(): ] ) - def check_device(device): - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) + def check_device(target): + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) return - print("Running on target: %s" % device) + print("Running on target: %s" % target) - fcompute, fschedule = tvm.topi.testing.dispatch(device, _multibox_detection_implement) - with tvm.target.Target(device): + fcompute, fschedule = tvm.topi.testing.dispatch(target, _multibox_detection_implement) + with tvm.target.Target(target): out = 
fcompute(cls_prob, loc_preds, anchors) s = fschedule(out) - tvm_cls_prob = tvm.nd.array(np_cls_prob.astype(cls_prob.dtype), ctx) - tvm_loc_preds = tvm.nd.array(np_loc_preds.astype(loc_preds.dtype), ctx) - tvm_anchors = tvm.nd.array(np_anchors.astype(anchors.dtype), ctx) - tvm_out = tvm.nd.array(np.zeros((batch_size, num_anchors, 6)).astype(out.dtype), ctx) - f = tvm.build(s, [cls_prob, loc_preds, anchors, out], device) + tvm_cls_prob = tvm.nd.array(np_cls_prob.astype(cls_prob.dtype), dev) + tvm_loc_preds = tvm.nd.array(np_loc_preds.astype(loc_preds.dtype), dev) + tvm_anchors = tvm.nd.array(np_anchors.astype(anchors.dtype), dev) + tvm_out = tvm.nd.array(np.zeros((batch_size, num_anchors, 6)).astype(out.dtype), dev) + f = tvm.build(s, [cls_prob, loc_preds, anchors, out], target) f(tvm_cls_prob, tvm_loc_preds, tvm_anchors, tvm_out) tvm.testing.assert_allclose(tvm_out.asnumpy(), expected_np_out, rtol=1e-4) - for device in ["llvm", "opencl", "cuda"]: - check_device(device) + for target in ["llvm", "opencl", "cuda"]: + check_device(target) def verify_roi_align( @@ -445,13 +445,13 @@ def get_ref_data(): a_np, rois_np, b_np = get_ref_data() - def check_device(device): - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) + def check_device(target): + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) return - with tvm.target.Target(device): - fcompute, fschedule = tvm.topi.testing.dispatch(device, _roi_align_implement) + with tvm.target.Target(target): + fcompute, fschedule = tvm.topi.testing.dispatch(target, _roi_align_implement) b = fcompute( a, rois, @@ -462,16 +462,16 @@ def check_device(device): ) s = fschedule(b) - tvm_a = tvm.nd.array(a_np, ctx) - tvm_rois = tvm.nd.array(rois_np, ctx) - tvm_b = tvm.nd.array(np.zeros(get_const_tuple(b.shape), dtype=b.dtype), ctx=ctx) - f = tvm.build(s, [a, rois, b], device) + tvm_a = tvm.nd.array(a_np, dev) + tvm_rois = tvm.nd.array(rois_np, dev) + tvm_b = tvm.nd.array(np.zeros(get_const_tuple(b.shape), dtype=b.dtype), device=dev) + f = tvm.build(s, [a, rois, b], target) f(tvm_a, tvm_rois, tvm_b) tvm_val = tvm_b.asnumpy() tvm.testing.assert_allclose(tvm_val, b_np, rtol=1e-3, atol=1e-4) - for device in ["llvm", "cuda", "opencl"]: - check_device(device) + for target in ["llvm", "cuda", "opencl"]: + check_device(target) @tvm.testing.uses_gpu @@ -506,29 +506,29 @@ def get_ref_data(): a_np, rois_np, b_np = get_ref_data() - def check_device(device): - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) + def check_device(target): + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) return - print("Running on target: %s" % device) + print("Running on target: %s" % target) - with tvm.target.Target(device): + with tvm.target.Target(target): b = topi.vision.rcnn.roi_pool_nchw( a, rois, pooled_size=pooled_size, spatial_scale=spatial_scale ) - s_func = tvm.topi.testing.dispatch(device, _roi_pool_schedule) + s_func = tvm.topi.testing.dispatch(target, _roi_pool_schedule) s = s_func(b) - tvm_a = tvm.nd.array(a_np, ctx) - tvm_rois = tvm.nd.array(rois_np, ctx) - tvm_b = tvm.nd.array(np.zeros(get_const_tuple(b.shape), dtype=b.dtype), ctx=ctx) - f = tvm.build(s, [a, rois, b], device) + tvm_a = tvm.nd.array(a_np, dev) + tvm_rois = tvm.nd.array(rois_np, dev) + tvm_b = 
tvm.nd.array(np.zeros(get_const_tuple(b.shape), dtype=b.dtype), device=dev) + f = tvm.build(s, [a, rois, b], target) f(tvm_a, tvm_rois, tvm_b) tvm.testing.assert_allclose(tvm_b.asnumpy(), b_np, rtol=1e-4) - for device in ["cuda", "llvm"]: - check_device(device) + for target in ["cuda", "llvm"]: + check_device(target) @tvm.testing.uses_gpu @@ -542,26 +542,26 @@ def verify_proposal(np_cls_prob, np_bbox_pred, np_im_info, np_out, attrs): bbox_pred = te.placeholder(np_bbox_pred.shape) im_info = te.placeholder(np_im_info.shape) - def check_device(device): - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) + def check_device(target): + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) return - print("Running on target: %s" % device) - with tvm.target.Target(device): - fcompute, fschedule = tvm.topi.testing.dispatch(device, _proposal_implement) + print("Running on target: %s" % target) + with tvm.target.Target(target): + fcompute, fschedule = tvm.topi.testing.dispatch(target, _proposal_implement) out = fcompute(cls_prob, bbox_pred, im_info, **attrs) s = fschedule(out) - f = tvm.build(s, [cls_prob, bbox_pred, im_info, out], device) - tvm_cls_prob = tvm.nd.array(np_cls_prob, ctx=ctx) - tvm_bbox_pred = tvm.nd.array(np_bbox_pred, ctx=ctx) - tvm_im_info = tvm.nd.array(np_im_info, ctx=ctx) - tvm_out = tvm.nd.empty(ctx=ctx, shape=out.shape, dtype=out.dtype) + f = tvm.build(s, [cls_prob, bbox_pred, im_info, out], target) + tvm_cls_prob = tvm.nd.array(np_cls_prob, device=dev) + tvm_bbox_pred = tvm.nd.array(np_bbox_pred, device=dev) + tvm_im_info = tvm.nd.array(np_im_info, device=dev) + tvm_out = tvm.nd.empty(device=dev, shape=out.shape, dtype=out.dtype) f(tvm_cls_prob, tvm_bbox_pred, tvm_im_info, tvm_out) tvm.testing.assert_allclose(tvm_out.asnumpy(), np_out, rtol=1e-4) - for device in ["llvm", "cuda"]: - check_device(device) + for target in ["llvm", "cuda"]: + check_device(target) @tvm.testing.uses_gpu diff --git a/tests/python/unittest/test_auto_scheduler_layout_rewrite.py b/tests/python/unittest/test_auto_scheduler_layout_rewrite.py index 795c3cb3b0a26..91430599124e7 100644 --- a/tests/python/unittest/test_auto_scheduler_layout_rewrite.py +++ b/tests/python/unittest/test_auto_scheduler_layout_rewrite.py @@ -119,16 +119,16 @@ def test_correctness_layout_rewrite_rewrite_for_preTransformed(): func = tvm.build(s, bufs, target=target) func_ref = tvm.build(s_ref, bufs_ref, target=target) - ctx = tvm.context(str(target)) - ctx_ref = tvm.cpu() + dev = tvm.device(str(target)) + dev_ref = tvm.cpu() - args = [tvm.nd.array(x, ctx=ctx) for x in np_args] - args_ref = [tvm.nd.array(x, ctx=ctx_ref) for x in np_args_ref] - ctx.sync() + args = [tvm.nd.array(x, device=dev) for x in np_args] + args_ref = [tvm.nd.array(x, device=dev_ref) for x in np_args_ref] + dev.sync() func(*args) func_ref(*args_ref) - ctx.sync() + dev.sync() tvm.testing.assert_allclose(args[0].asnumpy(), args_ref[0].asnumpy(), atol=1e-3, rtol=1e-3) tvm.testing.assert_allclose(args[2].asnumpy(), args_ref[2].asnumpy(), atol=1e-3, rtol=1e-3) @@ -166,16 +166,16 @@ def test_correctness_layout_rewrite_insert_transform_stage(): func = tvm.build(s, bufs, target=target) func_ref = tvm.build(s_ref, bufs_ref, target=target) - ctx = tvm.context(str(target)) - ctx_ref = tvm.cpu() + dev = tvm.device(str(target)) + dev_ref = tvm.cpu() - args = [tvm.nd.array(x, ctx=ctx) for x in np_args] - args_ref = [tvm.nd.array(x, 
ctx=ctx_ref) for x in np_args] - ctx.sync() + args = [tvm.nd.array(x, device=dev) for x in np_args] + args_ref = [tvm.nd.array(x, device=dev_ref) for x in np_args] + dev.sync() func(*args) func_ref(*args_ref) - ctx.sync() + dev.sync() tvm.testing.assert_allclose(args[0].asnumpy(), args_ref[0].asnumpy(), atol=1e-3, rtol=1e-3) tvm.testing.assert_allclose(args[1].asnumpy(), args_ref[1].asnumpy(), atol=1e-3, rtol=1e-3) diff --git a/tests/python/unittest/test_auto_scheduler_search_policy.py b/tests/python/unittest/test_auto_scheduler_search_policy.py index 30aafbd223902..32245d9bba814 100644 --- a/tests/python/unittest/test_auto_scheduler_search_policy.py +++ b/tests/python/unittest/test_auto_scheduler_search_policy.py @@ -95,7 +95,7 @@ def search_common( sch, args = task.compute_dag.apply_steps_from_state(task.compute_dag.init_state) mod_ref = tvm.build(sch, args, "llvm") - ctx = tvm.context(str(target), 0) + ctx = tvm.device(str(target), 0) np_arrays = [np.random.uniform(size=get_const_tuple(x.shape)).astype(x.dtype) for x in args] tvm_arrays = [tvm.nd.array(x, ctx) for x in np_arrays] diff --git a/tests/python/unittest/test_crt.py b/tests/python/unittest/test_crt.py index 1bd24c931b723..c9dfef4654ed5 100644 --- a/tests/python/unittest/test_crt.py +++ b/tests/python/unittest/test_crt.py @@ -92,11 +92,11 @@ def test_compile_runtime(): workspace = tvm.micro.Workspace() with _make_add_sess(workspace) as sess: - A_data = tvm.nd.array(np.array([2, 3], dtype="int8"), ctx=sess.context) + A_data = tvm.nd.array(np.array([2, 3], dtype="int8"), device=sess.device) assert (A_data.asnumpy() == np.array([2, 3])).all() - B_data = tvm.nd.array(np.array([4], dtype="int8"), ctx=sess.context) + B_data = tvm.nd.array(np.array([4], dtype="int8"), device=sess.device) assert (B_data.asnumpy() == np.array([4])).all() - C_data = tvm.nd.array(np.array([0, 0], dtype="int8"), ctx=sess.context) + C_data = tvm.nd.array(np.array([0, 0], dtype="int8"), device=sess.device) assert (C_data.asnumpy() == np.array([0, 0])).all() system_lib = sess.get_system_lib() @@ -157,11 +157,11 @@ def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), uint8]) { with _make_session(workspace, factory.get_lib()) as sess: graph_mod = tvm.micro.create_local_graph_runtime( - factory.get_json(), sess.get_system_lib(), sess.context + factory.get_json(), sess.get_system_lib(), sess.device ) - A_data = tvm.nd.array(np.array([2, 3], dtype="uint8"), ctx=sess.context) + A_data = tvm.nd.array(np.array([2, 3], dtype="uint8"), device=sess.device) assert (A_data.asnumpy() == np.array([2, 3])).all() - B_data = tvm.nd.array(np.array([4, 7], dtype="uint8"), ctx=sess.context) + B_data = tvm.nd.array(np.array([4, 7], dtype="uint8"), device=sess.device) assert (B_data.asnumpy() == np.array([4, 7])).all() graph_mod.run(a=A_data, b=B_data) @@ -178,11 +178,11 @@ def test_std_math_functions(): workspace = tvm.micro.Workspace() with _make_add_sess(workspace) as sess: - A_data = tvm.nd.array(np.array([2, 3], dtype="int8"), ctx=sess.context) + A_data = tvm.nd.array(np.array([2, 3], dtype="int8"), device=sess.device) assert (A_data.asnumpy() == np.array([2, 3])).all() - B_data = tvm.nd.array(np.array([4], dtype="int8"), ctx=sess.context) + B_data = tvm.nd.array(np.array([4], dtype="int8"), device=sess.device) assert (B_data.asnumpy() == np.array([4])).all() - C_data = tvm.nd.array(np.array([0, 0], dtype="int8"), ctx=sess.context) + C_data = tvm.nd.array(np.array([0, 0], dtype="int8"), device=sess.device) assert (C_data.asnumpy() == np.array([0, 0])).all() system_lib = 
sess.get_system_lib() @@ -194,8 +194,8 @@ def test_std_math_functions(): s = tvm.te.create_schedule(B.op) with _make_sess_from_op(workspace, "myexpf", s, [A, B]) as sess: - A_data = tvm.nd.array(np.array([2.0, 3.0], dtype="float32"), ctx=sess.context) - B_data = tvm.nd.array(np.array([2.0, 3.0], dtype="float32"), ctx=sess.context) + A_data = tvm.nd.array(np.array([2.0, 3.0], dtype="float32"), device=sess.device) + B_data = tvm.nd.array(np.array([2.0, 3.0], dtype="float32"), device=sess.device) lib = sess.get_system_lib() func = lib["myexpf"] func(A_data, B_data) @@ -213,11 +213,11 @@ def test_platform_timer(): s = tvm.te.create_schedule(B.op) with _make_sess_from_op(workspace, "myexpf", s, [A, B]) as sess: - A_data = tvm.nd.array(np.array([2.0, 3.0], dtype="float32"), ctx=sess.context) - B_data = tvm.nd.array(np.array([2.0, 3.0], dtype="float32"), ctx=sess.context) + A_data = tvm.nd.array(np.array([2.0, 3.0], dtype="float32"), device=sess.device) + B_data = tvm.nd.array(np.array([2.0, 3.0], dtype="float32"), device=sess.device) lib = sess.get_system_lib() time_eval_f = lib.time_evaluator( - "myexpf", sess.context, number=2000, repeat=3, min_repeat_ms=40 + "myexpf", sess.device, number=2000, repeat=3, min_repeat_ms=40 ) result = time_eval_f(A_data, B_data) assert result.mean > 0 diff --git a/tests/python/unittest/test_link_params.py b/tests/python/unittest/test_link_params.py index ffe859927ad7a..9fde832d08ce1 100644 --- a/tests/python/unittest/test_link_params.py +++ b/tests/python/unittest/test_link_params.py @@ -384,7 +384,7 @@ def test_crt_link_params(): flasher = compiler.flasher(**flasher_kw) with tvm.micro.Session(binary=micro_binary, flasher=flasher) as sess: graph_rt = tvm.micro.session.create_local_graph_runtime( - graph_json, sess.get_system_lib(), sess.context + graph_json, sess.get_system_lib(), sess.device ) # NOTE: not setting params here. 
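[Editor's note: the hunks above and below all apply the same mechanical migration: tvm.context(...) becomes tvm.device(...), the ctx= keyword on tvm.nd.array / tvm.nd.empty becomes device=, and attributes such as sess.context become sess.device. A minimal, self-contained sketch of the post-rename API, illustrative only and not part of the patch; it assumes a TVM build of this vintage with the llvm target enabled, and the kernel and names are invented for the example:]

    import numpy as np
    import tvm
    from tvm import te

    # Build a trivial element-wise add kernel for the CPU.
    n = 1024
    A = te.placeholder((n,), name="A")
    B = te.compute((n,), lambda i: A[i] + 1.0, name="B")
    s = te.create_schedule(B.op)
    f = tvm.build(s, [A, B], "llvm")

    # New style: tvm.device(...) replaces tvm.context(...).
    dev = tvm.device("llvm", 0)

    # New style: the device= keyword replaces ctx= on tvm.nd.array / tvm.nd.empty.
    a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), device=dev)
    b = tvm.nd.empty((n,), dtype=A.dtype, device=dev)

    f(a, b)
    np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)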
diff --git a/tests/python/unittest/test_runtime_container.py b/tests/python/unittest/test_runtime_container.py index 3e89a51cc93a5..4607892a5a4c2 100644 --- a/tests/python/unittest/test_runtime_container.py +++ b/tests/python/unittest/test_runtime_container.py @@ -47,7 +47,7 @@ def test_tuple_object(): fn = relay.Function([x], relay.expr.TupleGetItem(x, 0)) mod = tvm.IRModule.from_expr(fn) - exe = relay.create_executor(kind="vm", mod=mod, ctx=nd.cpu(), target="llvm") + exe = relay.create_executor(kind="vm", mod=mod, device=nd.cpu(), target="llvm") f = exe.evaluate() value_tuple = _container.tuple_object([nd.array(np.array(11)), nd.array(np.array(12))]) # pass an ADT object to evaluate diff --git a/tests/python/unittest/test_runtime_graph.py b/tests/python/unittest/test_runtime_graph.py index fe33c0f4f999e..f37d4089a8ee3 100644 --- a/tests/python/unittest/test_runtime_graph.py +++ b/tests/python/unittest/test_runtime_graph.py @@ -70,15 +70,15 @@ def check_remote(): server = rpc.Server("localhost") remote = rpc.connect(server.host, server.port) temp = utils.tempdir() - ctx = remote.cpu(0) + dev = remote.cpu(0) path_dso = temp.relpath("dev_lib.so") mlib.export_library(path_dso) remote.upload(path_dso) mlib = remote.load_module("dev_lib.so") mod = graph_runtime.create(graph, mlib, remote.cpu(0)) a = np.random.uniform(size=(n,)).astype(A.dtype) - mod.run(x=tvm.nd.array(a, ctx)) - out = tvm.nd.empty((n,), ctx=ctx) + mod.run(x=tvm.nd.array(a, dev)) + out = tvm.nd.empty((n,), device=dev) out = mod.get_output(0, out) np.testing.assert_equal(out.asnumpy(), a + 1) diff --git a/tests/python/unittest/test_runtime_graph_cuda_graph.py b/tests/python/unittest/test_runtime_graph_cuda_graph.py index 4a31873cb93cb..41c782a91d9b7 100644 --- a/tests/python/unittest/test_runtime_graph_cuda_graph.py +++ b/tests/python/unittest/test_runtime_graph_cuda_graph.py @@ -73,9 +73,9 @@ def test_graph_simple(): def check_verify(): mlib = tvm.build(s, [A, B], "cuda", name="myadd") - ctx = tvm.gpu(0) + dev = tvm.gpu(0) try: - mod = cuda_graph_runtime.create(graph, mlib, ctx) + mod = cuda_graph_runtime.create(graph, mlib, dev) except ValueError: return diff --git a/tests/python/unittest/test_runtime_graph_debug.py b/tests/python/unittest/test_runtime_graph_debug.py index 996d426efaa99..b0173d1e7b7f2 100644 --- a/tests/python/unittest/test_runtime_graph_debug.py +++ b/tests/python/unittest/test_runtime_graph_debug.py @@ -165,7 +165,7 @@ def check_remote(): server = rpc.Server("localhost") remote = rpc.connect(server.host, server.port) temp = utils.tempdir() - ctx = remote.cpu(0) + dev = remote.cpu(0) path_dso = temp.relpath("dev_lib.so") mlib.export_library(path_dso) remote.upload(path_dso) @@ -176,8 +176,8 @@ def check_remote(): print("Skip because debug runtime not enabled") return a = np.random.uniform(size=(n,)).astype(A.dtype) - mod.run(x=tvm.nd.array(a, ctx)) - out = tvm.nd.empty((n,), ctx=ctx) + mod.run(x=tvm.nd.array(a, dev)) + out = tvm.nd.empty((n,), device=dev) out = mod.get_output(0, out) np.testing.assert_equal(out.asnumpy(), a + 1) diff --git a/tests/python/unittest/test_runtime_heterogeneous.py b/tests/python/unittest/test_runtime_heterogeneous.py index 161f944ea7bb9..b3fbc2fdf2483 100644 --- a/tests/python/unittest/test_runtime_heterogeneous.py +++ b/tests/python/unittest/test_runtime_heterogeneous.py @@ -130,7 +130,7 @@ def test_simplex_data_transferring(): """ host = "cpu" target_host = "llvm" - host_ctx = tvm.context(host) + host_dev = tvm.device(host) if not tvm.runtime.enabled(target_host): print("Skip 
test because llvm is not enabled.") return @@ -140,8 +140,8 @@ def check_device(device, target_device): print("Skip test because {} is not enabled.".format(target_device)) return - device_ctx = tvm.context(device) - graph = get_simplex_graph(host_ctx.device_type, device_ctx.device_type) + device_dev = tvm.device(device) + graph = get_simplex_graph(host_dev.device_type, device_dev.device_type) shape = (4,) # Create module for add whose target is the device. @@ -171,8 +171,8 @@ def check_device(device, target_device): target_flist = {target_device: lower_add, target_host: lower_sub} mhost = tvm.build(target_flist, target_host=target_host) - ctx = [host_ctx, device_ctx] - mod = graph_runtime.create(graph, mhost, ctx) + dev = [host_dev, device_dev] + mod = graph_runtime.create(graph, mhost, dev) params = {} params["A"] = tensor_a = np.random.uniform(size=shape).astype(tensor_a.dtype) params["B"] = tensor_b = np.random.uniform(size=shape).astype(tensor_b.dtype) @@ -348,7 +348,7 @@ def test_duplex_data_transferring(): """ host = "cpu" target_host = "llvm" - host_ctx = tvm.context(host) + host_dev = tvm.device(host) if not tvm.runtime.enabled(target_host): print("Skip test because llvm is not enabled.") return @@ -358,8 +358,8 @@ def check_device(device, target_device): print("Skip test because {} is not enabled.".format(target_device)) return - device_ctx = tvm.context(device) - graph = get_duplex_graph(host_ctx.device_type, device_ctx.device_type) + device_dev = tvm.device(device) + graph = get_duplex_graph(host_dev.device_type, device_dev.device_type) shape = (4,) # Insert copy nodes for data transferring between add and sub nodes. @@ -400,7 +400,7 @@ def check_device(device, target_device): lower_add0.update(lower_add1) target_flist = {target_device: lower_add0, target_host: lower_sub} mhost = tvm.build(target_flist, target_host=target_host) - ctx = [host_ctx, device_ctx] + dev = [host_dev, device_dev] params = {} params["A"] = tensor_a = np.random.uniform(size=shape).astype(tensor_a.dtype) params["B"] = tensor_b = np.random.uniform(size=shape).astype(tensor_b.dtype) @@ -408,7 +408,7 @@ def check_device(device, target_device): params["D"] = tensor_d = np.random.uniform(size=shape).astype(tensor_d.dtype) def check_verify(): - mod = graph_runtime.create(graph, mhost, ctx) + mod = graph_runtime.create(graph, mhost, dev) mod.set_input(**params) mod.run() out = mod.get_output(0, tvm.nd.empty(shape)) @@ -422,7 +422,7 @@ def check_load_module(): out_file.write(graph) loaded_lib = tvm.runtime.load_module(path_lib) loaded_graph = open(temp.relpath("deploy.json")).read() - mod = graph_runtime.create(loaded_graph, loaded_lib, ctx) + mod = graph_runtime.create(loaded_graph, loaded_lib, dev) mod.set_input(**params) mod.run() out = mod.get_output(0, tvm.nd.empty(shape)) diff --git a/tests/python/unittest/test_runtime_module_based_interface.py b/tests/python/unittest/test_runtime_module_based_interface.py index 930011d4fd333..a9f8f6352d359 100644 --- a/tests/python/unittest/test_runtime_module_based_interface.py +++ b/tests/python/unittest/test_runtime_module_based_interface.py @@ -36,8 +36,8 @@ def verify(data): with relay.build_config(opt_level=3): graph, lib, graph_params = relay.build_module.build(mod, "llvm", params=params) - ctx = tvm.cpu() - module = graph_runtime.create(graph, lib, ctx) + dev = tvm.cpu() + module = graph_runtime.create(graph, lib, dev) module.set_input("data", data) module.set_input(**graph_params) module.run() @@ -54,8 +54,8 @@ def test_legacy_compatibility(): with 
relay.build_config(opt_level=3): graph, lib, graph_params = relay.build_module.build(mod, "llvm", params=params) data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32") - ctx = tvm.cpu() - module = graph_runtime.create(graph, lib, ctx) + dev = tvm.cpu() + module = graph_runtime.create(graph, lib, dev) module.set_input("data", data) module.set_input(**graph_params) module.run() @@ -72,8 +72,8 @@ def test_cpu(): complied_graph_lib = relay.build_module.build(mod, "llvm", params=params) data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32") # raw api - ctx = tvm.cpu() - gmod = complied_graph_lib["default"](ctx) + dev = tvm.cpu() + gmod = complied_graph_lib["default"](dev) set_input = gmod["set_input"] run = gmod["run"] get_output = gmod["get_output"] @@ -83,7 +83,7 @@ def test_cpu(): tvm.testing.assert_allclose(out, verify(data), atol=1e-5) # graph runtime wrapper - gmod = graph_runtime.GraphModule(complied_graph_lib["default"](ctx)) + gmod = graph_runtime.GraphModule(complied_graph_lib["default"](dev)) gmod.set_input("data", data) gmod.run() out = gmod.get_output(0).asnumpy() @@ -97,10 +97,10 @@ def test_gpu(): with relay.build_config(opt_level=3): complied_graph_lib = relay.build_module.build(mod, "cuda", params=params) data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32") - ctx = tvm.gpu() + dev = tvm.gpu() # raw api - gmod = complied_graph_lib["default"](ctx) + gmod = complied_graph_lib["default"](dev) set_input = gmod["set_input"] run = gmod["run"] get_output = gmod["get_output"] @@ -110,7 +110,7 @@ def test_gpu(): tvm.testing.assert_allclose(out, verify(data), atol=1e-5) # graph runtime wrapper - gmod = graph_runtime.GraphModule(complied_graph_lib["default"](ctx)) + gmod = graph_runtime.GraphModule(complied_graph_lib["default"](dev)) gmod.set_input("data", data) gmod.run() out = gmod.get_output(0).asnumpy() @@ -138,8 +138,8 @@ def verify_cpu_export(obj_format): path_lib = temp.relpath(file_name) complied_graph_lib.export_library(path_lib) loaded_lib = tvm.runtime.load_module(path_lib) - ctx = tvm.cpu(0) - gmod = loaded_lib["default"](ctx) + dev = tvm.cpu(0) + gmod = loaded_lib["default"](dev) # raw api set_input = gmod["set_input"] @@ -152,7 +152,7 @@ def verify_cpu_export(obj_format): tvm.testing.assert_allclose(out, verify(data), atol=1e-5) # graph runtime wrapper - gmod = graph_runtime.GraphModule(loaded_lib["default"](ctx)) + gmod = graph_runtime.GraphModule(loaded_lib["default"](dev)) gmod.set_input("data", data) gmod.run() out = gmod.get_output(0).asnumpy() @@ -178,10 +178,10 @@ def verify_gpu_export(obj_format): complied_graph_lib.export_library(path_lib) loaded_lib = tvm.runtime.load_module(path_lib) data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32") - ctx = tvm.gpu() + dev = tvm.gpu() # raw api - gmod = loaded_lib["default"](ctx) + gmod = loaded_lib["default"](dev) set_input = gmod["set_input"] run = gmod["run"] get_output = gmod["get_output"] @@ -191,7 +191,7 @@ def verify_gpu_export(obj_format): tvm.testing.assert_allclose(out, verify(data), atol=1e-5) # graph runtime wrapper - gmod = graph_runtime.GraphModule(loaded_lib["default"](ctx)) + gmod = graph_runtime.GraphModule(loaded_lib["default"](dev)) gmod.set_input("data", data) gmod.run() out = gmod.get_output(0).asnumpy() @@ -222,20 +222,20 @@ def verify_rpc_cpu_export(obj_format): remote.upload(path_lib) loaded_lib = remote.load_module(path_lib) data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32") - ctx = remote.cpu() + dev = 
remote.cpu() # raw api - gmod = loaded_lib["default"](ctx) + gmod = loaded_lib["default"](dev) set_input = gmod["set_input"] run = gmod["run"] get_output = gmod["get_output"] - set_input("data", tvm.nd.array(data, ctx=ctx)) + set_input("data", tvm.nd.array(data, device=dev)) run() out = get_output(0).asnumpy() tvm.testing.assert_allclose(out, verify(data), atol=1e-5) # graph runtime wrapper - gmod = graph_runtime.GraphModule(loaded_lib["default"](ctx)) + gmod = graph_runtime.GraphModule(loaded_lib["default"](dev)) gmod.set_input("data", data) gmod.run() out = gmod.get_output(0).asnumpy() @@ -267,20 +267,20 @@ def verify_rpc_gpu_export(obj_format): remote.upload(path_lib) loaded_lib = remote.load_module(path_lib) data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32") - ctx = remote.gpu() + dev = remote.gpu() # raw api - gmod = loaded_lib["default"](ctx) + gmod = loaded_lib["default"](dev) set_input = gmod["set_input"] run = gmod["run"] get_output = gmod["get_output"] - set_input("data", tvm.nd.array(data, ctx=ctx)) + set_input("data", tvm.nd.array(data, device=dev)) run() out = get_output(0).asnumpy() tvm.testing.assert_allclose(out, verify(data), atol=1e-5) # graph runtime wrapper - gmod = graph_runtime.GraphModule(loaded_lib["default"](ctx)) + gmod = graph_runtime.GraphModule(loaded_lib["default"](dev)) gmod.set_input("data", data) gmod.run() out = gmod.get_output(0).asnumpy() @@ -318,10 +318,10 @@ def verify_cpu_remove_package_params(obj_format): fo.write(runtime.save_param_dict(complied_graph_lib.get_params())) loaded_lib = tvm.runtime.load_module(path_lib) data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # raw api - gmod = loaded_lib["default"](ctx) + gmod = loaded_lib["default"](dev) set_input = gmod["set_input"] run = gmod["run"] get_output = gmod["get_output"] @@ -334,7 +334,7 @@ def verify_cpu_remove_package_params(obj_format): tvm.testing.assert_allclose(out, verify(data), atol=1e-5) # graph runtime wrapper - gmod = graph_runtime.GraphModule(loaded_lib["default"](ctx)) + gmod = graph_runtime.GraphModule(loaded_lib["default"](dev)) loaded_params = bytearray(open(temp.relpath("deploy_param.params"), "rb").read()) gmod.set_input("data", data) gmod.load_params(loaded_params) @@ -365,10 +365,10 @@ def verify_gpu_remove_package_params(obj_format): fo.write(runtime.save_param_dict(complied_graph_lib.get_params())) loaded_lib = tvm.runtime.load_module(path_lib) data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32") - ctx = tvm.gpu(0) + dev = tvm.gpu(0) # raw api - gmod = loaded_lib["default"](ctx) + gmod = loaded_lib["default"](dev) set_input = gmod["set_input"] run = gmod["run"] get_output = gmod["get_output"] @@ -381,7 +381,7 @@ def verify_gpu_remove_package_params(obj_format): tvm.testing.assert_allclose(out, verify(data), atol=1e-5) # graph runtime wrapper - gmod = graph_runtime.GraphModule(loaded_lib["default"](ctx)) + gmod = graph_runtime.GraphModule(loaded_lib["default"](dev)) loaded_params = bytearray(open(temp.relpath("deploy_param.params"), "rb").read()) gmod.set_input("data", data) gmod.load_params(loaded_params) @@ -418,23 +418,23 @@ def verify_rpc_cpu_remove_package_params(obj_format): remote.upload(path_lib) loaded_lib = remote.load_module(path_lib) data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32") - ctx = remote.cpu() + dev = remote.cpu() # raw api - gmod = loaded_lib["default"](ctx) + gmod = loaded_lib["default"](dev) set_input = gmod["set_input"] run = 
gmod["run"] get_output = gmod["get_output"] load_params = gmod["load_params"] loaded_params = bytearray(open(path_params, "rb").read()) - set_input("data", tvm.nd.array(data, ctx=ctx)) + set_input("data", tvm.nd.array(data, device=dev)) load_params(loaded_params) run() out = get_output(0).asnumpy() tvm.testing.assert_allclose(out, verify(data), atol=1e-5) # graph runtime wrapper - gmod = graph_runtime.GraphModule(loaded_lib["default"](ctx)) + gmod = graph_runtime.GraphModule(loaded_lib["default"](dev)) loaded_params = bytearray(open(path_params, "rb").read()) gmod.set_input("data", data) gmod.load_params(loaded_params) @@ -471,23 +471,23 @@ def verify_rpc_gpu_remove_package_params(obj_format): remote.upload(path_lib) loaded_lib = remote.load_module(path_lib) data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32") - ctx = remote.gpu() + dev = remote.gpu() # raw api - gmod = loaded_lib["default"](ctx) + gmod = loaded_lib["default"](dev) set_input = gmod["set_input"] run = gmod["run"] get_output = gmod["get_output"] load_params = gmod["load_params"] loaded_params = bytearray(open(path_params, "rb").read()) - set_input("data", tvm.nd.array(data, ctx=ctx)) + set_input("data", tvm.nd.array(data, device=dev)) load_params(loaded_params) run() out = get_output(0).asnumpy() tvm.testing.assert_allclose(out, verify(data), atol=1e-5) # graph runtime wrapper - gmod = graph_runtime.GraphModule(loaded_lib["default"](ctx)) + gmod = graph_runtime.GraphModule(loaded_lib["default"](dev)) loaded_params = bytearray(open(path_params, "rb").read()) gmod.set_input("data", data) gmod.load_params(loaded_params) @@ -512,9 +512,9 @@ def test_debug_graph_runtime(): data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32") # raw api - ctx = tvm.cpu() + dev = tvm.cpu() try: - gmod = complied_graph_lib["debug_create"]("default", ctx) + gmod = complied_graph_lib["debug_create"]("default", dev) except: print("Skip because debug graph_runtime not enabled") return @@ -528,8 +528,8 @@ def test_debug_graph_runtime(): # debug graph runtime wrapper debug_g_mod = debug_runtime.GraphModuleDebug( - complied_graph_lib["debug_create"]("default", ctx), - [ctx], + complied_graph_lib["debug_create"]("default", dev), + [dev], complied_graph_lib.get_json(), None, ) @@ -546,9 +546,9 @@ def test_cuda_graph_runtime(): complied_graph_lib = relay.build_module.build(mod, "cuda", params=params) data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32") - ctx = tvm.gpu() + dev = tvm.gpu() try: - gmod = complied_graph_lib["cuda_graph_create"](ctx) + gmod = complied_graph_lib["cuda_graph_create"](dev) except: print("Skip because cuda_graph not enabled") return diff --git a/tests/python/unittest/test_runtime_module_load.py b/tests/python/unittest/test_runtime_module_load.py index 38800e8de6ad4..c34b2f292d331 100644 --- a/tests/python/unittest/test_runtime_module_load.py +++ b/tests/python/unittest/test_runtime_module_load.py @@ -105,7 +105,7 @@ def test_device_module_dump(): s[B].bind(tx, te.thread_axis("threadIdx.x")) def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return @@ -123,8 +123,8 @@ def check_device(device): f.export_library(path_dso, cc.cross_compiler("g++")) f1 = tvm.runtime.load_module(path_dso) - a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) + a = 
tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev) f1(a, b) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) if sys.platform != "win32": @@ -133,7 +133,7 @@ def check_device(device): np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) def check_stackvm(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return @@ -143,8 +143,8 @@ def check_stackvm(device): path_dso = temp.relpath("dev_lib.stackvm") f.export_library(path_dso) f1 = tvm.runtime.load_module(path_dso) - a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev) f(a, b) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) @@ -163,7 +163,7 @@ def test_combine_module_llvm(): s = te.create_schedule(B.op) def check_llvm(): - ctx = tvm.cpu(0) + dev = tvm.cpu(0) if not tvm.testing.device_enabled("llvm"): print("Skip because llvm is not enabled") return @@ -180,15 +180,15 @@ def check_llvm(): m = tvm.runtime.load_module(path_dso) fadd1 = m["myadd1"] fadd2 = m["myadd2"] - a = tvm.nd.array(np.random.uniform(size=nn).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(nn, dtype=A.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=nn).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(nn, dtype=A.dtype), dev) fadd1(a, b) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) fadd2(a, b) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) def check_system_lib(): - ctx = tvm.cpu(0) + dev = tvm.cpu(0) if not tvm.testing.device_enabled("llvm"): print("Skip because llvm is not enabled") return @@ -205,8 +205,8 @@ def check_system_lib(): dll = ctypes.CDLL(path_dso) # Load the system wide library mm = tvm.runtime.system_lib() - a = tvm.nd.array(np.random.uniform(size=nn).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(nn, dtype=A.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=nn).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(nn, dtype=A.dtype), dev) mm["myadd1"](a, b) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) mm["myadd2"](a, b) diff --git a/tests/python/unittest/test_runtime_rpc.py b/tests/python/unittest/test_runtime_rpc.py index 11c109810fbb2..7ec09d8b9b486 100644 --- a/tests/python/unittest/test_runtime_rpc.py +++ b/tests/python/unittest/test_runtime_rpc.py @@ -67,9 +67,9 @@ def verify_rpc(remote, target, shape, dtype): s = te.create_schedule(B.op) f = tvm.build(s, [A, B], target, name="myadd") - ctx = remote.cpu(0) - a = tvm.nd.array(np.random.randint(0, 256, size=shape).astype(A.dtype), ctx=ctx) - b = tvm.nd.array(np.zeros(shape).astype(A.dtype), ctx=ctx) + dev = remote.cpu(0) + a = tvm.nd.array(np.random.randint(0, 256, size=shape).astype(A.dtype), device=dev) + b = tvm.nd.array(np.zeros(shape).astype(A.dtype), device=dev) temp = utils.tempdir() path_dso = temp.relpath("dev_lib.o") f.save(path_dso) @@ -143,7 +143,7 @@ def test_rpc_array(): server = rpc.Server("localhost") remote = rpc.connect(server.host, server.port) r_cpu = tvm.nd.array(x, remote.cpu(0)) - assert str(r_cpu.context).startswith("remote") + assert str(r_cpu.device).startswith("remote") np.testing.assert_equal(r_cpu.asnumpy(), x) fremote = remote.get_function("rpc.test.remote_array_func") fremote(r_cpu) @@ -154,11 +154,11 @@ def test_rpc_large_array(): # testcase of large array 
creation server = rpc.Server("localhost") remote = rpc.connect(server.host, server.port) - ctx = remote.cpu(0) + dev = remote.cpu(0) a_np = np.ones((5041, 720)).astype("float32") b_np = np.ones((720, 192)).astype("float32") - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) np.testing.assert_equal(a.asnumpy(), a_np) np.testing.assert_equal(b.asnumpy(), b_np) @@ -238,14 +238,14 @@ def test_rpc_remote_module(): def check_remote(remote): temp = utils.tempdir() - ctx = remote.cpu(0) + dev = remote.cpu(0) f = tvm.build(s, [A, B], "llvm", name="myadd") path_dso = temp.relpath("dev_lib.so") f.export_library(path_dso) remote.upload(path_dso) f1 = remote.load_module("dev_lib.so") - a = tvm.nd.array(np.random.uniform(size=102).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(102, dtype=A.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=102).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(102, dtype=A.dtype), dev) time_f = f1.time_evaluator(f1.entry_name, remote.cpu(0), number=10) cost = time_f(a, b).mean print("%g secs/op" % cost) @@ -278,11 +278,11 @@ def check_minrpc(): # statrt the minrpc session. remote = tvm.rpc.PopenSession(path_minrpc) - ctx = remote.cpu(0) + dev = remote.cpu(0) f1 = remote.system_lib() - a = tvm.nd.array(np.random.uniform(size=102).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(102, dtype=A.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=102).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(102, dtype=A.dtype), dev) time_f = f1.time_evaluator("myadd", remote.cpu(0), number=1) cost = time_f(a, b).mean np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) @@ -304,7 +304,7 @@ def check_remote_link_cl(remote): print("Skip because opencl is not enabled") return temp = utils.tempdir() - ctx = remote.cl(0) + dev = remote.cl(0) s = te.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=32) s[B].bind(xo, te.thread_axis("blockIdx.x")) @@ -323,8 +323,8 @@ def check_remote_link_cl(remote): fhost = remote.load_module("myadd.o") fdev = remote.load_module("myadd.cl") fhost.import_module(fdev) - a = tvm.nd.array(np.random.uniform(size=102).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(102, dtype=A.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=102).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(102, dtype=A.dtype), dev) fhost(a, b) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) # Option 2: export library as a tar ball then handled by remote compiler @@ -332,8 +332,8 @@ def check_remote_link_cl(remote): f.export_library(path_tar) remote.upload(path_tar) fhost = remote.load_module("myadd.tar") - a = tvm.nd.array(np.random.uniform(size=102).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(102, dtype=A.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=102).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(102, dtype=A.dtype), dev) fhost(a, b) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) @@ -377,7 +377,7 @@ def check_multi_hop(): assert fecho("xyz") == "xyz" assert bytes(fecho(bytearray(b"123"))) == b"123" - nd = tvm.nd.array([1, 2, 3], ctx=client.cpu(0)) + nd = tvm.nd.array([1, 2, 3], device=client.cpu(0)) assert nd.asnumpy()[1] == 2 def check_error_handling(): diff --git a/tests/python/unittest/test_runtime_vm_profiler.py b/tests/python/unittest/test_runtime_vm_profiler.py index 29715222e4299..c1c94211a664c 100644 --- a/tests/python/unittest/test_runtime_vm_profiler.py +++ b/tests/python/unittest/test_runtime_vm_profiler.py @@ -26,9 +26,9 @@ def test_basic(): 
if not profiler_vm.enabled(): return - for target, ctx in enabled_targets(): + for target, dev in enabled_targets(): exe = relay.vm.compile(mod, target, params=params) - vm = profiler_vm.VirtualMachineProfiler(exe, ctx) + vm = profiler_vm.VirtualMachineProfiler(exe, dev) data = np.random.rand(1, 3, 224, 224).astype("float32") res = vm.invoke("main", [data]) diff --git a/tests/python/unittest/test_target_codegen_blob.py b/tests/python/unittest/test_target_codegen_blob.py index dc42381cf82d4..3429b39c6ac38 100644 --- a/tests/python/unittest/test_target_codegen_blob.py +++ b/tests/python/unittest/test_target_codegen_blob.py @@ -38,8 +38,8 @@ def verify(data): mod, params = relay.testing.synthetic.get_workload(input_shape=input_shape) with tvm.transform.PassContext(opt_level=3): lib = relay.build_module.build(mod, "llvm", params=params) - ctx = tvm.cpu() - module = graph_runtime.GraphModule(lib["default"](ctx)) + dev = tvm.cpu() + module = graph_runtime.GraphModule(lib["default"](dev)) module.set_input("data", data) module.run() out = module.get_output(0).asnumpy() @@ -57,8 +57,8 @@ def verify(data): loaded_lib = tvm.runtime.load_module(path_lib) data = np.random.uniform(-1, 1, size=input_shape).astype("float32") - ctx = tvm.gpu() - module = graph_runtime.GraphModule(loaded_lib["default"](ctx)) + dev = tvm.gpu() + module = graph_runtime.GraphModule(loaded_lib["default"](dev)) module.set_input("data", data) module.run() out = module.get_output(0).asnumpy() @@ -68,7 +68,7 @@ def verify(data): @tvm.testing.uses_gpu def test_cuda_lib(): - ctx = tvm.gpu(0) + dev = tvm.gpu(0) for device in ["llvm", "cuda"]: if not tvm.testing.device_enabled(device): print("skip because %s is not enabled..." % device) @@ -89,8 +89,8 @@ def test_cuda_lib(): path_lib = temp.relpath("deploy_lib.so") fn_add.export_library(path_lib) m = tvm.runtime.load_module(path_lib) - a = tvm.nd.array(np.random.uniform(size=nn).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(nn, dtype=A.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=nn).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(nn, dtype=A.dtype), dev) m["add"](a, b) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) diff --git a/tests/python/unittest/test_target_codegen_bool.py b/tests/python/unittest/test_target_codegen_bool.py index f4b5f90435d21..527741a5ef0dc 100644 --- a/tests/python/unittest/test_target_codegen_bool.py +++ b/tests/python/unittest/test_target_codegen_bool.py @@ -39,11 +39,11 @@ def check_llvm(): s[C].parallel(xo2) # BUILD and invoke the kernel. 
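# Compile for the LLVM CPU backend; only A, B, and D are bound to real
# buffers below, so the boolean stage C stays internal to the generated kernel.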
f = tvm.build(s, [A, B, D], "llvm") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) a_np = np.random.uniform(size=n).astype(A.dtype) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) - d = tvm.nd.array(np.zeros(n, dtype=D.dtype), ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) + d = tvm.nd.array(np.zeros(n, dtype=D.dtype), dev) f(a, b, d) np.testing.assert_equal( d.asnumpy(), @@ -53,7 +53,7 @@ def check_llvm(): def check_device(device): if not tvm.testing.device_enabled(device): return - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) s = te.create_schedule(D.op) for stage in [C, D]: xo, xi = s[stage].split(stage.op.axis[0], factor=4) @@ -61,9 +61,9 @@ def check_device(device): s[stage].bind(xi, te.thread_axis("threadIdx.x")) f = tvm.build(s, [A, B, D], device) a_np = np.random.uniform(size=n).astype(A.dtype) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) - d = tvm.nd.array(np.zeros(n, dtype=D.dtype), ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) + d = tvm.nd.array(np.zeros(n, dtype=D.dtype), dev) f(a, b, d) np.testing.assert_equal( d.asnumpy(), diff --git a/tests/python/unittest/test_target_codegen_c_host.py b/tests/python/unittest/test_target_codegen_c_host.py index d1ca8b1450f00..6102beba0d257 100644 --- a/tests/python/unittest/test_target_codegen_c_host.py +++ b/tests/python/unittest/test_target_codegen_c_host.py @@ -36,12 +36,12 @@ def check_c(): mhost.export_library(path_dso) m = tvm.runtime.load_module(path_dso) fadd = m["test_fadd"] - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # launch the kernel. n = nn - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) fadd(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) @@ -81,12 +81,12 @@ def check_c(): mhost.export_library(path_dso) m = tvm.runtime.load_module(path_dso) fadd = m["test_fadd_pipeline"] - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # launch the kernel. 
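# Feed the exported C-host module two random float vectors allocated on the
# CPU device handle; c receives the elementwise sum that is checked below.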
n = nn - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) fadd(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) @@ -109,10 +109,10 @@ def check_c(): mhost.export_library(path_dso) m = tvm.runtime.load_module(path_dso) fadd = m["test_reinterpret"] - ctx = tvm.cpu(0) + dev = tvm.cpu(0) n = nn - a = tvm.nd.array(np.random.randint(-(2 ** 30), 2 ** 30, size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx) + a = tvm.nd.array(np.random.randint(-(2 ** 30), 2 ** 30, size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev) fadd(a, b) tvm.testing.assert_allclose(b.asnumpy(), (2 + a.asnumpy()).view("float32")) @@ -133,10 +133,10 @@ def check_c(): mhost.export_library(path_dso) m = tvm.runtime.load_module(path_dso) fceil = m["test_ceil"] - ctx = tvm.cpu(0) + dev = tvm.cpu(0) n = nn - a = tvm.nd.array(np.random.rand(n).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx) + a = tvm.nd.array(np.random.rand(n).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev) fceil(a, b) tvm.testing.assert_allclose(b.asnumpy(), (np.ceil(a.asnumpy()).view("float32"))) @@ -157,10 +157,10 @@ def check_c(): mhost.export_library(path_dso) m = tvm.runtime.load_module(path_dso) ffloor = m["test_floor"] - ctx = tvm.cpu(0) + dev = tvm.cpu(0) n = nn - a = tvm.nd.array(np.random.rand(n).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx) + a = tvm.nd.array(np.random.rand(n).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev) ffloor(a, b) tvm.testing.assert_allclose(b.asnumpy(), (np.floor(a.asnumpy()).view("float32"))) @@ -181,10 +181,10 @@ def check_c(): mhost.export_library(path_dso) m = tvm.runtime.load_module(path_dso) fround = m["test_round"] - ctx = tvm.cpu(0) + dev = tvm.cpu(0) n = nn - a = tvm.nd.array(np.random.rand(n).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx) + a = tvm.nd.array(np.random.rand(n).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev) fround(a, b) tvm.testing.assert_allclose(b.asnumpy(), (np.round(a.asnumpy()).view("float32"))) diff --git a/tests/python/unittest/test_target_codegen_cross_llvm.py b/tests/python/unittest/test_target_codegen_cross_llvm.py index a55530a090e44..feb1d43d5ce2b 100644 --- a/tests/python/unittest/test_target_codegen_cross_llvm.py +++ b/tests/python/unittest/test_target_codegen_cross_llvm.py @@ -78,11 +78,11 @@ def build_arm(): if remote: remote.upload(path) farm = remote.load_module("myadd.o") - ctx = remote.cpu(0) + dev = remote.cpu(0) n = nn - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) farm(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) print("Verification finished on remote.") diff --git a/tests/python/unittest/test_target_codegen_cuda.py b/tests/python/unittest/test_target_codegen_cuda.py index
06d7cb4bb7bb1..a63aeaa40f9d2 100644 --- a/tests/python/unittest/test_target_codegen_cuda.py +++ b/tests/python/unittest/test_target_codegen_cuda.py @@ -46,9 +46,9 @@ def check_cuda(dtype, n, lanes): s[B].bind(xo, bx) s[B].bind(xi, tx) fun = tvm.build(s, [A, B], "cuda") - ctx = tvm.gpu(0) - a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(np.random.uniform(size=(n, lanes))) - c = tvm.nd.empty((n,), B.dtype, ctx) + dev = tvm.gpu(0) + a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(np.random.uniform(size=(n, lanes))) + c = tvm.nd.empty((n,), B.dtype, dev) fun(a, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1) @@ -99,13 +99,13 @@ def check_cuda(n, lanes): disabled_pass=["tir.BF16Promote", "tir.BF16CastElimination", "tir.BF16TypeLowering"] ): fun = tvm.build(s, [A, B], "cuda") - ctx = tvm.gpu(0) + dev = tvm.gpu(0) np_a = np.random.uniform(size=(n, lanes)).astype("float32") np_a = np_bf162np_float(np_float2np_bf16(np_a)) - a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(np_float2np_bf16(np_a)) - c = tvm.nd.empty((n,), B.dtype, ctx) + a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(np_float2np_bf16(np_a)) + c = tvm.nd.empty((n,), B.dtype, dev) fun(a, c) - c = tvm.nd.empty((n, lanes), "uint16", ctx).copyfrom(c) + c = tvm.nd.empty((n, lanes), "uint16", dev).copyfrom(c) tvm.testing.assert_allclose(c.asnumpy(), np_float2np_bf16(np_a + 1)) check_cuda(64, 2) @@ -138,11 +138,11 @@ def check_cuda(dtype, n, lanes): np_b = np.random.randint(low=-128, high=127, size=(n, lanes)) np_c = np.random.randint(low=0, high=127, size=(n,)) np_d = [sum(x * y) + z for x, y, z in zip(np_a, np_b, np_c)] - ctx = tvm.gpu(0) - a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(np_a) - b = tvm.nd.empty((n,), B.dtype, ctx).copyfrom(np_b) - c = tvm.nd.empty((n,), C.dtype, ctx).copyfrom(np_c) - d = tvm.nd.empty((n,), D.dtype, ctx) + dev = tvm.gpu(0) + a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(np_a) + b = tvm.nd.empty((n,), B.dtype, dev).copyfrom(np_b) + c = tvm.nd.empty((n,), C.dtype, dev).copyfrom(np_c) + d = tvm.nd.empty((n,), D.dtype, dev) fun(a, b, c, d) tvm.testing.assert_allclose(d.asnumpy(), np_d) @@ -155,7 +155,7 @@ def test_cuda_vectorize_load(): num_thread = 8 def check_cuda(dtype, n, lanes): - ctx = tvm.gpu(0) + dev = tvm.gpu(0) A = te.placeholder((n,), name="A", dtype="%sx%d" % (dtype, lanes)) B = te.compute((n,), lambda i: A[i], name="B") s = te.create_schedule(B.op) @@ -164,8 +164,8 @@ def check_cuda(dtype, n, lanes): s[B].bind(thread, tx) fun = tvm.build(s, [A, B], "cuda", name="vector_load") np_a = np.random.randint(low=-128, high=127, size=(n, lanes)) - a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(np_a) - b = tvm.nd.empty((n,), B.dtype, ctx) + a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(np_a) + b = tvm.nd.empty((n,), B.dtype, dev) fun(a, b) tvm.testing.assert_allclose(a.asnumpy(), b.asnumpy()) @@ -181,7 +181,7 @@ def check_cuda(dtype, n, lanes): def test_cuda_make_int8(): def check_cuda(n, value, lanes): dtype = "int8" - ctx = tvm.gpu(0) + dev = tvm.gpu(0) A = te.compute((n, lanes), lambda i, j: tvm.tir.const(value, dtype=dtype)) s = te.create_schedule(A.op) y, x = s[A].op.axis @@ -189,7 +189,7 @@ def check_cuda(n, value, lanes): s[A].bind(y, bx) fun = tvm.build(s, [A], "cuda", name="make_int8x4") np_a = np.full((n, lanes), value, dtype=dtype) - a = tvm.nd.empty(np_a.shape, dtype, ctx) + a = tvm.nd.empty(np_a.shape, dtype, dev) fun(a) np.testing.assert_equal(a.asnumpy(), np_a) @@ -209,26 +209,26 @@ def check_cuda(n, value, lanes): def test_cuda_inf_nan(): target = "cuda" - def check_inf_nan(ctx, n, value, 
dtype): + def check_inf_nan(dev, n, value, dtype): A = te.placeholder((n,), name="A", dtype=dtype) inf_value = tvm.tir.const(value, dtype=dtype) C = te.compute((n,), lambda i: inf_value, name="C") s = te.create_schedule(C.op) s[C].bind(s[C].op.axis[0], tx) fun = tvm.build(s, [A, C], target) - a = tvm.nd.empty((n,), A.dtype, ctx) - c = tvm.nd.empty((n,), A.dtype, ctx) + a = tvm.nd.empty((n,), A.dtype, dev) + c = tvm.nd.empty((n,), A.dtype, dev) # Only need to test compiling here fun(a, c) - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) - check_inf_nan(ctx, 1, -float("inf"), "float32") - check_inf_nan(ctx, 1, -float("inf"), "float64") - check_inf_nan(ctx, 1, float("inf"), "float32") - check_inf_nan(ctx, 1, float("inf"), "float64") - check_inf_nan(ctx, 1, float("nan"), "float32") - check_inf_nan(ctx, 1, float("nan"), "float64") + check_inf_nan(dev, 1, -float("inf"), "float32") + check_inf_nan(dev, 1, -float("inf"), "float64") + check_inf_nan(dev, 1, float("inf"), "float32") + check_inf_nan(dev, 1, float("inf"), "float64") + check_inf_nan(dev, 1, float("nan"), "float32") + check_inf_nan(dev, 1, float("nan"), "float64") @tvm.testing.requires_gpu @@ -285,7 +285,7 @@ def _transform(f, *_): @tvm.testing.parametrize_targets("cuda", "rocm") -def test_crossthread_reduction1(target, ctx): +def test_crossthread_reduction1(target, dev): n = te.var("n") m = te.var("m") A = te.placeholder((n, m), name="A") @@ -307,8 +307,8 @@ def verify(nthd): vals = [nthd - 1, nthd, nthd + 1] for kk in [x for x in vals]: size = (nn, kk) - a = tvm.nd.array(np.random.uniform(size=size).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(nn, dtype=B.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=size).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(nn, dtype=B.dtype), dev) func(a, b) tvm.testing.assert_allclose(b.asnumpy(), np.sum(a.asnumpy(), axis=1), rtol=1e-3) @@ -318,7 +318,7 @@ def verify(nthd): @tvm.testing.parametrize_targets("cuda", "rocm") -def test_crossthread_reduction2(target, ctx): +def test_crossthread_reduction2(target, dev): n = te.var("n") k0 = te.var("k0") k1 = te.var("k1") @@ -345,8 +345,8 @@ def verify(nthdx, nthdy): vy = [nthdy - 1, nthdy, nthdy + 1] for kk0, kk1 in [(x, y) for x in vx for y in vy]: size = (nn, kk0, kk1) - a = tvm.nd.array(np.random.uniform(size=size).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(nn, dtype=B.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=size).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(nn, dtype=B.dtype), dev) func(a, b) tvm.testing.assert_allclose(b.asnumpy(), np.sum(a.asnumpy(), axis=(1, 2)), rtol=1e-3) @@ -373,7 +373,7 @@ def test_cuda_reduction_binding(): @tvm.testing.parametrize_targets("cuda", "rocm") -def test_rfactor_predicates(target, ctx): +def test_rfactor_predicates(target, dev): n = te.reduce_axis((0, 129), "n") A = te.placeholder((129,), name="A") B = te.compute((1,), lambda b: te.sum(A[n], axis=n), name="B") @@ -419,11 +419,11 @@ def test_cuda_const_float_to_half(): s[c].bind(tx, te.thread_axis("threadIdx.x")) func = tvm.build(s, [a, c], "cuda") - ctx = tvm.gpu(0) + dev = tvm.gpu(0) a_np = np.random.uniform(size=shape).astype(a.dtype) c_np = np.zeros(shape=shape, dtype=c.dtype) - a = tvm.nd.array(a_np, ctx) - c = tvm.nd.array(c_np, ctx) + a = tvm.nd.array(a_np, dev) + c = tvm.nd.array(c_np, dev) func(a, c) np.testing.assert_equal(c.asnumpy(), a_np > b.value) @@ -435,7 +435,7 @@ def check(device, dtype, m=32, n=32): if not tvm.testing.device_enabled(device): print("Skipping", device) return - ctx = 
tvm.context(device, 0) + dev = tvm.device(device, 0) a = te.placeholder((m, n), name="a", dtype=dtype) b = te.placeholder((m, n), name="b", dtype=dtype) c = a + b @@ -448,9 +448,9 @@ def check(device, dtype, m=32, n=32): a_np = np.random.uniform(size=(m, n)).astype(a.dtype) b_np = np.random.uniform(size=(m, n)).astype(b.dtype) g_np = np.sum(np.add(a_np * b_np, a_np + b_np)) - a_nd = tvm.nd.array(a_np, ctx) - b_nd = tvm.nd.array(b_np, ctx) - g_nd = tvm.nd.array(np.zeros(g_np.shape, dtype=g_np.dtype), ctx) + a_nd = tvm.nd.array(a_np, dev) + b_nd = tvm.nd.array(b_np, dev) + g_nd = tvm.nd.array(np.zeros(g_np.shape, dtype=g_np.dtype), dev) func(a_nd, b_nd, g_nd) tvm.testing.assert_allclose(g_nd.asnumpy(), g_np, rtol=1e-3) @@ -466,8 +466,8 @@ def check(device, dtype, m=32, n=32): if not tvm.testing.device_enabled(device): print("Skipping", device) return - ctx = tvm.context(device, 0) - if dtype == "float16" and not have_fp16(ctx.compute_version): + dev = tvm.device(device, 0) + if dtype == "float16" and not have_fp16(dev.compute_version): print("Skip because gpu does not have fp16 support") return @@ -480,8 +480,8 @@ def check(device, dtype, m=32, n=32): func = tvm.build(sb, [a, b], device) a_np = np.random.uniform(size=(m, n)).astype(a.dtype) b_np = np.sum(a_np) - a_nd = tvm.nd.array(a_np, ctx) - b_nd = tvm.nd.array(np.zeros(b_np.shape, dtype=b_np.dtype), ctx) + a_nd = tvm.nd.array(a_np, dev) + b_nd = tvm.nd.array(np.zeros(b_np.shape, dtype=b_np.dtype), dev) func(a_nd, b_nd) tvm.testing.assert_allclose(b_nd.asnumpy(), b_np, rtol=1e-3) @@ -507,11 +507,11 @@ def test_cuda_floordiv_with_vectorization(): s[B].bind(xio, tx) func = tvm.build(s, [A, B], "cuda") - ctx = tvm.gpu(0) + dev = tvm.gpu(0) a_np = np.random.uniform(size=(n,)).astype(A.dtype) b_np = np.array([a_np[i // k] for i in range(0, n)]) - a_nd = tvm.nd.array(a_np, ctx) - b_nd = tvm.nd.array(np.zeros(b_np.shape, dtype=b_np.dtype), ctx) + a_nd = tvm.nd.array(a_np, dev) + b_nd = tvm.nd.array(np.zeros(b_np.shape, dtype=b_np.dtype), dev) func(a_nd, b_nd) tvm.testing.assert_allclose(b_nd.asnumpy(), b_np, rtol=1e-3) @@ -533,11 +533,11 @@ def test_cuda_floormod_with_vectorization(): s[B].bind(xio, tx) func = tvm.build(s, [A, B], "cuda") - ctx = tvm.gpu(0) + dev = tvm.gpu(0) a_np = np.random.uniform(size=(n,)).astype(A.dtype) b_np = np.array([a_np[i % k] for i in range(0, n)]) - a_nd = tvm.nd.array(a_np, ctx) - b_nd = tvm.nd.array(np.zeros(b_np.shape, dtype=b_np.dtype), ctx) + a_nd = tvm.nd.array(a_np, dev) + b_nd = tvm.nd.array(np.zeros(b_np.shape, dtype=b_np.dtype), dev) func(a_nd, b_nd) tvm.testing.assert_allclose(b_nd.asnumpy(), b_np, rtol=1e-3) @@ -564,14 +564,14 @@ def check(t0, t1, factor): func = tvm.build(s, [A, B, C], "cuda") # correctness - ctx = tvm.gpu(0) + dev = tvm.gpu(0) low, high = (0, 20) if t0.startswith("u") or t1.startswith("u") else (-10, 10) a_np = np.random.randint(low, high, size=n).astype(A.dtype) b_np = np.random.randint(low, high, size=n).astype(B.dtype) c_np = (a_np + b_np).astype(A.dtype) - a_nd = tvm.nd.array(a_np, ctx) - b_nd = tvm.nd.array(b_np, ctx) - c_nd = tvm.nd.array(np.zeros(c_np.shape, dtype=c_np.dtype), ctx) + a_nd = tvm.nd.array(a_np, dev) + b_nd = tvm.nd.array(b_np, dev) + c_nd = tvm.nd.array(np.zeros(c_np.shape, dtype=c_np.dtype), dev) func(a_nd, b_nd, c_nd) tvm.testing.assert_allclose(c_nd.asnumpy(), c_np, rtol=1e-3) @@ -665,9 +665,9 @@ def run_test(tvm_intrin, np_func, dtype): B = te.compute((n,), lambda *i: tvm_intrin(A(*i)), name="B") s = sched(B) f = tvm.build(s, [A, B], "cuda") - ctx = 
tvm.gpu(0) - a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(shape=(n,)).astype(A.dtype), ctx) + dev = tvm.gpu(0) + a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(shape=(n,)).astype(A.dtype), dev) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), np_func(a.asnumpy()), atol=1e-3, rtol=1e-3) @@ -691,9 +691,9 @@ def run_test(tvm_intrin, np_func): B = te.compute((n,), lambda i: tvm_intrin(A[i], c2), name="B") s = sched(B) f = tvm.build(s, [A, B], "cuda") - ctx = tvm.gpu(0) - a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(shape=(n,)).astype(A.dtype), ctx) + dev = tvm.gpu(0) + a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(shape=(n,)).astype(A.dtype), dev) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), np_func(a.asnumpy()), atol=1e-3, rtol=1e-3) @@ -717,9 +717,9 @@ def run_test(dtype): B = te.compute((n,), lambda i: tvm.tir.popcount(A[i]), name="B") s = sched(B) f = tvm.build(s, [A, B], "cuda") - ctx = tvm.gpu(0) - a = tvm.nd.array(np.random.randint(0, 100000, size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(shape=(n,)).astype(B.dtype), ctx) + dev = tvm.gpu(0) + a = tvm.nd.array(np.random.randint(0, 100000, size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(shape=(n,)).astype(B.dtype), dev) f(a, b) ref = np.vectorize(ref_popcount)(a.asnumpy()) tvm.testing.assert_allclose(b.asnumpy(), ref) @@ -736,7 +736,7 @@ def check_cuda(dtype, n, l, padding, lanes): print("Skip because gpu does not have fp16 support") return - ctx = tvm.gpu(0) + dev = tvm.gpu(0) A = tvm.te.placeholder((n, l), name="A", dtype=dtype) B = tvm.te.compute( (n // lanes, l + 2 * padding, lanes), @@ -754,8 +754,8 @@ def check_cuda(dtype, n, l, padding, lanes): s[B].vectorize(vectorize) fun = tvm.build(s, [A, B], "cuda", name="vector_load_permute_pad") np_a = np.random.randint(low=-128, high=127, size=(n, l)).astype(A.dtype) - a = tvm.nd.empty((n, l), A.dtype, ctx).copyfrom(np_a) - b = tvm.nd.empty((n // lanes, l + padding * 2, lanes), B.dtype, ctx) + a = tvm.nd.empty((n, l), A.dtype, dev).copyfrom(np_a) + b = tvm.nd.empty((n // lanes, l + padding * 2, lanes), B.dtype, dev) fun(a, b) np_a_reshape = np_a.reshape(n // lanes, lanes, l).transpose(0, 2, 1) ref = np.pad( @@ -812,10 +812,10 @@ def post_visit(stmt): # To check if every vectorize loop transforms to correct instruction # print(mod.imported_modules[0].get_source()) - ctx = tvm.context("cuda", 0) - a = tvm.nd.array(np.random.uniform(size=(512, 512)).astype("float32"), ctx) - b = tvm.nd.array(np.random.uniform(size=(512, 512)).astype("float32"), ctx) - c = tvm.nd.array(np.zeros((512, 512), dtype="float32"), ctx) + dev = tvm.device("cuda", 0) + a = tvm.nd.array(np.random.uniform(size=(512, 512)).astype("float32"), dev) + b = tvm.nd.array(np.random.uniform(size=(512, 512)).astype("float32"), dev) + c = tvm.nd.array(np.zeros((512, 512), dtype="float32"), dev) mod(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-5) @@ -957,10 +957,10 @@ def test_unrolled_vectorization(): s[CC].vectorize(j) # Check correctness - ctx = tvm.context(target) - a_tvm = tvm.nd.array(np.ones((N, N)).astype(dtype), ctx=ctx) - b_tvm = tvm.nd.array(np.ones((N, N)).astype(dtype), ctx=ctx) - c_tvm = tvm.nd.empty((N, N), ctx=ctx) + dev = tvm.device(target) + a_tvm = tvm.nd.array(np.ones((N, N)).astype(dtype), device=dev) + b_tvm = 
tvm.nd.array(np.ones((N, N)).astype(dtype), device=dev) + c_tvm = tvm.nd.empty((N, N), device=dev) func_tvm = tvm.build(s, [A, B, C], target=target) func_tvm(a_tvm, b_tvm, c_tvm) c_np = c_tvm.asnumpy() diff --git a/tests/python/unittest/test_target_codegen_device.py b/tests/python/unittest/test_target_codegen_device.py index 3b764c6709a81..b1b14f448b4ef 100644 --- a/tests/python/unittest/test_target_codegen_device.py +++ b/tests/python/unittest/test_target_codegen_device.py @@ -37,10 +37,10 @@ def test_large_uint_imm(): def check_target(device): if not tvm.testing.device_enabled(device): return - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) f = tvm.build(s, [A], device) # launch the kernel. - a = tvm.nd.empty((n,), dtype=A.dtype, ctx=ctx) + a = tvm.nd.empty((n,), dtype=A.dtype, device=dev) f(a) assert a.asnumpy()[0] == value + 3 @@ -70,14 +70,14 @@ def test_add_pipeline(): def check_target(device, host="stackvm"): if not tvm.testing.device_enabled(device) or not tvm.testing.device_enabled(host): return - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) mhost = tvm.driver.build(s, [A, B, D], target=device, target_host=host) f = mhost.entry_func # launch the kernel. n = 1027 - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=()).astype(B.dtype), ctx) - d = tvm.nd.array(np.zeros(n, dtype=D.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=()).astype(B.dtype), dev) + d = tvm.nd.array(np.zeros(n, dtype=D.dtype), dev) f(a, b, d) tvm.testing.assert_allclose(d.asnumpy(), a.asnumpy() + b.asnumpy() + 1) diff --git a/tests/python/unittest/test_target_codegen_extern.py b/tests/python/unittest/test_target_codegen_extern.py index 032b1059f5836..2aefee97d649b 100644 --- a/tests/python/unittest/test_target_codegen_extern.py +++ b/tests/python/unittest/test_target_codegen_extern.py @@ -68,11 +68,11 @@ def check_target(target): C = C_gpu if target in ["opencl", "cuda"] else C_cpu # build and invoke the kernel. f = tvm.build(s, [A, C], target) - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) # launch the kernel. n = nn - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) f(a, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1) @@ -102,11 +102,11 @@ def check_target(target): return # build and invoke the kernel. f = tvm.build(s, [A, C], target) - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # launch the kernel. n = nn - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) f(a, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy()) @@ -133,11 +133,11 @@ def check_target(target): return # build and invoke the kernel. f = tvm.build(s, [A, C], target) - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # launch the kernel. 
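# Allocate the input and output on the CPU device and invoke the compiled
# extern op, which dispatches into a Python function registered via
# tvm.register_func.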
n = nn - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) @tvm.register_func def my_extern_array_func2(aa, bb): diff --git a/tests/python/unittest/test_target_codegen_llvm.py b/tests/python/unittest/test_target_codegen_llvm.py index ec7c5aea333fe..56a8514b30bf0 100644 --- a/tests/python/unittest/test_target_codegen_llvm.py +++ b/tests/python/unittest/test_target_codegen_llvm.py @@ -104,9 +104,9 @@ def test_llvm_large_uintimm(): def check_llvm(): f = tvm.build(s, [A], "llvm") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # launch the kernel. - a = tvm.nd.empty((), dtype=A.dtype, ctx=ctx) + a = tvm.nd.empty((), dtype=A.dtype, device=dev) f(a) assert a.asnumpy() == value + 3 @@ -132,10 +132,10 @@ def test_llvm_persist_parallel(): def check_llvm(): # BUILD and invoke the kernel. f = tvm.build(s, [A, C], "llvm") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # launch the kernel. - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) f(a, c) tvm.testing.assert_allclose(c.asnumpy(), np.sqrt(a.asnumpy() + 1) * 2 + 2, rtol=1e-5) @@ -154,11 +154,11 @@ def check_llvm(nn, base): s[C].vectorize(xi) # build and invoke the kernel. f = tvm.build(s, [A, C], "llvm") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # launch the kernel. n = nn - a = tvm.nd.array(np.random.uniform(size=(n + base)).astype(A.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=(n + base)).astype(A.dtype), dev) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) f(a, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy()[::-1][:n]) @@ -184,10 +184,10 @@ def check_llvm(n, lanes): s[B].vectorize(xi) # build and invoke the kernel. f = tvm.build(s, [A, C], "llvm") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # launch the kernel. a = tvm.nd.empty((n,), A.dtype).copyfrom(np.random.uniform(size=(n, lanes))) - c = tvm.nd.empty((n,), C.dtype, ctx) + c = tvm.nd.empty((n,), C.dtype, dev) f(a, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1) @@ -207,11 +207,11 @@ def check_llvm(nn, base, stride): s[C].vectorize(xi) # build and invoke the kernel. f = tvm.build(s, [A, C], "llvm") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # launch the kernel. n = nn - a = tvm.nd.array(np.random.uniform(size=(n + base, stride)).astype(A.dtype), ctx) - c = tvm.nd.array(np.zeros((n, stride), dtype=C.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=(n + base, stride)).astype(A.dtype), dev) + c = tvm.nd.array(np.zeros((n, stride), dtype=C.dtype), dev) f(a, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy()[base:] + 1) @@ -234,11 +234,11 @@ def test_llvm_temp_space(): def check_llvm(): # build and invoke the kernel. f = tvm.build(s, [A, C], "llvm") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # launch the kernel. n = nn - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) f(a, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1 + 1) @@ -265,12 +265,12 @@ def check_llvm(): fadd2 = m["fadd2"] fadd1 = m["fadd1"] - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # launch the kernel. 
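# Exercise both entry points (fadd1, fadd2) of the single loaded module on
# the same device arrays.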
n = nn - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) fadd1(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) fadd2(a, b, c) @@ -287,10 +287,10 @@ def check_llvm(n, offset): s = te.create_schedule(C.op) # build and invoke the kernel. f = tvm.build(s, [A, C], "llvm") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # launch the kernel. - a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx) - c = tvm.nd.empty((n,), A.dtype, ctx) + a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), dev) + c = tvm.nd.empty((n,), A.dtype, dev) f(a, c) c_np = a.asnumpy() c_np[:offset] = 0 @@ -307,10 +307,10 @@ def check_llvm(n): s = te.create_schedule(C.op) # build and invoke the kernel. f = tvm.build(s, [A, C], "llvm") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # launch the kernel. - a = tvm.nd.array(np.random.randint(0, 2, size=(n,)).astype(A.dtype), ctx) - c = tvm.nd.empty((n,), C.dtype, ctx) + a = tvm.nd.array(np.random.randint(0, 2, size=(n,)).astype(A.dtype), dev) + c = tvm.nd.empty((n,), C.dtype, dev) f(a, c) c_np = a.asnumpy() == 1 tvm.testing.assert_allclose(c.asnumpy(), c_np) @@ -329,11 +329,11 @@ def check_llvm(n): s = te.create_schedule(D.op) # build and invoke the kernel. f = tvm.build(s, [A, scale, D], "llvm") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # launch the kernel. - a = tvm.nd.array(np.random.randint(0, 2, size=(n,)).astype(A.dtype), ctx) - sc = tvm.nd.array(np.random.randint(0, 2, size=()).astype(scale.dtype), ctx) - d = tvm.nd.empty((), D.dtype, ctx) + a = tvm.nd.array(np.random.randint(0, 2, size=(n,)).astype(A.dtype), dev) + sc = tvm.nd.array(np.random.randint(0, 2, size=()).astype(scale.dtype), dev) + d = tvm.nd.empty((), D.dtype, dev) f(a, sc, d) d_np = np.sum(a.asnumpy()) * sc.asnumpy() + 1 tvm.testing.assert_allclose(d.asnumpy(), d_np) @@ -353,11 +353,11 @@ def check_llvm(n): s = te.create_schedule(D.op) # build and invoke the kernel. f = tvm.build(s, [A, scale, D], "llvm") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # launch the kernel. 
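# The scalar scale argument travels as a zero-rank NDArray; the reduction
# output d is compared against sum(a) * sc + 1 computed with NumPy.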
- a = tvm.nd.array(np.random.randint(0, 2, size=(n,)).astype(A.dtype), ctx) - sc = tvm.nd.array(np.random.randint(0, 2, size=()).astype(scale.dtype), ctx) - d = tvm.nd.empty((), D.dtype, ctx) + a = tvm.nd.array(np.random.randint(0, 2, size=(n,)).astype(A.dtype), dev) + sc = tvm.nd.array(np.random.randint(0, 2, size=()).astype(scale.dtype), dev) + d = tvm.nd.empty((), D.dtype, dev) f(a, sc, d) d_np = np.sum(a.asnumpy()) * sc.asnumpy() + 1 tvm.testing.assert_allclose(d.asnumpy(), d_np) @@ -810,8 +810,8 @@ def do_atomic_add(A): s = tvm.te.create_schedule(C.op) f = tvm.build(s, [A], target="nvptx") - ctx = tvm.gpu() - a = tvm.nd.array(np.zeros((size,)).astype(A.dtype), ctx) + dev = tvm.gpu() + a = tvm.nd.array(np.zeros((size,)).astype(A.dtype), dev) f(a) ref = np.zeros((size,)).astype(A.dtype) ref[0] = size diff --git a/tests/python/unittest/test_target_codegen_opencl.py b/tests/python/unittest/test_target_codegen_opencl.py index 8a070da89641f..98340f0e6ac57 100644 --- a/tests/python/unittest/test_target_codegen_opencl.py +++ b/tests/python/unittest/test_target_codegen_opencl.py @@ -24,7 +24,7 @@ @tvm.testing.requires_gpu @tvm.testing.requires_opencl def test_opencl_ternary_expression(): - def check_if_then_else(ctx, n, dtype): + def check_if_then_else(dev, n, dtype): A = te.placeholder((n,), name="A", dtype=dtype) true_value = tvm.tir.const(1, dtype=dtype) false_value = tvm.tir.const(3, dtype=dtype) @@ -35,12 +35,12 @@ def check_if_then_else(ctx, n, dtype): s[C].bind(s[C].op.axis[0], te.thread_axis("threadIdx.x")) fun = tvm.build(s, [A, C], target) - a = tvm.nd.empty((n,), A.dtype, ctx) - c = tvm.nd.empty((n,), A.dtype, ctx) + a = tvm.nd.empty((n,), A.dtype, dev) + c = tvm.nd.empty((n,), A.dtype, dev) # Only need to test compiling here fun(a, c) - def check_select(ctx, n, dtype): + def check_select(dev, n, dtype): A = te.placeholder((n,), name="A", dtype=dtype) true_value = tvm.tir.const(1, dtype=dtype) false_value = tvm.tir.const(3, dtype=dtype) @@ -51,52 +51,52 @@ def check_select(ctx, n, dtype): s[C].bind(s[C].op.axis[0], te.thread_axis("threadIdx.x")) fun = tvm.build(s, [A, C], target) - a = tvm.nd.empty((n,), A.dtype, ctx) - c = tvm.nd.empty((n,), A.dtype, ctx) + a = tvm.nd.empty((n,), A.dtype, dev) + c = tvm.nd.empty((n,), A.dtype, dev) # Only need to test compiling here fun(a, c) - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) - check_if_then_else(ctx, 1, "int8") - check_if_then_else(ctx, 1, "uint8") - check_if_then_else(ctx, 1, "int16") - check_if_then_else(ctx, 1, "uint16") - check_select(ctx, 1, "int8") - check_select(ctx, 1, "uint8") - check_select(ctx, 1, "int16") - check_select(ctx, 1, "uint16") + check_if_then_else(dev, 1, "int8") + check_if_then_else(dev, 1, "uint8") + check_if_then_else(dev, 1, "int16") + check_if_then_else(dev, 1, "uint16") + check_select(dev, 1, "int8") + check_select(dev, 1, "uint8") + check_select(dev, 1, "int16") + check_select(dev, 1, "uint16") @tvm.testing.requires_gpu @tvm.testing.requires_opencl def test_opencl_inf_nan(): - def check_inf_nan(ctx, n, value, dtype): + def check_inf_nan(dev, n, value, dtype): A = te.placeholder((n,), name="A", dtype=dtype) inf_value = tvm.tir.const(value, dtype=dtype) C = te.compute((n,), lambda i: inf_value, name="C") s = te.create_schedule(C.op) s[C].bind(s[C].op.axis[0], te.thread_axis("threadIdx.x")) fun = tvm.build(s, [A, C], target) - a = tvm.nd.empty((n,), A.dtype, ctx) - c = tvm.nd.empty((n,), A.dtype, ctx) + a = tvm.nd.empty((n,), A.dtype, dev) + c = tvm.nd.empty((n,), A.dtype, dev) # Only need to 
test compiling here fun(a, c) - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) - check_inf_nan(ctx, 1, -float("inf"), "float32") - check_inf_nan(ctx, 1, -float("inf"), "float64") - check_inf_nan(ctx, 1, float("inf"), "float32") - check_inf_nan(ctx, 1, float("inf"), "float64") - check_inf_nan(ctx, 1, float("nan"), "float32") - check_inf_nan(ctx, 1, float("nan"), "float64") + check_inf_nan(dev, 1, -float("inf"), "float32") + check_inf_nan(dev, 1, -float("inf"), "float64") + check_inf_nan(dev, 1, float("inf"), "float32") + check_inf_nan(dev, 1, float("inf"), "float64") + check_inf_nan(dev, 1, float("nan"), "float32") + check_inf_nan(dev, 1, float("nan"), "float64") @tvm.testing.requires_gpu @tvm.testing.requires_opencl def test_opencl_max(): - def check_max(ctx, n, dtype): + def check_max(dev, n, dtype): A = te.placeholder((n,), name="A", dtype=dtype) max_lhs = A[0] + tvm.tir.const(1, dtype=dtype) max_rhs = tvm.tir.const(0, dtype=dtype) @@ -105,19 +105,19 @@ def check_max(ctx, n, dtype): s[C].bind(s[C].op.axis[0], te.thread_axis("threadIdx.x")) fun = tvm.build(s, [A, C], target) - a = tvm.nd.empty((n,), A.dtype, ctx) - c = tvm.nd.empty((n,), A.dtype, ctx) + a = tvm.nd.empty((n,), A.dtype, dev) + c = tvm.nd.empty((n,), A.dtype, dev) # Only need to test compiling here fun(a, c) - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) - check_max(ctx, 1, "int8") - check_max(ctx, 1, "uint8") - check_max(ctx, 1, "int16") - check_max(ctx, 1, "uint16") - check_max(ctx, 1, "float32") - check_max(ctx, 1, "float64") + check_max(dev, 1, "int8") + check_max(dev, 1, "uint8") + check_max(dev, 1, "int16") + check_max(dev, 1, "uint16") + check_max(dev, 1, "float32") + check_max(dev, 1, "float64") if __name__ == "__main__": diff --git a/tests/python/unittest/test_target_codegen_rocm.py b/tests/python/unittest/test_target_codegen_rocm.py index 36a659b07f9b9..9eb0b5cf938d5 100644 --- a/tests/python/unittest/test_target_codegen_rocm.py +++ b/tests/python/unittest/test_target_codegen_rocm.py @@ -46,35 +46,35 @@ def test_rocm_cross_thread_reduction(): frocm = tvm.build(s, [A, B], "rocm") nn = 128 - ctx = tvm.rocm(0) - a = tvm.nd.array(np.random.uniform(size=(nn, nn)).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(nn, dtype=B.dtype), ctx) + dev = tvm.rocm(0) + a = tvm.nd.array(np.random.uniform(size=(nn, nn)).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(nn, dtype=B.dtype), dev) frocm(a, b) tvm.testing.assert_allclose(b.asnumpy(), np.sum(a.asnumpy(), axis=1), rtol=1e-4) @tvm.testing.requires_rocm def test_rocm_inf_nan(): - def check_inf_nan(ctx, n, value, dtype): + def check_inf_nan(dev, n, value, dtype): A = te.placeholder((n,), name="A", dtype=dtype) inf_value = tvm.tir.const(value, dtype=dtype) C = te.compute((n,), lambda i: inf_value, name="C") s = te.create_schedule(C.op) s[C].bind(s[C].op.axis[0], tx) fun = tvm.build(s, [A, C], "rocm") - a = tvm.nd.empty((n,), A.dtype, ctx) - c = tvm.nd.empty((n,), A.dtype, ctx) + a = tvm.nd.empty((n,), A.dtype, dev) + c = tvm.nd.empty((n,), A.dtype, dev) # Only need to test compiling here fun(a, c) - ctx = tvm.rocm(0) + dev = tvm.rocm(0) - check_inf_nan(ctx, 1, -float("inf"), "float32") - check_inf_nan(ctx, 1, -float("inf"), "float64") - check_inf_nan(ctx, 1, float("inf"), "float32") - check_inf_nan(ctx, 1, float("inf"), "float64") - check_inf_nan(ctx, 1, float("nan"), "float32") - check_inf_nan(ctx, 1, float("nan"), "float64") + check_inf_nan(dev, 1, -float("inf"), "float32") + check_inf_nan(dev, 1, -float("inf"), "float64") + check_inf_nan(dev, 1, 
float("inf"), "float32") + check_inf_nan(dev, 1, float("inf"), "float64") + check_inf_nan(dev, 1, float("nan"), "float32") + check_inf_nan(dev, 1, float("nan"), "float64") @tvm.testing.requires_rocm @@ -94,9 +94,9 @@ def test_rocm_reduction_binding(): def test_rocm_copy(): def check_rocm(dtype, n): A = te.placeholder((n,), name="A", dtype=dtype) - ctx = tvm.rocm(0) + dev = tvm.rocm(0) a_np = np.random.uniform(size=(n,)).astype(A.dtype) - a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(a_np) + a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(a_np) b_np = a.asnumpy() tvm.testing.assert_allclose(a_np, b_np) tvm.testing.assert_allclose(a_np, a.asnumpy()) @@ -120,9 +120,9 @@ def check_rocm(dtype, n, lanes): s[B].bind(xo, bx) s[B].bind(xi, tx) fun = tvm.build(s, [A, B], "rocm") - ctx = tvm.rocm(0) - a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(np.random.uniform(size=(n, lanes))) - c = tvm.nd.empty((n,), B.dtype, ctx) + dev = tvm.rocm(0) + a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(np.random.uniform(size=(n, lanes))) + c = tvm.nd.empty((n,), B.dtype, dev) fun(a, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1) diff --git a/tests/python/unittest/test_target_codegen_spirv.py b/tests/python/unittest/test_target_codegen_spirv.py index bf47bbe2af44f..df42eeb721ab5 100644 --- a/tests/python/unittest/test_target_codegen_spirv.py +++ b/tests/python/unittest/test_target_codegen_spirv.py @@ -62,11 +62,11 @@ def do_copy(A, B, n): with tvm.transform.PassContext(opt_level=3): func = tvm.build(s, [A, B], target) - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) a_np = np.random.uniform(size=n) > 0.5 b_np = np.zeros((n,), dtype="int32") - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) func(a, b) ref = a_np.astype(np.int32) tvm.testing.assert_allclose(b.asnumpy(), ref) @@ -74,8 +74,8 @@ def do_copy(A, B, n): def check_mod(mod, x_np, res_np): target = "vulkan" - ctx = tvm.context(target, 0) - ex = relay.create_executor("vm", mod=mod, ctx=ctx, target=target) + dev = tvm.device(target, 0) + ex = relay.create_executor("vm", mod=mod, device=dev, target=target) res = ex.evaluate()(x_np).asnumpy() tvm.testing.assert_allclose(res, res_np, atol=1e-5) diff --git a/tests/python/unittest/test_target_codegen_vulkan.py b/tests/python/unittest/test_target_codegen_vulkan.py index d53045613bee9..e68996df531f6 100644 --- a/tests/python/unittest/test_target_codegen_vulkan.py +++ b/tests/python/unittest/test_target_codegen_vulkan.py @@ -65,9 +65,9 @@ def check_correct_assembly(dtype): def test_vulkan_copy(): def check_vulkan(dtype, n): A = te.placeholder((n,), name="A", dtype=dtype) - ctx = tvm.vulkan(0) + dev = tvm.vulkan(0) a_np = np.random.uniform(size=(n,)).astype(A.dtype) - a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(a_np) + a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(a_np) b_np = a.asnumpy() tvm.testing.assert_allclose(a_np, b_np) tvm.testing.assert_allclose(a_np, a.asnumpy()) @@ -91,9 +91,9 @@ def check_vulkan(dtype, n, lanes): s[B].bind(xo, bx) s[B].bind(xi, tx) fun = tvm.build(s, [A, B], "vulkan") - ctx = tvm.vulkan(0) - a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(np.random.uniform(size=(n, lanes))) - c = tvm.nd.empty((n,), B.dtype, ctx) + dev = tvm.vulkan(0) + a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(np.random.uniform(size=(n, lanes))) + c = tvm.nd.empty((n,), B.dtype, dev) fun(a, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1) @@ -139,10 +139,10 @@ def build_f(f_ref): fs = [ 
build_f(random.choice(functions)) for _ in range(np.random.randint(low=1, high=10)) ] - ctx = tvm.vulkan(0) - a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(np.random.uniform(size=(n,))) - b = tvm.nd.empty((n,), B.dtype, ctx).copyfrom(np.random.uniform(size=(n,))) - cs = [tvm.nd.empty((n,), A.dtype, ctx) for _ in fs] + dev = tvm.vulkan(0) + a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(np.random.uniform(size=(n,))) + b = tvm.nd.empty((n,), B.dtype, dev).copyfrom(np.random.uniform(size=(n,))) + cs = [tvm.nd.empty((n,), A.dtype, dev) for _ in fs] for ((f, _), c) in zip(fs, cs): f(a, b, c) diff --git a/tests/python/unittest/test_te_autodiff.py b/tests/python/unittest/test_te_autodiff.py index b2f26471d2677..59b20bd11e751 100644 --- a/tests/python/unittest/test_te_autodiff.py +++ b/tests/python/unittest/test_te_autodiff.py @@ -31,7 +31,7 @@ def check_grad( inputs = inputs if isinstance(inputs, list) else [inputs] def check_device(device, host="llvm"): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(host): return diff --git a/tests/python/unittest/test_te_hybrid_script.py b/tests/python/unittest/test_te_hybrid_script.py index be9956529dcce..f3091c7b71a6c 100644 --- a/tests/python/unittest/test_te_hybrid_script.py +++ b/tests/python/unittest/test_te_hybrid_script.py @@ -32,7 +32,7 @@ def tvm_val_2_py_val(val): assert isinstance(val, (tvm.tir.IntImm,)) return val.value - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) op = None if sch is None: @@ -50,7 +50,7 @@ def tvm_val_2_py_val(val): if isinstance(i, te.tensor.Tensor): shape = [tvm_val_2_py_val(j) for j in i.shape] emu_args.append(numpy.random.randn(*shape).astype(i.dtype)) - nd_args.append(tvm.nd.array(emu_args[-1], ctx)) + nd_args.append(tvm.nd.array(emu_args[-1], dev)) elif isinstance(i, tvm.tir.Var): emu_args.append(tvm_val_2_py_val(i)) nd_args.append(emu_args[-1]) @@ -68,7 +68,7 @@ def tvm_val_2_py_val(val): for i in range(op.num_outputs): output = op.output(i) shape = [tvm_val_2_py_val(j) for j in output.shape] - nd_args.append(tvm.nd.array(numpy.zeros(shape).astype(output.dtype), ctx)) + nd_args.append(tvm.nd.array(numpy.zeros(shape).astype(output.dtype), dev)) out_tensors.append(nd_args[-1]) ref_data = func(*emu_args) diff --git a/tests/python/unittest/test_te_schedule_postproc_rewrite_for_tensor_core.py b/tests/python/unittest/test_te_schedule_postproc_rewrite_for_tensor_core.py index 88cf66e6f03ba..e7a8469a8311d 100644 --- a/tests/python/unittest/test_te_schedule_postproc_rewrite_for_tensor_core.py +++ b/tests/python/unittest/test_te_schedule_postproc_rewrite_for_tensor_core.py @@ -100,15 +100,15 @@ def tensor_core_matmul(warp_tile_m=16, m=64, n=32, l=96): func = tvm.build(s, [A, B, C], "cuda") - ctx = tvm.gpu(0) + dev = tvm.gpu(0) a_np = np.random.uniform(size=(n, l)).astype(A.dtype) b_np = np.random.uniform(size=(l, m)).astype(B.dtype) c_np = np.zeros((n, m), dtype=np.float32) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), dev) func(a, b, c) - evaluator = func.time_evaluator(func.entry_name, ctx, number=3) + evaluator = func.time_evaluator(func.entry_name, dev, number=3) print("gemm m=%d n=%d k=%d: %f ms" % (m, n, l, evaluator(a, b, c).mean * 1e3)) c_np = np.dot(a_np, b_np) @@ -195,15 +195,15 @@ def tensor_core_batch_matmul(warp_tile_m=16, m=64, n=32, l=96, batch=2): func = tvm.build(s, [A, B, 
C], "cuda") - ctx = tvm.gpu(0) + dev = tvm.gpu(0) a_np = np.random.uniform(size=(batch, n, l)).astype(A.dtype) b_np = np.random.uniform(size=(batch, l, m)).astype(B.dtype) c_np = np.zeros((batch, n, m), dtype=np.float32) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros((batch, n, m), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros((batch, n, m), dtype=C.dtype), dev) func(a, b, c) - evaluator = func.time_evaluator(func.entry_name, ctx, number=3) + evaluator = func.time_evaluator(func.entry_name, dev, number=3) print( "batch gemm m=%d n=%d k=%d batch=%d: %f ms" % (m, n, l, batch, evaluator(a, b, c).mean * 1e3) diff --git a/tests/python/unittest/test_te_schedule_tensor_core.py b/tests/python/unittest/test_te_schedule_tensor_core.py index 01da1a1a0db20..9491425b38669 100644 --- a/tests/python/unittest/test_te_schedule_tensor_core.py +++ b/tests/python/unittest/test_te_schedule_tensor_core.py @@ -256,14 +256,14 @@ def test_tensor_core_batch_matmal(): func = tvm.build(s, [A, B, C], "cuda") - ctx = tvm.gpu(0) + dev = tvm.gpu(0) a_np = np.random.uniform(size=(batch_size, nn, ll, 32, 16)).astype(A.dtype) b_np = np.random.uniform(size=(batch_size, ll, mm, 16, 8)).astype(B.dtype) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros((batch_size, nn, mm, 32, 8), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros((batch_size, nn, mm, 32, 8), dtype=C.dtype), dev) func(a, b, c) - evaluator = func.time_evaluator(func.entry_name, ctx, number=3) + evaluator = func.time_evaluator(func.entry_name, dev, number=3) print("gemm with tensor core: %f ms" % (evaluator(a, b, c).mean * 1e3)) if VERIFY: @@ -432,13 +432,13 @@ def test_tensor_core_batch_conv(): func = tvm.build(s, [A, W, Conv], "cuda") - ctx = tvm.gpu(0) + dev = tvm.gpu(0) a_np = np.random.uniform(size=data_shape).astype(A.dtype) w_np = np.random.uniform(size=kernel_shape).astype(W.dtype) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - c = tvm.nd.array(np.zeros(output_shape, dtype=Conv.dtype), ctx) - evaluator = func.time_evaluator(func.entry_name, ctx, number=3) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + c = tvm.nd.array(np.zeros(output_shape, dtype=Conv.dtype), dev) + evaluator = func.time_evaluator(func.entry_name, dev, number=3) print("conv2d with tensor core: %f ms" % (evaluator(a, w, c).mean * 1e3)) if VERIFY: diff --git a/tests/python/unittest/test_te_tensor_overload.py b/tests/python/unittest/test_te_tensor_overload.py index 3126d8484e357..33dc19a19be99 100644 --- a/tests/python/unittest/test_te_tensor_overload.py +++ b/tests/python/unittest/test_te_tensor_overload.py @@ -77,12 +77,12 @@ def test_combination(): D = k + A - B * C + x s = te.create_schedule(D.op) foo = tvm.build(s, [x, A, B, C, D], "llvm") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) x = 2 - a = tvm.nd.array(np.random.uniform(size=(n, m)).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(n, m)).astype(B.dtype), ctx) - c = tvm.nd.array(np.random.uniform(size=(n, m)).astype(C.dtype), ctx) - d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=(n, m)).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(n, m)).astype(B.dtype), dev) + c = tvm.nd.array(np.random.uniform(size=(n, m)).astype(C.dtype), dev) + d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), dev) foo(x, a, b, c, d) 
tvm.testing.assert_allclose(d.asnumpy(), k + a.asnumpy() - b.asnumpy() * c.asnumpy() + x) @@ -107,7 +107,7 @@ def check_device(device): if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) print("Running on target: %s" % device) with tvm.target.Target(device): s = tvm.topi.testing.get_elemwise_schedule(device)(B) @@ -126,8 +126,8 @@ def check_device(device): else: raise NotImplementedError() - a_nd = tvm.nd.array(a_npy, ctx) - b_nd = tvm.nd.array(np.empty(b_npy.shape).astype(B.dtype), ctx) + a_nd = tvm.nd.array(a_npy, dev) + b_nd = tvm.nd.array(np.empty(b_npy.shape).astype(B.dtype), dev) foo(a_nd, b_nd, k_, *shape) tvm.testing.assert_allclose(b_nd.asnumpy(), b_npy, rtol=1e-5) @@ -150,7 +150,7 @@ def verify_broadcast_bop(lhs_shape, rhs_shape, typ="add"): raise NotImplementedError() def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return @@ -173,9 +173,9 @@ def check_device(device): else: raise NotImplementedError() - lhs_nd = tvm.nd.array(lhs_npy, ctx) - rhs_nd = tvm.nd.array(rhs_npy, ctx) - out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), ctx) + lhs_nd = tvm.nd.array(lhs_npy, dev) + rhs_nd = tvm.nd.array(rhs_npy, dev) + out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), dev) for _ in range(1): foo(lhs_nd, rhs_nd, out_nd) tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy, rtol=1e-4, atol=1e-4) @@ -189,7 +189,7 @@ def verify_conv2d_scalar_bop( batch, in_size, in_channel, num_filter, kernel, stride, padding, typ="add" ): def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return @@ -232,10 +232,10 @@ def check_device(device): else: raise NotImplementedError() - a_nd = tvm.nd.array(a_npy, ctx) - w_nd = tvm.nd.array(w_npy, ctx) - b_nd = tvm.nd.array(np.empty(b_npy.shape).astype(B.dtype), ctx) - c_nd = tvm.nd.array(np.empty(c_npy.shape).astype(C.dtype), ctx) + a_nd = tvm.nd.array(a_npy, dev) + w_nd = tvm.nd.array(w_npy, dev) + b_nd = tvm.nd.array(np.empty(b_npy.shape).astype(B.dtype), dev) + c_nd = tvm.nd.array(np.empty(c_npy.shape).astype(C.dtype), dev) foo(a_nd, w_nd, b_nd, c_nd) tvm.testing.assert_allclose(c_nd.asnumpy(), c_npy, rtol=1e-4, atol=1e-4) diff --git a/tests/python/unittest/test_tir_buffer.py b/tests/python/unittest/test_tir_buffer.py index fcbab046dab3d..de03cddfb50c0 100644 --- a/tests/python/unittest/test_tir_buffer.py +++ b/tests/python/unittest/test_tir_buffer.py @@ -149,10 +149,10 @@ def test_buffer_broadcast(): def check(): fadd = tvm.build(s, [A, B, C], target="llvm", name="bcast_add", binds={A: Ab, B: Bb}) - ctx = tvm.cpu(0) - a = tvm.nd.array(np.random.uniform(size=(2, 4, 3)).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(2, 1, 1)).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros((2, 4, 3), dtype=C.dtype), ctx) + dev = tvm.cpu(0) + a = tvm.nd.array(np.random.uniform(size=(2, 4, 3)).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(2, 1, 1)).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros((2, 4, 3), dtype=C.dtype), dev) fadd(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) @@ -178,10 +178,10 @@ def check_stride(): fadd = tvm.build( s, [A, B, C, o1, x], target="llvm", name="bcast_add", binds={A: Ab, B: Bb, C: Cc} ) - ctx = 
tvm.cpu(0) - a = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros((2, 4), dtype=C.dtype), ctx) + dev = tvm.cpu(0) + a = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros((2, 4), dtype=C.dtype), dev) fadd(a, b, c, 4, 1) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) @@ -189,20 +189,20 @@ def check_no_stride(): fadd = tvm.build( s, [A, B, C, o1, x], target="llvm", name="bcast_add", binds={A: Ab, B: Bb, C: Cc} ) - ctx = tvm.cpu(0) - a = tvm.nd.array(np.random.uniform(size=(1, 4)).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros((2, 4), dtype=C.dtype), ctx) + dev = tvm.cpu(0) + a = tvm.nd.array(np.random.uniform(size=(1, 4)).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros((2, 4), dtype=C.dtype), dev) fadd(a, b, c, 4, 1) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) def check_auto_bind(): # Let build bind buffers fadd = tvm.build(s, [A, B, C, o1, x], target="llvm", name="bcast_add") - ctx = tvm.cpu(0) - a = tvm.nd.array(np.random.uniform(size=(1, 4)).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros((2, 4), dtype=C.dtype), ctx) + dev = tvm.cpu(0) + a = tvm.nd.array(np.random.uniform(size=(1, 4)).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros((2, 4), dtype=C.dtype), dev) fadd(a, b, c, 4, 1) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) diff --git a/tests/python/unittest/test_tir_intrin.py b/tests/python/unittest/test_tir_intrin.py index 76390dace7577..755ffdf213b31 100644 --- a/tests/python/unittest/test_tir_intrin.py +++ b/tests/python/unittest/test_tir_intrin.py @@ -32,10 +32,10 @@ def test_nearbyint(): A_rounded = te.compute((m,), lambda *i: tvm.tir.nearbyint(A(*i)), name="A") s = te.create_schedule(A_rounded.op) f = tvm.build(s, [A, A_rounded], "llvm") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) n = 10 - a = tvm.nd.array(np.random.uniform(high=100, size=n).astype(A.dtype), ctx) - a_rounded = tvm.nd.array(np.random.uniform(size=n).astype(A_rounded.dtype), ctx) + a = tvm.nd.array(np.random.uniform(high=100, size=n).astype(A.dtype), dev) + a_rounded = tvm.nd.array(np.random.uniform(size=n).astype(A_rounded.dtype), dev) f(a, a_rounded) # Note that numpy's rint rounds to the nearest integer, with # halfway ties broken by rounding to even.
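The hunks in this file, like the rest of the patch, only rename the device handle; as a compact reference, here is a minimal end-to-end sketch of the renamed idiom, assembled from calls that already appear in these tests (a hedged illustration, not itself a hunk of this patch):

import numpy as np
import tvm
from tvm import te

n = 16
A = te.placeholder((n,), name="A")
B = te.compute((n,), lambda i: A[i] + 1.0, name="B")
s = te.create_schedule(B.op)
f = tvm.build(s, [A, B], "llvm")
dev = tvm.device("llvm", 0)  # post-rename spelling of tvm.context("llvm", 0)
a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
b = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev)
f(a, b)
np.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1.0)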
@@ -80,10 +80,10 @@ def run_test(tvm_intrin, np_func): B = te.compute((m,), lambda *i: tvm_intrin(A(*i)), name="B") s = te.create_schedule(B.op) f = tvm.build(s, [A, B], "llvm") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) n = 10 - a = tvm.nd.array(np.random.uniform(0.1, 0.5, size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) + a = tvm.nd.array(np.random.uniform(0.1, 0.5, size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), np_func(a.asnumpy()), atol=1e-5, rtol=1e-5) @@ -108,11 +108,11 @@ def run_test(tvm_intrin, np_func): C = te.compute((m,), lambda *i: tvm_intrin(A(*i), B(*i)), name="C") s = te.create_schedule(C.op) f = tvm.build(s, [A, B, C], "llvm") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) n = 10 - a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(B.dtype), ctx) - c = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) + a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(B.dtype), dev) + c = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) f(a, b, c) tvm.testing.assert_allclose( c.asnumpy(), np_func(a.asnumpy(), b.asnumpy()), atol=1e-5, rtol=1e-5 @@ -131,11 +131,11 @@ def test_ldexp(): C = te.compute((m,), lambda *i: tvm.tir.ldexp(A(*i), B(*i)), name="C") s = te.create_schedule(C.op) f = tvm.build(s, [A, B, C], "llvm") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) n = 10 - a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.randint(0, 5, size=n).astype(B.dtype), ctx) - c = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) + a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.random.randint(0, 5, size=n).astype(B.dtype), dev) + c = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) f(a, b, c) tvm.testing.assert_allclose( c.asnumpy(), np.ldexp(a.asnumpy(), b.asnumpy()), atol=1e-5, rtol=1e-5 diff --git a/tests/python/unittest/test_tir_ir_builder.py b/tests/python/unittest/test_tir_ir_builder.py index 8ad5cb63924ed..0b05c1093bc68 100644 --- a/tests/python/unittest/test_tir_ir_builder.py +++ b/tests/python/unittest/test_tir_ir_builder.py @@ -108,11 +108,11 @@ def check_target(target): return # build and invoke the kernel. fadd = tvm.build(s, [A, B, C], target) - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) # launch the kernel. - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) fadd(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) @@ -161,11 +161,11 @@ def check_target(target): return # build and invoke the kernel. fadd = tvm.build(s, [A, B, C], target) - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) # launch the kernel. 
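# Launch the IRBuilder-generated vector add on the requested device: random
# a and b in, c out, verified against a + b on the host.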
- a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) fadd(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) @@ -215,12 +215,12 @@ def check_target(target, ir): with tvm.transform.PassContext(opt_level=3): func = tvm.build(s, [A, B, C], target) - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) a_np = np.random.uniform(size=n).astype(A.dtype) b_np = np.random.uniform(size=n).astype(B.dtype) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) func(a, b, c) ref = num_iter * (a_np + b_np) tvm.testing.assert_allclose(c.asnumpy(), ref, rtol=1e-5, atol=1e-5) @@ -283,8 +283,8 @@ def check_target(target, ir): with tvm.transform.PassContext(opt_level=3): func = tvm.build(s, [C], target) - ctx = tvm.context(target, 0) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + dev = tvm.device(target, 0) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) func(c) ref = np.array([collatz_ref(i) for i in range(n)]) tvm.testing.assert_allclose(c.asnumpy(), ref) @@ -397,8 +397,8 @@ def check_target(target, ir): with tvm.transform.PassContext(opt_level=3): func = tvm.build(s, [C], target) - ctx = tvm.context(target, 0) - c = tvm.nd.array(np.zeros(shape, dtype=C.dtype), ctx) + dev = tvm.device(target, 0) + c = tvm.nd.array(np.zeros(shape, dtype=C.dtype), dev) func(c) tvm.testing.assert_allclose(c.asnumpy(), ref, rtol=1e-5, atol=1e-5) @@ -480,13 +480,13 @@ def check_target(target, ir): with tvm.transform.PassContext(opt_level=3): func = tvm.build(s, [A, B, C], target) - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) a_np = np.random.uniform(size=n).astype(A.dtype) b_np = np.random.uniform(size=n).astype(B.dtype) a_np = np.sort(a_np) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) func(a, b, c) ref = np.searchsorted(a_np, b_np) tvm.testing.assert_allclose(c.asnumpy(), ref) diff --git a/tests/python/unittest/test_tir_transform_hoist_if.py b/tests/python/unittest/test_tir_transform_hoist_if.py index 77ab38d0490d1..748a33d977ee4 100644 --- a/tests/python/unittest/test_tir_transform_hoist_if.py +++ b/tests/python/unittest/test_tir_transform_hoist_if.py @@ -762,15 +762,15 @@ def test_hoisting_op_conv(): kernel = np.random.uniform(-scale, scale, size=kshape).astype(dtype) params = {"w": tvm.nd.array(kernel)} - for target, ctx in enabled_targets(): + for target, dev in enabled_targets(): with tvm.transform.PassContext(opt_level=3): lib = relay.build_module.build(mod, target=target, params=params) - m = tvm.contrib.graph_runtime.GraphModule(lib["default"](ctx)) + m = tvm.contrib.graph_runtime.GraphModule(lib["default"](dev)) x = np.random.uniform(size=dshape) data_tvm = tvm.nd.array(data) m.set_input("x", data_tvm) m.run() - e = m.module.time_evaluator("run", ctx, number=300, repeat=3) + e = m.module.time_evaluator("run", dev, number=300, repeat=3) t1 = e(data_tvm).results t1 = np.array(t1) * 1000 print("{} 
ms".format(t1.mean())) @@ -779,13 +779,13 @@ def test_hoisting_op_conv(): opt_level=3, config={"tir.HoistIfThenElse": {"support_block_scope_hosting": True}} ): lib = relay.build_module.build(mod, target=target, params=params) - m = tvm.contrib.graph_runtime.GraphModule(lib["default"](ctx)) + m = tvm.contrib.graph_runtime.GraphModule(lib["default"](dev)) x = np.random.uniform(size=dshape) data_tvm = tvm.nd.array(data) m.set_input("x", data_tvm) m.set_input(**params) m.run() - e = m.module.time_evaluator("run", ctx, number=300, repeat=3) + e = m.module.time_evaluator("run", dev, number=300, repeat=3) t2 = e(data_tvm).results t2 = np.array(t2) * 1000 diff --git a/tests/python/unittest/test_tir_transform_instrument_bound_checkers.py b/tests/python/unittest/test_tir_transform_instrument_bound_checkers.py index 187013e132a43..e0cba8421e837 100644 --- a/tests/python/unittest/test_tir_transform_instrument_bound_checkers.py +++ b/tests/python/unittest/test_tir_transform_instrument_bound_checkers.py @@ -40,10 +40,10 @@ def test_out_of_bounds_llvm(index_a, index_b): stmt = tvm.lower(s, [A, B, C], simple_mode=True) print(stmt) fadd = tvm.build(s, [A, B, C], tgt, target_host=tgt_host, name="myadd") - ctx = tvm.context(tgt, 0) - a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=1024).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros(1024, dtype=C.dtype), ctx) + dev = tvm.device(tgt, 0) + a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=1024).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros(1024, dtype=C.dtype), dev) fadd(a, b, c) @@ -58,10 +58,10 @@ def test_in_bounds_llvm(): tgt_host = "llvm" stmt = tvm.lower(s, [A, B, C], simple_mode=True) fadd = tvm.build(s, [A, B, C], tgt, target_host=tgt_host, name="myadd") - ctx = tvm.context(tgt, 0) - a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=1024).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros(1024, dtype=C.dtype), ctx) + dev = tvm.device(tgt, 0) + a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=1024).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros(1024, dtype=C.dtype), dev) fadd(a, b, c) @@ -80,11 +80,11 @@ def test_out_of_bounds_vectorize_llvm(nn, index_a, index_b): tgt_host = "llvm" stmt = tvm.lower(s, [a, b, c], simple_mode=True) f = tvm.build(s, [a, b, c], tgt, target_host=tgt_host, name="myaddvec") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) n = nn - a = tvm.nd.array(np.random.uniform(size=(n)).astype(a.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(n)).astype(a.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=c.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=(n)).astype(a.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(n)).astype(a.dtype), dev) + c = tvm.nd.array(np.zeros(n, dtype=c.dtype), dev) f(a, b, c) @@ -106,10 +106,10 @@ def test_in_bounds_vectorize_llvm(): # build and invoke the kernel. lowered_func = tvm.lower(s, [A, C], "llvm", simple_mode=False) f = tvm.build(s, [A, C], "llvm") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # launch the kernel. 
a = tvm.nd.empty((n,), A.dtype).copyfrom(np.random.uniform(size=(n, lanes))) - c = tvm.nd.empty((n,), C.dtype, ctx) + c = tvm.nd.empty((n,), C.dtype, dev) f(a, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1) @@ -124,12 +124,12 @@ def test_in_bounds_loop_partition_basic_llvm(): s = te.create_schedule(T.op) xo, xi = s[T].split(T.op.axis[0], factor=4) lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False) - ctx = tvm.cpu(0) + dev = tvm.cpu(0) f = tvm.build(s, [A, B, T], "llvm") - a = tvm.nd.array(np.random.uniform(size=(32,)).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(32,)).astype(B.dtype), ctx) - t = tvm.nd.empty((32,), T.dtype, ctx) + a = tvm.nd.array(np.random.uniform(size=(32,)).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(32,)).astype(B.dtype), dev) + t = tvm.nd.empty((32,), T.dtype, dev) f(a, b, t) @@ -144,12 +144,12 @@ def test_out_of_bounds_loop_partition_basic_llvm(index_a, index_b): s = te.create_schedule(T.op) xo, xi = s[T].split(T.op.axis[0], factor=4) lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False) - ctx = tvm.cpu(0) + dev = tvm.cpu(0) f = tvm.build(s, [A, B, T], "llvm") - a = tvm.nd.array(np.random.uniform(size=(32,)).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(32,)).astype(B.dtype), ctx) - t = tvm.nd.empty((32,), T.dtype, ctx) + a = tvm.nd.array(np.random.uniform(size=(32,)).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(32,)).astype(B.dtype), dev) + t = tvm.nd.empty((32,), T.dtype, dev) f(a, b, t) @@ -221,12 +221,12 @@ def test_in_bounds_const_loop_partition_llvm(): s = te.create_schedule(T.op) xo, xi = s[T].split(T.op.axis[0], factor=4) lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False) - ctx = tvm.cpu(0) + dev = tvm.cpu(0) f = tvm.build(s, [A, B, T], "llvm") - a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(n,)).astype(B.dtype), ctx) - t = tvm.nd.empty((n,), T.dtype, ctx) + a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(n,)).astype(B.dtype), dev) + t = tvm.nd.empty((n,), T.dtype, dev) f(a, b, t) @@ -247,12 +247,12 @@ def test_out_of_bounds_const_loop_partition_llvm(index_a, index_b): s = te.create_schedule(T.op) xo, xi = s[T].split(T.op.axis[0], factor=4) lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False) - ctx = tvm.cpu(0) + dev = tvm.cpu(0) f = tvm.build(s, [A, B, T], "llvm") - a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(n,)).astype(B.dtype), ctx) - t = tvm.nd.empty((n,), T.dtype, ctx) + a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(n,)).astype(B.dtype), dev) + t = tvm.nd.empty((n,), T.dtype, dev) f(a, b, t) @@ -283,19 +283,19 @@ def test_in_bounds_conv_llvm(loop_tiling=False): if loop_tiling: oho, owo, ohi, owi = s[conv].tile(oh, ow, 16, 16) lowered_func = tvm.lower(s, [data, kernel, conv], simple_mode=True) - ctx = tvm.cpu(0) + dev = tvm.cpu(0) f = tvm.build(s, [data, kernel, conv], "llvm") data_input = tvm.nd.array( - np.random.uniform(size=(batch_size, in_channel, in_height, in_width)).astype("float32"), ctx + np.random.uniform(size=(batch_size, in_channel, in_height, in_width)).astype("float32"), dev ) kernel_input = tvm.nd.array( np.random.uniform(size=(kernel_height, kernel_width, in_channel, out_channel)).astype( "float32" ), - ctx, + dev, ) - conv_out = 
tvm.nd.empty((batch_size, out_channel, out_height, out_width), "float32", ctx) + conv_out = tvm.nd.empty((batch_size, out_channel, out_height, out_width), "float32", dev) f(data_input, kernel_input, conv_out) @@ -339,19 +339,19 @@ def test_out_of_bounds_conv_llvm(data_offsets, kernel_offsets, loop_tiling=False if loop_tiling: oho, owo, ohi, owi = s[conv].tile(oh, ow, 16, 16) lowered_func = tvm.lower(s, [data, kernel, conv], simple_mode=True) - ctx = tvm.cpu(0) + dev = tvm.cpu(0) f = tvm.build(s, [data, kernel, conv], "llvm") data_input = tvm.nd.array( - np.random.uniform(size=(batch_size, in_channel, in_height, in_width)).astype("float32"), ctx + np.random.uniform(size=(batch_size, in_channel, in_height, in_width)).astype("float32"), dev ) kernel_input = tvm.nd.array( np.random.uniform(size=(kernel_height, kernel_width, in_channel, out_channel)).astype( "float32" ), - ctx, + dev, ) - conv_out = tvm.nd.empty((batch_size, out_channel, out_height, out_width), "float32", ctx) + conv_out = tvm.nd.empty((batch_size, out_channel, out_height, out_width), "float32", dev) f(data_input, kernel_input, conv_out) @@ -366,12 +366,12 @@ def test_in_bounds_tensors_with_same_shapes1D_llvm(): T = te.compute((m,), lambda i: A[i] * B[i]) s = te.create_schedule(T.op) lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False) - ctx = tvm.cpu(0) + dev = tvm.cpu(0) f = tvm.build(s, [A, B, T], "llvm") - a = tvm.nd.array(np.random.uniform(size=(32,)).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(32,)).astype(B.dtype), ctx) - t = tvm.nd.empty((32,), T.dtype, ctx) + a = tvm.nd.array(np.random.uniform(size=(32,)).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(32,)).astype(B.dtype), dev) + t = tvm.nd.empty((32,), T.dtype, dev) f(a, b, t) @@ -387,12 +387,12 @@ def test_out_of_bounds_tensors_with_diff_shapes1D_llvm(a_shape, b_shape, c_shape T = te.compute((m,), lambda i: A[i] * B[i]) s = te.create_schedule(T.op) lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False) - ctx = tvm.cpu(0) + dev = tvm.cpu(0) f = tvm.build(s, [A, B, T], "llvm") - a = tvm.nd.array(np.random.uniform(size=(a_shape,)).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(b_shape,)).astype(B.dtype), ctx) - t = tvm.nd.empty((c_shape,), T.dtype, ctx) + a = tvm.nd.array(np.random.uniform(size=(a_shape,)).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(b_shape,)).astype(B.dtype), dev) + t = tvm.nd.empty((c_shape,), T.dtype, dev) f(a, b, t) @@ -407,12 +407,12 @@ def test_in_bounds_tensors_with_same_shapes2D_llvm(): T = te.compute((m, m), lambda i, j: A[i][j] * B[i][j]) s = te.create_schedule(T.op) lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False) - ctx = tvm.cpu(0) + dev = tvm.cpu(0) f = tvm.build(s, [A, B, T], "llvm") - a = tvm.nd.array(np.random.uniform(size=(32, 32)).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(32, 32)).astype(B.dtype), ctx) - t = tvm.nd.empty((32, 32), T.dtype, ctx) + a = tvm.nd.array(np.random.uniform(size=(32, 32)).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(32, 32)).astype(B.dtype), dev) + t = tvm.nd.empty((32, 32), T.dtype, dev) f(a, b, t) @@ -428,12 +428,12 @@ def test_out_of_bounds_tensors_with_diff_shapes2D_llvm(a_shape, b_shape, c_shape T = te.compute((m, m), lambda i, j: A[i][j] * B[i][j]) s = te.create_schedule(T.op) lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False) - ctx = tvm.cpu(0) + dev = tvm.cpu(0) f = tvm.build(s, [A, B, T], "llvm") - a = 
tvm.nd.array(np.random.uniform(size=(a_shape[0], a_shape[1])).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(b_shape[0], b_shape[1])).astype(B.dtype), ctx) - t = tvm.nd.empty((c_shape[0], c_shape[1]), T.dtype, ctx) + a = tvm.nd.array(np.random.uniform(size=(a_shape[0], a_shape[1])).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(b_shape[0], b_shape[1])).astype(B.dtype), dev) + t = tvm.nd.empty((c_shape[0], c_shape[1]), T.dtype, dev) f(a, b, t) @@ -449,12 +449,12 @@ def test_in_bounds_tensors_with_same_shapes3D_llvm(): s = te.create_schedule(T.op) lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False) - ctx = tvm.cpu(0) + dev = tvm.cpu(0) f = tvm.build(s, [A, B, T], "llvm") - a = tvm.nd.array(np.random.uniform(size=(32, 32, 32)).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(32, 32, 32)).astype(B.dtype), ctx) - t = tvm.nd.empty((32, 32, 32), T.dtype, ctx) + a = tvm.nd.array(np.random.uniform(size=(32, 32, 32)).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(32, 32, 32)).astype(B.dtype), dev) + t = tvm.nd.empty((32, 32, 32), T.dtype, dev) f(a, b, t) @@ -471,16 +471,16 @@ def test_out_of_bounds_tensors_with_diff_shapes3D_llvm(a_shape, b_shape, c_shape s = te.create_schedule(T.op) lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False) - ctx = tvm.cpu(0) + dev = tvm.cpu(0) f = tvm.build(s, [A, B, T], "llvm") a = tvm.nd.array( - np.random.uniform(size=(a_shape[0], a_shape[1], c_shape[2])).astype(A.dtype), ctx + np.random.uniform(size=(a_shape[0], a_shape[1], c_shape[2])).astype(A.dtype), dev ) b = tvm.nd.array( - np.random.uniform(size=(b_shape[0], b_shape[1], b_shape[2])).astype(B.dtype), ctx + np.random.uniform(size=(b_shape[0], b_shape[1], b_shape[2])).astype(B.dtype), dev ) - t = tvm.nd.empty((c_shape[0], c_shape[1], c_shape[2]), T.dtype, ctx) + t = tvm.nd.empty((c_shape[0], c_shape[1], c_shape[2]), T.dtype, dev) f(a, b, t) @@ -498,11 +498,11 @@ def test_out_of_bounds_tensors_with_zero_shape_op_with_not_zero_shape_llvm(): # build and invoke the kernel. f = tvm.build(s, [A, scale, D], "llvm") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # launch the kernel. 
- a = tvm.nd.array(np.random.randint(0, 2, size=(n,)).astype(A.dtype), ctx) - sc = tvm.nd.array(np.random.randint(0, 2, size=()).astype(scale.dtype), ctx) - d = tvm.nd.empty((), D.dtype, ctx) + a = tvm.nd.array(np.random.randint(0, 2, size=(n,)).astype(A.dtype), dev) + sc = tvm.nd.array(np.random.randint(0, 2, size=()).astype(scale.dtype), dev) + d = tvm.nd.empty((), D.dtype, dev) f(a, sc, d) d_np = np.sum(a.asnumpy()) * sc.asnumpy() + 1 tvm.testing.assert_allclose(d.asnumpy(), d_np) diff --git a/tests/python/unittest/test_tir_transform_loop_partition.py b/tests/python/unittest/test_tir_transform_loop_partition.py index ecaff319441d5..f5a5e4ca65632 100644 --- a/tests/python/unittest/test_tir_transform_loop_partition.py +++ b/tests/python/unittest/test_tir_transform_loop_partition.py @@ -487,24 +487,24 @@ def test_double_splitting_with_indivisible_factors(): assert not any(collect_visit(top_produce, lambda x: isinstance(x, tvm.tir.IfThenElse))) # check functional correctness of generated code - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) a = tvm.nd.array( numpy.ones( m, ).astype(dtype), - ctx, + dev, ) c = tvm.nd.array( numpy.zeros( m, ).astype(dtype), - ctx, + dev, ) d = tvm.nd.array( numpy.zeros( m, ).astype(dtype), - ctx, + dev, ) func(a, c, d) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy(), rtol=1e-5) diff --git a/tests/python/unittest/test_tir_transform_lower_warp_memory.py b/tests/python/unittest/test_tir_transform_lower_warp_memory.py index 28179c2d7b400..ac7204368c1d0 100644 --- a/tests/python/unittest/test_tir_transform_lower_warp_memory.py +++ b/tests/python/unittest/test_tir_transform_lower_warp_memory.py @@ -114,7 +114,7 @@ def check_cuda(dtype): xo, xi = s[AA].split(s[AA].op.axis[0], 32) s[AA].bind(xi, tx) - ctx = tvm.gpu(0) + dev = tvm.gpu(0) func = tvm.build(s, [A, B], "cuda") A_np = np.array(list(range(m)), dtype=dtype) B_np = np.array( @@ -128,8 +128,8 @@ def check_cuda(dtype): + [96], dtype=dtype, ) - A_nd = tvm.nd.array(A_np, ctx) - B_nd = tvm.nd.array(np.zeros(B_np.shape, dtype=B_np.dtype), ctx) + A_nd = tvm.nd.array(A_np, dev) + B_nd = tvm.nd.array(np.zeros(B_np.shape, dtype=B_np.dtype), dev) func(A_nd, B_nd) tvm.testing.assert_allclose(B_nd.asnumpy(), B_np, rtol=1e-3) @@ -181,12 +181,12 @@ def check_cuda(dtype): _, x = AA.op.axis s[AA].bind(x, tx) - ctx = tvm.gpu(0) + dev = tvm.gpu(0) func = tvm.build(s, [A, B], "cuda") A_np = np.array([list(range(i, m + i)) for i in range(n)], dtype=dtype) B_np = np.array([list(range(1 + i, m + i)) + [i] for i in range(n)], dtype=dtype) - A_nd = tvm.nd.array(A_np, ctx) - B_nd = tvm.nd.array(np.zeros(B_np.shape, dtype=B_np.dtype), ctx) + A_nd = tvm.nd.array(A_np, dev) + B_nd = tvm.nd.array(np.zeros(B_np.shape, dtype=B_np.dtype), dev) func(A_nd, B_nd) tvm.testing.assert_allclose(B_nd.asnumpy(), B_np, rtol=1e-3) @@ -228,13 +228,13 @@ def check_cuda(dtype): s[BB].bind(xo, bx) s[BB].bind(xi, tx) - ctx = tvm.gpu(0) + dev = tvm.gpu(0) func = tvm.build(s, [A, B, C], "cuda") AB_np = np.array(list(range(m)), dtype=dtype) C_np = np.array(list(range(1, m)) + [0], dtype=dtype) * 2 - A_nd = tvm.nd.array(AB_np, ctx) - B_nd = tvm.nd.array(AB_np, ctx) - C_nd = tvm.nd.array(np.zeros(C_np.shape, dtype=C_np.dtype), ctx) + A_nd = tvm.nd.array(AB_np, dev) + B_nd = tvm.nd.array(AB_np, dev) + C_nd = tvm.nd.array(np.zeros(C_np.shape, dtype=C_np.dtype), dev) func(A_nd, B_nd, C_nd) tvm.testing.assert_allclose(C_nd.asnumpy(), C_np, rtol=1e-3) @@ -260,12 +260,12 @@ def check(device, m): s[AA].bind(yi, tx) s[AA].compute_at(s[B], xo) - ctx = 
tvm.context(device, 0) + dev = tvm.device(device, 0) func = tvm.build(s, [A, B], device) A_np = np.random.uniform(size=(m,)).astype(A.dtype) B_np = np.zeros(shape=(m,)).astype(B.dtype) - A_nd = tvm.nd.array(A_np, ctx) - B_nd = tvm.nd.array(B_np, ctx) + A_nd = tvm.nd.array(A_np, dev) + B_nd = tvm.nd.array(B_np, dev) func(A_nd, B_nd) B_np = A_np + 1 tvm.testing.assert_allclose(B_nd.asnumpy(), B_np) diff --git a/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py b/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py index 396bdb0d55aa8..41fdcbbdbc822 100644 --- a/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py +++ b/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py @@ -145,18 +145,18 @@ def conv2d_layer(N, H, W, CO, CI, KH, KW, stride, padding): conv_np = conv2d_nchw_python(data_np, weight_np, strides, padding) out_np = np.maximum(conv_np + bias_np, 0.0) -ctx = tvm.gpu() -data_tvm = tvm.nd.array(data_np, ctx=ctx) -weight_tvm = tvm.nd.array(weight_np, ctx=ctx) -bias_tvm = tvm.nd.array(bias_np, ctx=ctx) -out_tvm = tvm.nd.empty(out_np.shape, ctx=ctx) +dev = tvm.gpu() +data_tvm = tvm.nd.array(data_np, device=dev) +weight_tvm = tvm.nd.array(weight_np, device=dev) +bias_tvm = tvm.nd.array(bias_np, device=dev) +out_tvm = tvm.nd.empty(out_np.shape, device=dev) func(data_tvm, weight_tvm, bias_tvm, out_tvm) # Check results np.testing.assert_allclose(out_np, out_tvm.asnumpy(), rtol=1e-3) # Evaluate execution time -evaluator = func.time_evaluator(func.entry_name, ctx, min_repeat_ms=500) +evaluator = func.time_evaluator(func.entry_name, dev, min_repeat_ms=500) print( "Execution time of this operator: %.3f ms" % (np.median(evaluator(data_tvm, weight_tvm, bias_tvm, out_tvm).results) * 1000) diff --git a/tutorials/auto_scheduler/tune_network_arm.py b/tutorials/auto_scheduler/tune_network_arm.py index c4add79450e90..30bbce1e6b7e0 100644 --- a/tutorials/auto_scheduler/tune_network_arm.py +++ b/tutorials/auto_scheduler/tune_network_arm.py @@ -320,14 +320,14 @@ def tune_and_evaluate(): rlib = remote.load_module(filename) # Create graph runtime - ctx = remote.cpu() - module = graph_runtime.GraphModule(rlib["default"](ctx)) + dev = remote.cpu() + module = graph_runtime.GraphModule(rlib["default"](dev)) data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) module.set_input("data", data_tvm) # Evaluate print("Evaluate inference time cost...") - ftimer = module.module.time_evaluator("run", ctx, repeat=3, min_repeat_ms=500) + ftimer = module.module.time_evaluator("run", dev, repeat=3, min_repeat_ms=500) prof_res = np.array(ftimer().results) * 1e3 # convert to millisecond print( "Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res)) diff --git a/tutorials/auto_scheduler/tune_network_cuda.py b/tutorials/auto_scheduler/tune_network_cuda.py index bc88457f94f90..b5162fc85f148 100644 --- a/tutorials/auto_scheduler/tune_network_cuda.py +++ b/tutorials/auto_scheduler/tune_network_cuda.py @@ -281,14 +281,14 @@ def run_tuning(): lib = relay.build(mod, target=target, params=params) # Create graph runtime -ctx = tvm.context(str(target), 0) -module = graph_runtime.GraphModule(lib["default"](ctx)) +dev = tvm.device(str(target), 0) +module = graph_runtime.GraphModule(lib["default"](dev)) data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) module.set_input("data", data_tvm) # Evaluate print("Evaluate inference time cost...") -ftimer = module.module.time_evaluator("run", ctx, repeat=3, min_repeat_ms=500) +ftimer = module.module.time_evaluator("run", 
dev, repeat=3, min_repeat_ms=500) prof_res = np.array(ftimer().results) * 1e3 # convert to millisecond print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res))) diff --git a/tutorials/auto_scheduler/tune_network_mali.py b/tutorials/auto_scheduler/tune_network_mali.py index 2bce968771e32..3d3861263e4b7 100644 --- a/tutorials/auto_scheduler/tune_network_mali.py +++ b/tutorials/auto_scheduler/tune_network_mali.py @@ -182,14 +182,14 @@ def get_network(name, batch_size, layout="NHWC", dtype="float32"): # # from tvm.auto_scheduler.utils import request_remote # remote = request_remote(device_key, "0.0.0.0", 9190) -# ctx = remote.cl() -# max_shared_memory_per_block = ctx.max_shared_memory_per_block +# dev = remote.cl() +# max_shared_memory_per_block = dev.max_shared_memory_per_block # # There is no explicit local memory limitation, # # so we can use INT32_MAX to disable the check on local_memory. # max_local_memory_per_block = 2147483647 # INT32_MAX -# max_threads_per_block = ctx.max_threads_per_block -# max_vthread_extent = int(ctx.warp_size / 4) if int(ctx.warp_size / 4) > 1 else ctx.warp_size -# warp_size = ctx.warp_size +# max_threads_per_block = dev.max_threads_per_block +# max_vthread_extent = int(dev.warp_size / 4) if int(dev.warp_size / 4) > 1 else dev.warp_size +# warp_size = dev.warp_size # hardware_params = auto_scheduler.HardwareParams(-1, 16, 64, # max_shared_memory_per_block, max_local_memory_per_block, # max_threads_per_block, max_vthread_extent, warp_size) @@ -247,7 +247,7 @@ def tune_and_evaluate(): from tvm.auto_scheduler.utils import request_remote remote = request_remote(device_key, "0.0.0.0", 9190) - ctx = remote.cl() + dev = remote.cl() from tvm.contrib import utils, ndk temp = utils.tempdir() @@ -256,14 +256,14 @@ def tune_and_evaluate(): lib.export_library(path_lib, ndk.create_shared) remote.upload(path_lib) loaded_lib = remote.load_module(filename) - module = graph_runtime.GraphModule(loaded_lib["default"](ctx)) + module = graph_runtime.GraphModule(loaded_lib["default"](dev)) data = (np.random.uniform(size=input_shape)).astype(dtype) data_tvm = tvm.nd.array(data) module.set_input("data", data_tvm) # Evaluate print("Evaluate inference time cost...") - ftimer = module.module.time_evaluator("run", ctx, repeat=3, min_repeat_ms=500) + ftimer = module.module.time_evaluator("run", dev, repeat=3, min_repeat_ms=500) prof_res = np.array(ftimer().results) * 1e3 # convert to millisecond print( "Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res)) ) diff --git a/tutorials/auto_scheduler/tune_network_x86.py b/tutorials/auto_scheduler/tune_network_x86.py index 2b47c64729e0f..ca2b8c1977788 100644 --- a/tutorials/auto_scheduler/tune_network_x86.py +++ b/tutorials/auto_scheduler/tune_network_x86.py @@ -280,14 +280,14 @@ def run_tuning(): lib = relay.build(mod, target=target, params=params) # Create graph runtime -ctx = tvm.context(str(target), 0) -module = graph_runtime.GraphModule(lib["default"](ctx)) +dev = tvm.device(str(target), 0) +module = graph_runtime.GraphModule(lib["default"](dev)) data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) module.set_input("data", data_tvm) # Evaluate print("Evaluate inference time cost...") -ftimer = module.module.time_evaluator("run", ctx, repeat=3, min_repeat_ms=500) +ftimer = module.module.time_evaluator("run", dev, repeat=3, min_repeat_ms=500) prof_res = np.array(ftimer().results) * 1e3 # convert to millisecond print("Mean inference time (std dev): %.2f
ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res))) diff --git a/tutorials/auto_scheduler/tune_sparse_x86.py b/tutorials/auto_scheduler/tune_sparse_x86.py index ced416f6c5009..3df372048a145 100644 --- a/tutorials/auto_scheduler/tune_sparse_x86.py +++ b/tutorials/auto_scheduler/tune_sparse_x86.py @@ -274,14 +274,14 @@ def apply_func(search_policy, state, stage_id): func = tvm.build(sch, args, target) -ctx = tvm.cpu() +dev = tvm.cpu() -X_tvm = tvm.nd.array(X_np, ctx=ctx) -W_data_tvm = tvm.nd.array(W_sp_np.data, ctx=ctx) -W_indices_tvm = tvm.nd.array(W_sp_np.indices, ctx=ctx) -W_indptr_tvm = tvm.nd.array(W_sp_np.indptr, ctx=ctx) -B_tvm = tvm.nd.array(B_np, ctx=ctx) -Y_tvm = tvm.nd.empty(Y_np.shape, ctx=ctx) +X_tvm = tvm.nd.array(X_np, device=dev) +W_data_tvm = tvm.nd.array(W_sp_np.data, device=dev) +W_indices_tvm = tvm.nd.array(W_sp_np.indices, device=dev) +W_indptr_tvm = tvm.nd.array(W_sp_np.indptr, device=dev) +B_tvm = tvm.nd.array(B_np, device=dev) +Y_tvm = tvm.nd.empty(Y_np.shape, device=dev) func(X_tvm, W_data_tvm, W_indices_tvm, W_indptr_tvm, B_tvm, Y_tvm) @@ -289,7 +289,7 @@ def apply_func(search_policy, state, stage_id): tvm.testing.assert_allclose(Y_np, Y_tvm.asnumpy(), atol=1e-4, rtol=1e-4) # Evaluate execution time. -evaluator = func.time_evaluator(func.entry_name, ctx, min_repeat_ms=500) +evaluator = func.time_evaluator(func.entry_name, dev, min_repeat_ms=500) print( "Execution time of this operator: %.3f ms" % ( diff --git a/tutorials/autotvm/tune_conv2d_cuda.py b/tutorials/autotvm/tune_conv2d_cuda.py index dc8e6e522249d..d14f9c33320c9 100644 --- a/tutorials/autotvm/tune_conv2d_cuda.py +++ b/tutorials/autotvm/tune_conv2d_cuda.py @@ -230,15 +230,15 @@ def conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding): w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32) c_np = conv2d_nchw_python(a_np, w_np, strides, padding) -ctx = tvm.gpu() -a_tvm = tvm.nd.array(a_np, ctx=ctx) -w_tvm = tvm.nd.array(w_np, ctx=ctx) -c_tvm = tvm.nd.empty(c_np.shape, ctx=ctx) +dev = tvm.gpu() +a_tvm = tvm.nd.array(a_np, device=dev) +w_tvm = tvm.nd.array(w_np, device=dev) +c_tvm = tvm.nd.empty(c_np.shape, device=dev) func(a_tvm, w_tvm, c_tvm) tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2) # Evaluate running time. Here we choose a large repeat number (400) to reduce the noise # and the overhead of kernel launch. You can also use nvprof to validate the result. 
-evaluator = func.time_evaluator(func.entry_name, ctx, number=400) +evaluator = func.time_evaluator(func.entry_name, dev, number=400) print("Time cost of this operator: %f" % evaluator(a_tvm, w_tvm, c_tvm).mean) diff --git a/tutorials/autotvm/tune_relay_arm.py b/tutorials/autotvm/tune_relay_arm.py index 2b389235e4fea..9ddcf817c1683 100644 --- a/tutorials/autotvm/tune_relay_arm.py +++ b/tutorials/autotvm/tune_relay_arm.py @@ -348,14 +348,14 @@ def tune_and_evaluate(tuning_opt): rlib = remote.load_module(filename) # upload parameters to device - ctx = remote.context(str(target), 0) - module = runtime.GraphModule(rlib["default"](ctx)) + dev = remote.device(str(target), 0) + module = runtime.GraphModule(rlib["default"](dev)) data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) module.set_input("data", data_tvm) # evaluate print("Evaluate inference time cost...") - ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=10) + ftimer = module.module.time_evaluator("run", dev, number=1, repeat=10) prof_res = np.array(ftimer().results) * 1000 # convert to millisecond print( "Mean inference time (std dev): %.2f ms (%.2f ms)" diff --git a/tutorials/autotvm/tune_relay_cuda.py b/tutorials/autotvm/tune_relay_cuda.py index 148ebbf43dc13..8ae5144eb31db 100644 --- a/tutorials/autotvm/tune_relay_cuda.py +++ b/tutorials/autotvm/tune_relay_cuda.py @@ -237,14 +237,14 @@ def tune_and_evaluate(tuning_opt): lib = relay.build_module.build(mod, target=target, params=params) # load parameters - ctx = tvm.context(str(target), 0) - module = runtime.GraphModule(lib["default"](ctx)) + dev = tvm.device(str(target), 0) + module = runtime.GraphModule(lib["default"](dev)) data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) module.set_input("data", data_tvm) # evaluate print("Evaluate inference time cost...") - ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=600) + ftimer = module.module.time_evaluator("run", dev, number=1, repeat=600) prof_res = np.array(ftimer().results) * 1000 # convert to millisecond print( "Mean inference time (std dev): %.2f ms (%.2f ms)" diff --git a/tutorials/autotvm/tune_relay_mobile_gpu.py b/tutorials/autotvm/tune_relay_mobile_gpu.py index 859ac583236ee..08fc87d4da1b3 100644 --- a/tutorials/autotvm/tune_relay_mobile_gpu.py +++ b/tutorials/autotvm/tune_relay_mobile_gpu.py @@ -351,14 +351,14 @@ def tune_and_evaluate(tuning_opt): rlib = remote.load_module(filename) # upload parameters to device - ctx = remote.context(str(target), 0) - module = runtime.GraphModule(rlib["default"](ctx)) + dev = remote.device(str(target), 0) + module = runtime.GraphModule(rlib["default"](dev)) data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) module.set_input("data", data_tvm) # evaluate print("Evaluate inference time cost...") - ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=30) + ftimer = module.module.time_evaluator("run", dev, number=1, repeat=30) prof_res = np.array(ftimer().results) * 1000 # convert to millisecond print( "Mean inference time (std dev): %.2f ms (%.2f ms)" diff --git a/tutorials/autotvm/tune_relay_x86.py b/tutorials/autotvm/tune_relay_x86.py index 30e62efe0d9dd..b072a3adfea33 100644 --- a/tutorials/autotvm/tune_relay_x86.py +++ b/tutorials/autotvm/tune_relay_x86.py @@ -213,14 +213,14 @@ def tune_and_evaluate(tuning_opt): lib = relay.build_module.build(mod, target=target, params=params) # upload parameters to device - ctx = tvm.cpu() + dev = tvm.cpu() data_tvm = 
tvm.nd.array((np.random.uniform(size=data_shape)).astype(dtype)) - module = runtime.GraphModule(lib["default"](ctx)) + module = runtime.GraphModule(lib["default"](dev)) module.set_input(input_name, data_tvm) # evaluate print("Evaluate inference time cost...") - ftimer = module.module.time_evaluator("run", ctx, number=100, repeat=3) + ftimer = module.module.time_evaluator("run", dev, number=100, repeat=3) prof_res = np.array(ftimer().results) * 1000 # convert to millisecond print( "Mean inference time (std dev): %.2f ms (%.2f ms)" diff --git a/tutorials/frontend/build_gcn.py b/tutorials/frontend/build_gcn.py index b832d18f9c3a9..b21c09692a64d 100644 --- a/tutorials/frontend/build_gcn.py +++ b/tutorials/frontend/build_gcn.py @@ -336,8 +336,8 @@ def prepare_params(g, data): lib = relay.build(mod, target, params=params) # Generate graph runtime -ctx = tvm.context(target, 0) -m = graph_runtime.GraphModule(lib["default"](ctx)) +dev = tvm.device(target, 0) +m = graph_runtime.GraphModule(lib["default"](dev)) ###################################################################### # Run the TVM model, test for accuracy and verify with DGL diff --git a/tutorials/frontend/deploy_model_on_android.py b/tutorials/frontend/deploy_model_on_android.py index ff7ef44a7acba..c69ac8d3f0e3a 100644 --- a/tutorials/frontend/deploy_model_on_android.py +++ b/tutorials/frontend/deploy_model_on_android.py @@ -305,20 +305,20 @@ def transform_image(image): remote = tracker.request(key, priority=0, session_timeout=60) if local_demo: - ctx = remote.cpu(0) + dev = remote.cpu(0) elif test_target == "opencl": - ctx = remote.cl(0) + dev = remote.cl(0) elif test_target == "vulkan": - ctx = remote.vulkan(0) + dev = remote.vulkan(0) else: - ctx = remote.cpu(0) + dev = remote.cpu(0) # upload the library to remote device and load it remote.upload(lib_fname) rlib = remote.load_module("net.so") # create the remote runtime module -module = runtime.GraphModule(rlib["default"](ctx)) +module = runtime.GraphModule(rlib["default"](dev)) ###################################################################### # Execute on TVM @@ -336,7 +336,7 @@ def transform_image(image): print("TVM prediction top-1: {}".format(synset[top1])) print("Evaluate inference time cost...") -ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=10) +ftimer = module.module.time_evaluator("run", dev, number=1, repeat=10) prof_res = np.array(ftimer().results) * 1000 # convert to millisecond print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res))) diff --git a/tutorials/frontend/deploy_model_on_rasp.py b/tutorials/frontend/deploy_model_on_rasp.py index cae9d905898bb..75e142243a387 100644 --- a/tutorials/frontend/deploy_model_on_rasp.py +++ b/tutorials/frontend/deploy_model_on_rasp.py @@ -217,8 +217,8 @@ def transform_image(image): rlib = remote.load_module("net.tar") # create the remote runtime module -ctx = remote.cpu(0) -module = runtime.GraphModule(rlib["default"](ctx)) +dev = remote.cpu(0) +module = runtime.GraphModule(rlib["default"](dev)) # set input data module.set_input("data", tvm.nd.array(x.astype("float32"))) # run diff --git a/tutorials/frontend/deploy_object_detection_pytorch.py b/tutorials/frontend/deploy_object_detection_pytorch.py index 2852dd3ad99d3..6b88b617be18b 100644 --- a/tutorials/frontend/deploy_object_detection_pytorch.py +++ b/tutorials/frontend/deploy_object_detection_pytorch.py @@ -134,8 +134,8 @@ def forward(self, inp): 
###################################################################### # Inference with Relay VM # ----------------------- -ctx = tvm.cpu() -vm = VirtualMachine(vm_exec, ctx) +dev = tvm.cpu() +vm = VirtualMachine(vm_exec, dev) vm.set_input("main", **{input_name: img}) tvm_res = vm.run() diff --git a/tutorials/frontend/deploy_prequantized.py b/tutorials/frontend/deploy_prequantized.py index beba332a8a26b..681bc07413381 100644 --- a/tutorials/frontend/deploy_prequantized.py +++ b/tutorials/frontend/deploy_prequantized.py @@ -90,7 +90,7 @@ def run_tvm_model(mod, params, input_name, inp, target="llvm"): with tvm.transform.PassContext(opt_level=3): lib = relay.build(mod, target=target, params=params) - runtime = tvm.contrib.graph_runtime.GraphModule(lib["default"](tvm.context(target, 0))) + runtime = tvm.contrib.graph_runtime.GraphModule(lib["default"](tvm.device(target, 0))) runtime.set_input(input_name, inp) runtime.run() @@ -198,8 +198,8 @@ def quantize_model(model, inp): # ------------------------- # Here we give an example of how to measure performance of TVM compiled models. n_repeat = 100 # should be bigger to make the measurement more accurate -ctx = tvm.cpu(0) -ftimer = rt_mod.module.time_evaluator("run", ctx, number=1, repeat=n_repeat) +dev = tvm.cpu(0) +ftimer = rt_mod.module.time_evaluator("run", dev, number=1, repeat=n_repeat) prof_res = np.array(ftimer().results) * 1e3 print("Elapsed average ms:", np.mean(prof_res)) diff --git a/tutorials/frontend/deploy_prequantized_tflite.py b/tutorials/frontend/deploy_prequantized_tflite.py index 121ad9dda1514..547704a72fc44 100644 --- a/tutorials/frontend/deploy_prequantized_tflite.py +++ b/tutorials/frontend/deploy_prequantized_tflite.py @@ -231,8 +231,8 @@ def run_tvm(lib): # ------------------- # Here we give an example of how to measure performance of TVM compiled models. n_repeat = 100 # should be bigger to make the measurement more accurate -ctx = tvm.cpu(0) -ftimer = rt_mod.module.time_evaluator("run", ctx, number=1, repeat=n_repeat) +dev = tvm.cpu(0) +ftimer = rt_mod.module.time_evaluator("run", dev, number=1, repeat=n_repeat) prof_res = np.array(ftimer().results) * 1e3 print("Elapsed average ms:", np.mean(prof_res)) diff --git a/tutorials/frontend/deploy_quantized.py b/tutorials/frontend/deploy_quantized.py index e75f6e92a6f1d..b2210b8ab69b1 100644 --- a/tutorials/frontend/deploy_quantized.py +++ b/tutorials/frontend/deploy_quantized.py @@ -39,7 +39,7 @@ batch_size = 1 model_name = "resnet18_v1" target = "cuda" -ctx = tvm.context(target) +dev = tvm.device(target) ############################################################################### # Prepare the Dataset @@ -146,7 +146,7 @@ def quantize(mod, params, data_aware): # ------------- # We create a Relay VM to build and execute the model. def run_inference(mod): - executor = relay.create_executor("vm", mod, ctx, target) + executor = relay.create_executor("vm", mod, dev, target) val_data, batch_fn = get_val_data() for i, batch in enumerate(val_data): data, label = batch_fn(batch) diff --git a/tutorials/frontend/deploy_sparse.py b/tutorials/frontend/deploy_sparse.py index 98004a93c74fe..4d96f4d5de9f1 100644 --- a/tutorials/frontend/deploy_sparse.py +++ b/tutorials/frontend/deploy_sparse.py @@ -106,7 +106,7 @@ # appropriately for your specific machine. CUDA and ROCm are also supported. target = "llvm" # Which device to run on. Should be one of tvm.cpu() or tvm.gpu(). -ctx = tvm.cpu() +dev = tvm.cpu() # If true, then a sparse variant of the network will be run and # benchmarked. 
measure_sparse = True @@ -208,18 +208,18 @@ def import_graphdef( # the weights are sparse, we won't see any speedup because we are using # regular dense matrix multiplications on these dense (but mostly zero) # tensors instead of sparse aware kernels. -def run_relay_graph(mod, params, shape_dict, target, ctx): +def run_relay_graph(mod, params, shape_dict, target, dev): with relay.build_config(opt_level=3): lib = relay.build(mod, target=target, params=params) input_shape = shape_dict["input_1"] dummy_data = np.random.uniform(size=input_shape, low=0, high=input_shape[1]).astype("int32") - m = graph_runtime.GraphModule(lib["default"](ctx)) + m = graph_runtime.GraphModule(lib["default"](dev)) m.set_input(0, dummy_data) m.run() tvm_output = m.get_output(0) - ftimer = m.module.time_evaluator("run", ctx, repeat=5, number=5) + ftimer = m.module.time_evaluator("run", dev, repeat=5, number=5) prof_res = np.array(ftimer().results) * 1000 print( "%-20s %-19s (%s)" @@ -228,9 +228,9 @@ def run_relay_graph(mod, params, shape_dict, target, ctx): return tvm_output -def run_dense(mod, params, shape_dict, target, ctx): +def run_dense(mod, params, shape_dict, target, dev): print("Dense Model Benchmark:") - return run_relay_graph(mod, params, shape_dict, target, ctx) + return run_relay_graph(mod, params, shape_dict, target, dev) ############################################################################### @@ -295,13 +295,13 @@ def deepcopy(param_dic): return new_params -def run_sparse(mod, params, shape_dict, target, ctx, bs_r, sparsity, gen_weights): +def run_sparse(mod, params, shape_dict, target, dev, bs_r, sparsity, gen_weights): mod, params = ddo.simplify_fc_transpose.convert(mod["main"], params) if gen_weights: params = random_sparse_bert_params(mod, params, BS_R=bs_r, BS_C=1, density=1 - sparsity) mod, params = ddo.bsr_dense.convert(mod, params, (bs_r, 1), sparsity_threshold=0.8) print("Block Sparse Model with {blocksize}x1 blocks:".format(blocksize=bs_r)) - return run_relay_graph(mod, params, shape_dict, target, ctx) + return run_relay_graph(mod, params, shape_dict, target, dev) ############################################################################### @@ -312,10 +312,10 @@ def run_sparse(mod, params, shape_dict, target, ctx, bs_r, sparsity, gen_weights # you'll need to uncomment the last line first. def benchmark(): mod, params, shape_dict = import_graphdef(name, batch_size, seq_len) - run_dense(mod, params, shape_dict, target, ctx) + run_dense(mod, params, shape_dict, target, dev) if measure_sparse: gen_weights = "prune" not in name - run_sparse(mod, params, shape_dict, target, ctx, bs_r, sparsity, gen_weights) + run_sparse(mod, params, shape_dict, target, dev, bs_r, sparsity, gen_weights) # benchmark() diff --git a/tutorials/frontend/deploy_ssd_gluoncv.py b/tutorials/frontend/deploy_ssd_gluoncv.py index 478aff255e0ca..9f31ab70731ac 100644 --- a/tutorials/frontend/deploy_ssd_gluoncv.py +++ b/tutorials/frontend/deploy_ssd_gluoncv.py @@ -100,10 +100,10 @@ def build(target): # enabled thrust during cmake by -DUSE_THRUST=ON. 
-def run(lib, ctx): +def run(lib, dev): # Build TVM runtime - m = graph_runtime.GraphModule(lib["default"](ctx)) - tvm_input = tvm.nd.array(x.asnumpy(), ctx=ctx) + m = graph_runtime.GraphModule(lib["default"](dev)) + tvm_input = tvm.nd.array(x.asnumpy(), device=dev) m.set_input("data", tvm_input) # execute m.run() @@ -113,10 +113,10 @@ for target in ["llvm", "cuda"]: - ctx = tvm.context(target, 0) - if ctx.exist: + dev = tvm.device(target, 0) + if dev.exist: lib = build(target) - class_IDs, scores, bounding_boxs = run(lib, ctx) + class_IDs, scores, bounding_boxs = run(lib, dev) ###################################################################### # Display result diff --git a/tutorials/frontend/from_caffe2.py b/tutorials/frontend/from_caffe2.py index 34581c60db3c7..390fd2f320358 100644 --- a/tutorials/frontend/from_caffe2.py +++ b/tutorials/frontend/from_caffe2.py @@ -108,9 +108,9 @@ def transform_image(image): from tvm.contrib import graph_runtime # device: x86 CPU; use tvm.gpu(0) if you run on GPU -ctx = tvm.cpu(0) +dev = tvm.cpu(0) # create a runtime executor module -m = graph_runtime.GraphModule(lib["default"](ctx)) +m = graph_runtime.GraphModule(lib["default"](dev)) # set inputs m.set_input(input_name, tvm.nd.array(data.astype("float32"))) # execute diff --git a/tutorials/frontend/from_coreml.py b/tutorials/frontend/from_coreml.py index c868a7fe2899a..5703f5742d3dd 100644 --- a/tutorials/frontend/from_coreml.py +++ b/tutorials/frontend/from_coreml.py @@ -83,9 +83,9 @@ # The process is no different from other examples from tvm.contrib import graph_runtime -ctx = tvm.cpu(0) +dev = tvm.cpu(0) dtype = "float32" -m = graph_runtime.GraphModule(lib["default"](ctx)) +m = graph_runtime.GraphModule(lib["default"](dev)) # set inputs m.set_input("image", tvm.nd.array(x.astype(dtype))) # execute diff --git a/tutorials/frontend/from_darknet.py b/tutorials/frontend/from_darknet.py index fc770799f51ac..b76c32e831378 100644 --- a/tutorials/frontend/from_darknet.py +++ b/tutorials/frontend/from_darknet.py @@ -96,7 +96,7 @@ # compile the model target = "llvm" target_host = "llvm" -ctx = tvm.cpu(0) +dev = tvm.cpu(0) data = np.empty([batch_size, net.c, net.h, net.w], dtype) shape = {"data": data.shape} print("Compiling the model...") @@ -119,7 +119,7 @@ # The process is no different from other examples. from tvm.contrib import graph_runtime -m = graph_runtime.GraphModule(lib["default"](ctx)) +m = graph_runtime.GraphModule(lib["default"](dev)) # set inputs m.set_input("data", tvm.nd.array(data.astype(dtype))) diff --git a/tutorials/frontend/from_keras.py b/tutorials/frontend/from_keras.py index 25a1e5c9d1fab..5f39a24c9b141 100644 --- a/tutorials/frontend/from_keras.py +++ b/tutorials/frontend/from_keras.py @@ -96,9 +96,9 @@ mod, params = relay.frontend.from_keras(keras_resnet50, shape_dict) # compile the model target = "cuda" -ctx = tvm.gpu(0) +dev = tvm.gpu(0) with tvm.transform.PassContext(opt_level=3): - executor = relay.build_module.create_executor("graph", mod, ctx, target) + executor = relay.build_module.create_executor("graph", mod, dev, target) ###################################################################### # Execute on TVM diff --git a/tutorials/frontend/from_mxnet.py b/tutorials/frontend/from_mxnet.py index d103d17e5d24c..696af6b4a3a53 100644 --- a/tutorials/frontend/from_mxnet.py +++ b/tutorials/frontend/from_mxnet.py @@ -106,9 +106,9 @@ def transform_image(image): # Now, we would like to reproduce the same forward computation using TVM.
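# A condensed sketch of the graph-runtime idiom these frontend tutorials
# share after the rename, assuming `lib` was produced by relay.build and
# `x` is the preprocessed NumPy input for a model whose input is "data".
import tvm
from tvm.contrib import graph_runtime

dev = tvm.cpu(0)  # or tvm.gpu(0) when the module was built for CUDA
m = graph_runtime.GraphModule(lib["default"](dev))
m.set_input("data", tvm.nd.array(x.astype("float32")))
m.run()
out = m.get_output(0).asnumpy()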
from tvm.contrib import graph_runtime -ctx = tvm.gpu(0) +dev = tvm.gpu(0) dtype = "float32" -m = graph_runtime.GraphModule(lib["default"](ctx)) +m = graph_runtime.GraphModule(lib["default"](dev)) # set inputs m.set_input("data", tvm.nd.array(x.astype(dtype))) # execute diff --git a/tutorials/frontend/from_pytorch.py b/tutorials/frontend/from_pytorch.py index b5bcdf6792f94..ecc3e9aef40c7 100644 --- a/tutorials/frontend/from_pytorch.py +++ b/tutorials/frontend/from_pytorch.py @@ -102,7 +102,7 @@ # Compile the graph to llvm target with given input specification. target = "llvm" target_host = "llvm" -ctx = tvm.cpu(0) +dev = tvm.cpu(0) with tvm.transform.PassContext(opt_level=3): lib = relay.build(mod, target=target, target_host=target_host, params=params) @@ -113,7 +113,7 @@ from tvm.contrib import graph_runtime dtype = "float32" -m = graph_runtime.GraphModule(lib["default"](ctx)) +m = graph_runtime.GraphModule(lib["default"](dev)) # Set inputs m.set_input(input_name, tvm.nd.array(img.astype(dtype))) # Execute diff --git a/tutorials/frontend/from_tensorflow.py b/tutorials/frontend/from_tensorflow.py index 5cdc395886169..5a7c98105715f 100644 --- a/tutorials/frontend/from_tensorflow.py +++ b/tutorials/frontend/from_tensorflow.py @@ -73,11 +73,11 @@ # target = 'cuda' # target_host = 'llvm' # layout = "NCHW" -# ctx = tvm.gpu(0) +# dev = tvm.gpu(0) target = "llvm" target_host = "llvm" layout = None -ctx = tvm.cpu(0) +dev = tvm.cpu(0) ###################################################################### # Download required files @@ -155,7 +155,7 @@ from tvm.contrib import graph_runtime dtype = "uint8" -m = graph_runtime.GraphModule(lib["default"](ctx)) +m = graph_runtime.GraphModule(lib["default"](dev)) # set inputs m.set_input("DecodeJpeg/contents", tvm.nd.array(x.astype(dtype))) # execute diff --git a/tutorials/frontend/using_external_lib.py b/tutorials/frontend/using_external_lib.py index 8e7fcd70e3e9a..667dc2de1b9b2 100644 --- a/tutorials/frontend/using_external_lib.py +++ b/tutorials/frontend/using_external_lib.py @@ -77,9 +77,9 @@ target = "cuda" lib = relay.build_module.build(net, target, params=params) -ctx = tvm.context(target, 0) +dev = tvm.device(target, 0) data = np.random.uniform(-1, 1, size=data_shape).astype("float32") -module = runtime.GraphModule(lib["default"](ctx)) +module = runtime.GraphModule(lib["default"](dev)) module.set_input("data", data) module.run() out_shape = (batch_size, out_channels, 224, 224) @@ -498,9 +498,9 @@ target = "cuda -libs=cudnn" # use cudnn for convolution lib = relay.build_module.build(net, target, params=params) -ctx = tvm.context(target, 0) +dev = tvm.device(target, 0) data = np.random.uniform(-1, 1, size=data_shape).astype("float32") -module = runtime.GraphModule(lib["default"](ctx)) +module = runtime.GraphModule(lib["default"](dev)) module.set_input("data", data) module.run() out_shape = (batch_size, out_channels, 224, 224) diff --git a/tutorials/get_started/cross_compilation_and_rpc.py b/tutorials/get_started/cross_compilation_and_rpc.py index 2386e7bdd1359..75985fccf1f3b 100644 --- a/tutorials/get_started/cross_compilation_and_rpc.py +++ b/tutorials/get_started/cross_compilation_and_rpc.py @@ -180,9 +180,9 @@ func = remote.load_module("lib.tar") # create arrays on the remote device -ctx = remote.cpu() -a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx) -b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) +dev = remote.cpu() +a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), dev) +b = tvm.nd.array(np.zeros(1024, 
dtype=A.dtype), dev) # the function will run on the remote device func(a, b) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) @@ -194,7 +194,7 @@ # function over number times, measures the cost per run on the remote # device and returns the measured cost. Network overhead is excluded. -time_f = func.time_evaluator(func.entry_name, ctx, number=10) +time_f = func.time_evaluator(func.entry_name, dev, number=10) cost = time_f(a, b).mean print("%g secs/op" % cost) @@ -245,9 +245,9 @@ def run_opencl(): func = remote.load_module("lib_cl.tar") # run - ctx = remote.cl() - a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) + dev = remote.cl() + a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev) func(a, b) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) print("OpenCL test passed!") diff --git a/tutorials/get_started/relay_quick_start.py b/tutorials/get_started/relay_quick_start.py index 444b915ca7c8f..86c4ac4399ed6 100644 --- a/tutorials/get_started/relay_quick_start.py +++ b/tutorials/get_started/relay_quick_start.py @@ -107,10 +107,10 @@ # Now we can create graph runtime and run the module on Nvidia GPU. # create random input -ctx = tvm.gpu() +dev = tvm.gpu() data = np.random.uniform(-1, 1, size=data_shape).astype("float32") # create module -module = graph_runtime.GraphModule(lib["default"](ctx)) +module = graph_runtime.GraphModule(lib["default"](dev)) # set input and parameters module.set_input("data", data) # run @@ -143,7 +143,7 @@ loaded_lib = tvm.runtime.load_module(path_lib) input_data = tvm.nd.array(np.random.uniform(size=data_shape).astype("float32")) -module = graph_runtime.GraphModule(loaded_lib["default"](ctx)) +module = graph_runtime.GraphModule(loaded_lib["default"](dev)) module.run(data=input_data) out_deploy = module.get_output(0).asnumpy() diff --git a/tutorials/get_started/tensor_expr_get_started.py b/tutorials/get_started/tensor_expr_get_started.py index 7f1bb6a3d1e51..a9952c2422e02 100644 --- a/tutorials/get_started/tensor_expr_get_started.py +++ b/tutorials/get_started/tensor_expr_get_started.py @@ -149,17 +149,17 @@ # We provide a minimal array API in python to aid quick testing and prototyping. # The array API is based on the `DLPack `_ standard. # -# - We first create a GPU context. +# - We first create a GPU device. # - Then tvm.nd.array copies the data to the GPU. # - fadd runs the actual computation. 
# - asnumpy() copies the GPU array back to the CPU and we can use this to verify correctness # -ctx = tvm.context(tgt, 0) +dev = tvm.device(tgt, 0) n = 1024 -a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) -b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) -c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) +a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) +b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) +c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) fadd(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) @@ -294,11 +294,11 @@ fadd_cl = tvm.build(s, [A, B, C], tgt, name="myadd") print("------opencl code------") print(fadd_cl.imported_modules[0].get_source()) - ctx = tvm.cl(0) + dev = tvm.cl(0) n = 1024 - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) fadd_cl(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) diff --git a/tutorials/get_started/tune_matmul_x86.py b/tutorials/get_started/tune_matmul_x86.py index a51f01115f311..931f877595f58 100644 --- a/tutorials/get_started/tune_matmul_x86.py +++ b/tutorials/get_started/tune_matmul_x86.py @@ -150,18 +150,18 @@ def matmul_add(N, L, M, dtype): c_np = np.random.uniform(size=(N, M)).astype(np.float32) out_np = a_np.dot(b_np) + c_np -ctx = tvm.cpu() -a_tvm = tvm.nd.array(a_np, ctx=ctx) -b_tvm = tvm.nd.array(b_np, ctx=ctx) -c_tvm = tvm.nd.array(c_np, ctx=ctx) -out_tvm = tvm.nd.empty(out_np.shape, ctx=ctx) +dev = tvm.cpu() +a_tvm = tvm.nd.array(a_np, device=dev) +b_tvm = tvm.nd.array(b_np, device=dev) +c_tvm = tvm.nd.array(c_np, device=dev) +out_tvm = tvm.nd.empty(out_np.shape, device=dev) func(a_tvm, b_tvm, c_tvm, out_tvm) # Check results np.testing.assert_allclose(out_np, out_tvm.asnumpy(), rtol=1e-3) # Evaluate execution time. -evaluator = func.time_evaluator(func.entry_name, ctx, min_repeat_ms=500) +evaluator = func.time_evaluator(func.entry_name, dev, min_repeat_ms=500) print( "Execution time of this operator: %.3f ms" % (np.median(evaluator(a_tvm, b_tvm, c_tvm, out_tvm).results) * 1000) diff --git a/tutorials/language/extern_op.py b/tutorials/language/extern_op.py index 794101a4fb569..277af712d90b3 100644 --- a/tutorials/language/extern_op.py +++ b/tutorials/language/extern_op.py @@ -77,11 +77,11 @@ # ----------------- # We can verify that the result matches what we expected. 
# -ctx = tvm.cpu(0) +dev = tvm.cpu(0) f = tvm.build(s, [A, B, D, bias], "llvm") -a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx) -b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx) -d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx) +a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), dev) +b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), dev) +d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), dev) bb = 10.0 f(a, b, d, bb) tvm.testing.assert_allclose(d.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()) + 10, rtol=1e-5) @@ -125,8 +125,8 @@ def my_tvm_addone(x, y): ) s = te.create_schedule(B.op) f = tvm.build(s, [A, B], "llvm") -a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx) -b = tvm.nd.array(np.random.uniform(size=(n,)).astype(B.dtype), ctx) +a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), dev) +b = tvm.nd.array(np.random.uniform(size=(n,)).astype(B.dtype), dev) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1, rtol=1e-5) diff --git a/tutorials/language/reduction.py b/tutorials/language/reduction.py index cffa10e6cbb29..f782ac6ca280a 100644 --- a/tutorials/language/reduction.py +++ b/tutorials/language/reduction.py @@ -137,9 +137,9 @@ # Verify the correctness of result kernel by comparing it to numpy. # nn = 128 -ctx = tvm.gpu(0) -a = tvm.nd.array(np.random.uniform(size=(nn, nn)).astype(A.dtype), ctx) -b = tvm.nd.array(np.zeros(nn, dtype=B.dtype), ctx) +dev = tvm.gpu(0) +a = tvm.nd.array(np.random.uniform(size=(nn, nn)).astype(A.dtype), dev) +b = tvm.nd.array(np.zeros(nn, dtype=B.dtype), dev) fcuda(a, b) tvm.testing.assert_allclose(b.asnumpy(), np.sum(a.asnumpy(), axis=1), rtol=1e-4) diff --git a/tutorials/language/scan.py b/tutorials/language/scan.py index 5f513208d56dd..8124b567177f1 100644 --- a/tutorials/language/scan.py +++ b/tutorials/language/scan.py @@ -83,12 +83,12 @@ # numpy to verify the correctness of the result. 
# fscan = tvm.build(s, [X, s_scan], "cuda", name="myscan") -ctx = tvm.gpu(0) +dev = tvm.gpu(0) n = 1024 m = 10 a_np = np.random.uniform(size=(m, n)).astype(s_scan.dtype) -a = tvm.nd.array(a_np, ctx) -b = tvm.nd.array(np.zeros((m, n), dtype=s_scan.dtype), ctx) +a = tvm.nd.array(a_np, dev) +b = tvm.nd.array(np.zeros((m, n), dtype=s_scan.dtype), dev) fscan(a, b) tvm.testing.assert_allclose(b.asnumpy(), np.cumsum(a_np, axis=0)) diff --git a/tutorials/language/tensorize.py b/tutorials/language/tensorize.py index a75b78b65ca44..a1575fe832c30 100644 --- a/tutorials/language/tensorize.py +++ b/tutorials/language/tensorize.py @@ -186,11 +186,11 @@ def gemv_impl(): from tvm.topi.utils import get_const_tuple dtype = A.dtype -ctx = tvm.context("cpu", 0) +dev = tvm.device("cpu", 0) a = np.random.uniform(size=get_const_tuple(A.shape)).astype(dtype) b = np.random.uniform(size=get_const_tuple(B.shape)).astype(dtype) -c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=dtype), ctx) -func(tvm.nd.array(a, ctx), tvm.nd.array(b, ctx), c) +c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=dtype), dev) +func(tvm.nd.array(a, dev), tvm.nd.array(b, dev), c) tvm.testing.assert_allclose(c.asnumpy(), np.dot(a, b.T), rtol=1e-3) ###################################################################### @@ -300,8 +300,8 @@ def _reduce_update(): func = tvm.build(s, [A, B, C], target="llvm", name="gemv") a = np.random.uniform(size=get_const_tuple(A.shape)).astype(dtype) b = np.random.uniform(size=get_const_tuple(B.shape)).astype(dtype) -c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=dtype), ctx) -func(tvm.nd.array(a, ctx), tvm.nd.array(b, ctx), c) +c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=dtype), dev) +func(tvm.nd.array(a, dev), tvm.nd.array(b, dev), c) tvm.testing.assert_allclose(c.asnumpy(), np.dot(a, b.T), rtol=1e-3) ###################################################################### diff --git a/tutorials/micro/micro_tflite.py b/tutorials/micro/micro_tflite.py index 6ad0da5aecbad..d86fc5838c43b 100644 --- a/tutorials/micro/micro_tflite.py +++ b/tutorials/micro/micro_tflite.py @@ -257,7 +257,7 @@ flasher = compiler.flasher() with tvm.micro.Session(binary=micro_binary, flasher=flasher) as session: graph_mod = tvm.micro.create_local_graph_runtime( - graph, session.get_system_lib(), session.context + graph, session.get_system_lib(), session.device ) # Set the model parameters using the lowered parameters produced by `relay.build`. 
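All of the tutorial hunks above apply one mechanical rename: tvm.context(...)/ctx becomes tvm.device(...)/dev (and ctx= keyword arguments become device=), while the surrounding schedule and build code is untouched. A minimal end-to-end sketch of the post-rename pattern — an illustration assuming a TVM build that includes this series, with the old spelling noted in a comment:

import numpy as np
import tvm
import tvm.testing
from tvm import te

n = 1024
A = te.placeholder((n,), name="A")
B = te.compute((n,), lambda i: A[i] + 1.0, name="B")
s = te.create_schedule(B.op)
fadd = tvm.build(s, [A, B], "llvm", name="addone")

dev = tvm.device("llvm", 0)  # formerly: ctx = tvm.context("llvm", 0)
a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
b = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev)
fadd(a, b)
tvm.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1.0)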
diff --git a/tutorials/optimize/opt_conv_cuda.py b/tutorials/optimize/opt_conv_cuda.py index 9cb29b573514a..0cecc82aa8ead 100644 --- a/tutorials/optimize/opt_conv_cuda.py +++ b/tutorials/optimize/opt_conv_cuda.py @@ -238,12 +238,12 @@ # func = tvm.build(s, [A, W, B], "cuda") -ctx = tvm.gpu(0) +dev = tvm.gpu(0) a_np = np.random.uniform(size=(in_size, in_size, in_channel, batch)).astype(A.dtype) w_np = np.random.uniform(size=(kernel, kernel, in_channel, out_channel)).astype(W.dtype) -a = tvm.nd.array(a_np, ctx) -w = tvm.nd.array(w_np, ctx) -b = tvm.nd.array(np.zeros((out_size, out_size, out_channel, batch), dtype=B.dtype), ctx) +a = tvm.nd.array(a_np, dev) +w = tvm.nd.array(w_np, dev) +b = tvm.nd.array(np.zeros((out_size, out_size, out_channel, batch), dtype=B.dtype), dev) func(a, w, b) -evaluator = func.time_evaluator(func.entry_name, ctx, number=1) +evaluator = func.time_evaluator(func.entry_name, dev, number=1) print("Convolution: %f ms" % (evaluator(a, w, b).mean * 1e3)) diff --git a/tutorials/optimize/opt_conv_tensorcore.py b/tutorials/optimize/opt_conv_tensorcore.py index 0cbcf7e033426..0a7798d1b9e18 100644 --- a/tutorials/optimize/opt_conv_tensorcore.py +++ b/tutorials/optimize/opt_conv_tensorcore.py @@ -392,16 +392,16 @@ def intrin_func(ins, outs): # Since TensorCores are only supported in NVIDIA GPU with Compute Capability 7.0 or higher, it may not # be able to run on our build server -ctx = tvm.gpu(0) -if nvcc.have_tensorcore(ctx.compute_version): +dev = tvm.gpu(0) +if nvcc.have_tensorcore(dev.compute_version): with tvm.transform.PassContext(config={"tir.UnrollLoop": {"auto_max_step": 16}}): func = tvm.build(s, [A, W, Conv], "cuda") a_np = np.random.uniform(size=data_shape).astype(A.dtype) w_np = np.random.uniform(size=kernel_shape).astype(W.dtype) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - c = tvm.nd.array(np.zeros(output_shape, dtype=Conv.dtype), ctx) - evaluator = func.time_evaluator(func.entry_name, ctx, number=10) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + c = tvm.nd.array(np.zeros(output_shape, dtype=Conv.dtype), dev) + evaluator = func.time_evaluator(func.entry_name, dev, number=10) print("conv2d with tensor core: %f ms" % (evaluator(a, w, c).mean * 1e3)) ############################################################################### diff --git a/tutorials/optimize/opt_gemm.py b/tutorials/optimize/opt_gemm.py index 971269d8c11e6..72a8b0a0701ef 100644 --- a/tutorials/optimize/opt_gemm.py +++ b/tutorials/optimize/opt_gemm.py @@ -75,11 +75,11 @@ # To get the best performance, please change the following line # to llvm -mcpu=core-avx2, or specific type of CPU you use target = "llvm" -ctx = tvm.context(target, 0) +dev = tvm.device(target, 0) # Random generated tensor for testing -a = tvm.nd.array(numpy.random.rand(M, K).astype(dtype), ctx) -b = tvm.nd.array(numpy.random.rand(K, N).astype(dtype), ctx) +a = tvm.nd.array(numpy.random.rand(M, K).astype(dtype), dev) +b = tvm.nd.array(numpy.random.rand(K, N).astype(dtype), dev) np_repeat = 100 np_runing_time = timeit.timeit( @@ -108,11 +108,11 @@ func = tvm.build(s, [A, B, C], target=target, name="mmult") assert func -c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx) +c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), dev) func(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5) -evaluator = func.time_evaluator(func.entry_name, ctx, number=1) +evaluator = func.time_evaluator(func.entry_name, dev, number=1) print("Baseline: %f" % evaluator(a, b, c).mean) 
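The time_evaluator calls above pick up the same rename: the Device is now the second positional argument. A short usage sketch of the returned profiler, assuming a function func already built with tvm.build and arrays a, b, c resident on dev:

import numpy as np

# number * repeat kernel launches; one averaged timing per repeat
evaluator = func.time_evaluator(func.entry_name, dev, number=10, repeat=3)
prof = evaluator(a, b, c)
print("mean: %f s" % prof.mean)                  # average over all repeats
print("median: %f s" % np.median(prof.results))  # robust to warm-up noise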
################################################################################################ @@ -143,13 +143,13 @@ func = tvm.build(s, [A, B, C], target=target, name="mmult") assert func -c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx) +c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), dev) func(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5) # By simply tiling the loop 32x32, and hoisting ko, ki outside the blocking loops, # we can see big speedup compared with the baseline. -evaluator = func.time_evaluator(func.entry_name, ctx, number=10) +evaluator = func.time_evaluator(func.entry_name, dev, number=10) print("Opt1: %f" % evaluator(a, b, c).mean) ################################################################################################ @@ -179,11 +179,11 @@ func = tvm.build(s, [A, B, C], target=target, name="mmult") assert func -c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx) +c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), dev) func(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5) -evaluator = func.time_evaluator(func.entry_name, ctx, number=10) +evaluator = func.time_evaluator(func.entry_name, dev, number=10) print("Opt2: %f" % evaluator(a, b, c).mean) ################################################################################################ @@ -212,11 +212,11 @@ func = tvm.build(s, [A, B, C], target=target, name="mmult") assert func -c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx) +c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), dev) func(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5) -evaluator = func.time_evaluator(func.entry_name, ctx, number=10) +evaluator = func.time_evaluator(func.entry_name, dev, number=10) print("Opt3: %f" % evaluator(a, b, c).mean) ################################################################################################ @@ -268,11 +268,11 @@ func = tvm.build(s, [A, B, C], target=target, name="mmult") assert func -c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx) +c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), dev) func(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5) -evaluator = func.time_evaluator(func.entry_name, ctx, number=10) +evaluator = func.time_evaluator(func.entry_name, dev, number=10) print("Opt4: %f" % evaluator(a, b, c).mean) ################################################################################################ @@ -314,11 +314,11 @@ func = tvm.build(s, [A, B, C], target=target, name="mmult") assert func -c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx) +c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), dev) func(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5) -evaluator = func.time_evaluator(func.entry_name, ctx, number=10) +evaluator = func.time_evaluator(func.entry_name, dev, number=10) print("Opt5: %f" % evaluator(a, b, c).mean) ################################################################################################ @@ -357,11 +357,11 @@ func = tvm.build(s, [A, B, C], target=target, name="mmult") assert func -c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx) +c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), dev) func(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5) -evaluator = func.time_evaluator(func.entry_name, ctx, number=50) +evaluator = func.time_evaluator(func.entry_name, dev, number=50) opt6_time = evaluator(a, b, c).mean print("Opt6: %f" % opt6_time) diff --git a/tutorials/optimize/opt_matmul_auto_tensorcore.py 
b/tutorials/optimize/opt_matmul_auto_tensorcore.py index f5450b9524c60..03682a05e86fa 100644 --- a/tutorials/optimize/opt_matmul_auto_tensorcore.py +++ b/tutorials/optimize/opt_matmul_auto_tensorcore.py @@ -252,8 +252,8 @@ def test_gemm(N, L, M, dtype, layout): if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"): raise Exception("skip building this tutorial because cuda is not enabled..") -ctx = tvm.gpu() -if not nvcc.have_tensorcore(ctx.compute_version): +dev = tvm.gpu() +if not nvcc.have_tensorcore(dev.compute_version): raise Exception("the gpu has no tensorcore, skipping...") M, N, L = 512, 32, 512 @@ -385,14 +385,14 @@ def tune_and_evaluate(M, N, L, dtype, layout): for k in range(32): b_np[i, j] = b_np[i, j] | ((b_np_int[i, j * 32 + k] & 0xF) << (31 - k)) - c_tvm = tvm.nd.array(np.zeros(c_np.shape, dtype=c_np_type), ctx=ctx) - a_tvm = tvm.nd.array(a_np, ctx=ctx) - b_tvm = tvm.nd.array(b_np, ctx=ctx) + c_tvm = tvm.nd.array(np.zeros(c_np.shape, dtype=c_np_type), device=dev) + a_tvm = tvm.nd.array(a_np, device=dev) + b_tvm = tvm.nd.array(b_np, device=dev) func(a_tvm, b_tvm, c_tvm) tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-3) - evaluator = func.time_evaluator(func.entry_name, ctx, number=100) + evaluator = func.time_evaluator(func.entry_name, dev, number=100) print("Time cost of this operator: %f" % evaluator(a_tvm, b_tvm, c_tvm).mean) diff --git a/tutorials/topi/intro_topi.py b/tutorials/topi/intro_topi.py index 2459cf2495065..1fefae585a2f6 100644 --- a/tutorials/topi/intro_topi.py +++ b/tutorials/topi/intro_topi.py @@ -99,13 +99,13 @@ # We can test the correctness by comparing with :code:`numpy` result as follows # func = tvm.build(sg, [a, b, g], "cuda") -ctx = tvm.gpu(0) +dev = tvm.gpu(0) a_np = np.random.uniform(size=(x, y, y)).astype(a.dtype) b_np = np.random.uniform(size=(y, y)).astype(b.dtype) g_np = np.sum(np.add(a_np + b_np, a_np * b_np) / 2.0) -a_nd = tvm.nd.array(a_np, ctx) -b_nd = tvm.nd.array(b_np, ctx) -g_nd = tvm.nd.array(np.zeros(g_np.shape, dtype=g_np.dtype), ctx) +a_nd = tvm.nd.array(a_np, dev) +b_nd = tvm.nd.array(b_np, dev) +g_nd = tvm.nd.array(np.zeros(g_np.shape, dtype=g_np.dtype), dev) func(a_nd, b_nd, g_nd) tvm.testing.assert_allclose(g_nd.asnumpy(), g_np, rtol=1e-5) diff --git a/vta/python/vta/top/graphpack.py b/vta/python/vta/top/graphpack.py index 99e67481e4d96..8998f57123813 100644 --- a/vta/python/vta/top/graphpack.py +++ b/vta/python/vta/top/graphpack.py @@ -201,8 +201,8 @@ class ExprDeviceAnnot(ExprMutator): """ def __init__(self, start=-1, end=-1): - self.ext_ctx = tvm.context("ext_dev") - self.cpu_ctx = tvm.context("cpu") + self.ext_dev = tvm.device("ext_dev") + self.cpu_dev = tvm.device("cpu") self.cast = op.op.get("cast") self.counter = -1 self.start = start @@ -217,12 +217,12 @@ def visit_call(self, call): self.counter += 1 if self.counter == self.start: ret = relay.Call(call.op, args, call.attrs) - ret = relay.annotation.on_device(ret, self.ext_ctx) + ret = relay.annotation.on_device(ret, self.ext_dev) return ret if self.counter == self.end: ret = relay.Call(call.op, args, call.attrs) - ret = relay.annotation.on_device(ret, self.cpu_ctx) + ret = relay.annotation.on_device(ret, self.cpu_dev) return ret if self.counter > self.start and self.counter < self.end: @@ -232,7 +232,7 @@ def visit_call(self, call): if self.is_float_op(call): return ret - return relay.annotation.on_device(ret, self.ext_ctx) + return relay.annotation.on_device(ret, self.ext_dev) return relay.Call(self.visit(call.op), args, call.attrs) diff --git 
a/vta/runtime/device_api.cc b/vta/runtime/device_api.cc index 0fea7ba5e3645..b021ed1039336 100644 --- a/vta/runtime/device_api.cc +++ b/vta/runtime/device_api.cc @@ -33,38 +33,38 @@ namespace runtime { class VTADeviceAPI final : public DeviceAPI { public: - void SetDevice(TVMContext ctx) final {} + void SetDevice(Device dev) final {} - void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final { + void GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) final { if (kind == kExist) { *rv = 1; } } - void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment, DLDataType type_hint) final { + void* AllocDataSpace(Device dev, size_t size, size_t alignment, DLDataType type_hint) final { return VTABufferAlloc(size); } - void FreeDataSpace(TVMContext ctx, void* ptr) final { VTABufferFree(ptr); } + void FreeDataSpace(Device dev, void* ptr) final { VTABufferFree(ptr); } void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, - TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, + Device dev_from, Device dev_to, DLDataType type_hint, TVMStreamHandle stream) final { int kind_mask = 0; - if (ctx_from.device_type != kDLCPU) { + if (dev_from.device_type != kDLCPU) { kind_mask |= 2; } - if (ctx_to.device_type != kDLCPU) { + if (dev_to.device_type != kDLCPU) { kind_mask |= 1; } VTABufferCopy(from, from_offset, to, to_offset, size, kind_mask); } - void StreamSync(TVMContext ctx, TVMStreamHandle stream) final {} + void StreamSync(Device dev, TVMStreamHandle stream) final {} - void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final; + void* AllocWorkspace(Device dev, size_t size, DLDataType type_hint) final; - void FreeWorkspace(TVMContext ctx, void* data) final; + void FreeWorkspace(Device dev, void* data) final; static VTADeviceAPI* Global() { static VTADeviceAPI* inst = new VTADeviceAPI(); @@ -76,12 +76,12 @@ struct VTAWorkspacePool : public WorkspacePool { VTAWorkspacePool() : WorkspacePool(kDLExtDev, VTADeviceAPI::Global()) {} }; -void* VTADeviceAPI::AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) { - return dmlc::ThreadLocalStore::Get()->AllocWorkspace(ctx, size); +void* VTADeviceAPI::AllocWorkspace(Device dev, size_t size, DLDataType type_hint) { + return dmlc::ThreadLocalStore::Get()->AllocWorkspace(dev, size); } -void VTADeviceAPI::FreeWorkspace(TVMContext ctx, void* data) { - dmlc::ThreadLocalStore::Get()->FreeWorkspace(ctx, data); +void VTADeviceAPI::FreeWorkspace(Device dev, void* data) { + dmlc::ThreadLocalStore::Get()->FreeWorkspace(dev, data); } // Register device api with override. 
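Note that every C++ DeviceAPI hook above now takes a Device where it previously took a TVMContext; in particular, the GetAttr(dev, kExist, ...) override is what answers device-existence queries from the frontend. A small sketch of the matching Python surface — an assumption-laden illustration for a post-rename build where the VTA runtime registers the "ext_dev" device:

import tvm

dev = tvm.device("ext_dev", 0)  # formerly: tvm.context("ext_dev", 0)
print(dev.device_type, dev.device_id)
print(dev.exist)  # resolved through DeviceAPI::GetAttr with kind == kExist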
diff --git a/vta/tests/python/integration/test_benchmark_gemm.py b/vta/tests/python/integration/test_benchmark_gemm.py index 824aed6efa028..f69766ceaaad2 100644 --- a/vta/tests/python/integration/test_benchmark_gemm.py +++ b/vta/tests/python/integration/test_benchmark_gemm.py @@ -66,7 +66,7 @@ def verify(s): remote.upload(temp.relpath("gemm.o")) f = remote.load_module("gemm.o") # verify - ctx = remote.ext_dev(0) + dev = remote.ext_dev(0) # Data in original format data_orig = np.random.randint(-128, 128, size=(batch_size, channel)).astype(data.dtype) weight_orig = np.random.randint(-128, 128, size=(channel, channel)).astype(weight.dtype) @@ -77,9 +77,9 @@ def verify(s): channel // env.BLOCK_OUT, env.BLOCK_OUT, channel // env.BLOCK_IN, env.BLOCK_IN ).transpose((0, 2, 1, 3)) res_np = np.zeros(res_shape).astype(res.dtype) - data_arr = tvm.nd.array(data_packed, ctx) - weight_arr = tvm.nd.array(weight_packed, ctx) - res_arr = tvm.nd.array(res_np, ctx) + data_arr = tvm.nd.array(data_packed, dev) + weight_arr = tvm.nd.array(weight_packed, dev) + res_arr = tvm.nd.array(res_np, dev) res_ref = np.zeros(res_shape).astype(env.acc_dtype) for b in range(batch_size // env.BATCH): for i in range(channel // env.BLOCK_OUT): @@ -90,7 +90,7 @@ def verify(s): ) res_ref = np.right_shift(res_ref, 8) res_ref = np.clip(res_ref, 0, (1 << (env.INP_WIDTH - 1)) - 1).astype(res.dtype) - time_f = f.time_evaluator("gemm", ctx, number=20) + time_f = f.time_evaluator("gemm", dev, number=20) if env.TARGET in ["sim", "tsim"]: simulator.clear_stats() cost = time_f(data_arr, weight_arr, res_arr) diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_conv2d.py index cad560c208b65..b82c3a90c9d00 100644 --- a/vta/tests/python/integration/test_benchmark_topi_conv2d.py +++ b/vta/tests/python/integration/test_benchmark_topi_conv2d.py @@ -228,14 +228,14 @@ def get_ref_data(): mod.save(temp.relpath("conv2d.o")) remote.upload(temp.relpath("conv2d.o")) f = remote.load_module("conv2d.o") - ctx = remote.context(str(target)) + dev = remote.device(str(target)) res_np = np.zeros(topi.utils.get_const_tuple(res.shape)).astype(res.dtype) - data_arr = tvm.nd.array(data_np, ctx) - kernel_arr = tvm.nd.array(kernel_np, ctx) - bias_arr = tvm.nd.array(bias_np, ctx) - res_arr = tvm.nd.array(res_np, ctx) - time_f = f.time_evaluator("conv2d", ctx, number=samples) + data_arr = tvm.nd.array(data_np, dev) + kernel_arr = tvm.nd.array(kernel_np, dev) + bias_arr = tvm.nd.array(bias_np, dev) + res_arr = tvm.nd.array(res_np, dev) + time_f = f.time_evaluator("conv2d", dev, number=samples) # In vta sim mode, collect simulator runtime statistics stats = {} diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d_transpose.py b/vta/tests/python/integration/test_benchmark_topi_conv2d_transpose.py index f750225ed8f75..d2516faac00bb 100644 --- a/vta/tests/python/integration/test_benchmark_topi_conv2d_transpose.py +++ b/vta/tests/python/integration/test_benchmark_topi_conv2d_transpose.py @@ -224,13 +224,13 @@ def get_ref_data(): mod.save(temp.relpath("conv2d_transpose.o")) remote.upload(temp.relpath("conv2d_transpose.o")) f = remote.load_module("conv2d_transpose.o") - ctx = remote.context(str(target)) + dev = remote.device(str(target)) res_np = np.zeros(topi.utils.get_const_tuple(res.shape)).astype(res.dtype) - data_arr = tvm.nd.array(data_np, ctx) - kernel_arr = tvm.nd.array(kernel_np, ctx) - res_arr = tvm.nd.array(res_np, ctx) - time_f = f.time_evaluator("conv2d_transpose", ctx, 
number=samples) + data_arr = tvm.nd.array(data_np, dev) + kernel_arr = tvm.nd.array(kernel_np, dev) + res_arr = tvm.nd.array(res_np, dev) + time_f = f.time_evaluator("conv2d_transpose", dev, number=samples) # In vta sim mode, collect simulator runtime statistics stats = {} diff --git a/vta/tests/python/integration/test_benchmark_topi_dense.py b/vta/tests/python/integration/test_benchmark_topi_dense.py index 0b604108a35fb..ceeed1ce8ddb0 100644 --- a/vta/tests/python/integration/test_benchmark_topi_dense.py +++ b/vta/tests/python/integration/test_benchmark_topi_dense.py @@ -135,13 +135,13 @@ def get_ref_data(): mod.save(temp.relpath("dense.o")) remote.upload(temp.relpath("dense.o")) f = remote.load_module("dense.o") - ctx = remote.context(str(target)) + dev = remote.device(str(target)) res_np = np.zeros(topi.utils.get_const_tuple(res.shape)).astype(res.dtype) - data_arr = tvm.nd.array(data_np, ctx) - kernel_arr = tvm.nd.array(kernel_np, ctx) - res_arr = tvm.nd.array(res_np, ctx) - time_f = f.time_evaluator("dense", ctx, number=samples) + data_arr = tvm.nd.array(data_np, dev) + kernel_arr = tvm.nd.array(kernel_np, dev) + res_arr = tvm.nd.array(res_np, dev) + time_f = f.time_evaluator("dense", dev, number=samples) # In vta sim mode, collect simulator runtime statistics stats = {} diff --git a/vta/tests/python/integration/test_benchmark_topi_group_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_group_conv2d.py index da6ba5b8fb949..b7c7b0aa0a8d8 100644 --- a/vta/tests/python/integration/test_benchmark_topi_group_conv2d.py +++ b/vta/tests/python/integration/test_benchmark_topi_group_conv2d.py @@ -222,14 +222,14 @@ def get_ref_data(): mod.save(temp.relpath("conv2d.o")) remote.upload(temp.relpath("conv2d.o")) f = remote.load_module("conv2d.o") - ctx = remote.context(str(target)) + dev = remote.device(str(target)) res_np = np.zeros(topi.utils.get_const_tuple(res.shape)).astype(res.dtype) - data_arr = tvm.nd.array(data_np, ctx) - kernel_arr = tvm.nd.array(kernel_np, ctx) - bias_arr = tvm.nd.array(bias_np, ctx) - res_arr = tvm.nd.array(res_np, ctx) - time_f = f.time_evaluator("conv2d", ctx, number=samples) + data_arr = tvm.nd.array(data_np, dev) + kernel_arr = tvm.nd.array(kernel_np, dev) + bias_arr = tvm.nd.array(bias_np, dev) + res_arr = tvm.nd.array(res_np, dev) + time_f = f.time_evaluator("conv2d", dev, number=samples) # In vta sim mode, collect simulator runtime statistics stats = {} diff --git a/vta/tests/python/unittest/test_vta_insn.py b/vta/tests/python/unittest/test_vta_insn.py index b83510f4a9dcc..2817ef01b5fa0 100644 --- a/vta/tests/python/unittest/test_vta_insn.py +++ b/vta/tests/python/unittest/test_vta_insn.py @@ -59,11 +59,11 @@ def _run(env, remote): remote.upload(temp.relpath("load_act.o")) f = remote.load_module("load_act.o") # verify - ctx = remote.ext_dev(0) + dev = remote.ext_dev(0) x_np = np.random.randint(1, 10, size=(n, n, env.BATCH, env.BLOCK_OUT)).astype(x.dtype) y_np = x_np.astype(y.dtype) - x_nd = tvm.nd.array(x_np, ctx) - y_nd = tvm.nd.empty(y_np.shape, ctx=ctx, dtype=y_np.dtype) + x_nd = tvm.nd.array(x_np, dev) + y_nd = tvm.nd.empty(y_np.shape, device=dev, dtype=y_np.dtype) if env.TARGET in ["sim", "tsim"]: simulator.clear_stats() @@ -130,7 +130,7 @@ def check_padded_load(pad_before, pad_after, test_name=None): remote.upload(temp.relpath("padded_load.o")) f = remote.load_module("padded_load.o") # verify - ctx = remote.ext_dev(0) + dev = remote.ext_dev(0) x_np = np.random.randint(0, 10, size=(n, m, env.BATCH, env.BLOCK_OUT)).astype(x.dtype) y_np = np.zeros( ( 
@@ -141,8 +141,8 @@ def check_padded_load(pad_before, pad_after, test_name=None): ) ).astype(y.dtype) y_np[pad_before[0] : pad_before[0] + n, pad_before[1] : pad_before[1] + m, :] = x_np - x_nd = tvm.nd.array(x_np, ctx) - y_nd = tvm.nd.empty(y_np.shape, ctx=ctx, dtype=y_np.dtype) + x_nd = tvm.nd.array(x_np, dev) + y_nd = tvm.nd.empty(y_np.shape, device=dev, dtype=y_np.dtype) if env.TARGET in ["sim", "tsim"]: simulator.clear_stats() @@ -214,7 +214,7 @@ def verify(s, name=None): remote.upload(temp.relpath("gemm.o")) f = remote.load_module("gemm.o") # verify - ctx = remote.ext_dev(0) + dev = remote.ext_dev(0) x_np = np.random.randint(-128, 128, size=(o, n, env.BATCH, env.BLOCK_IN)).astype( x.dtype ) @@ -222,9 +222,9 @@ def verify(s, name=None): w.dtype ) y_np = np.zeros((o, m, env.BATCH, env.BLOCK_OUT)).astype(y.dtype) - x_nd = tvm.nd.array(x_np, ctx) - w_nd = tvm.nd.array(w_np, ctx) - y_nd = tvm.nd.array(y_np, ctx) + x_nd = tvm.nd.array(x_np, dev) + w_nd = tvm.nd.array(w_np, dev) + y_nd = tvm.nd.array(y_np, dev) y_np = y_np.astype(env.acc_dtype) for b in range(o): for i in range(m): @@ -376,7 +376,7 @@ def check_alu(tvm_op, np_op=None, use_imm=False, test_name=None): remote.upload(temp.relpath("load_act.o")) f = remote.load_module("load_act.o") # verify - ctx = remote.ext_dev(0) + dev = remote.ext_dev(0) a_np = np.random.randint(-16, 16, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(a.dtype) if use_imm: res_np = np_op(a_np, imm) if np_op else tvm_op(a_np, imm) @@ -386,8 +386,8 @@ def check_alu(tvm_op, np_op=None, use_imm=False, test_name=None): ) res_np = np_op(a_np, b_np) if np_op else tvm_op(a_np, b_np) res_np = res_np.astype(res.dtype) - a_nd = tvm.nd.array(a_np, ctx) - res_nd = tvm.nd.array(np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), ctx) + a_nd = tvm.nd.array(a_np, dev) + res_nd = tvm.nd.array(np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), dev) if env.TARGET in ["sim", "tsim"]: simulator.clear_stats() @@ -395,7 +395,7 @@ def check_alu(tvm_op, np_op=None, use_imm=False, test_name=None): if use_imm: f(a_nd, res_nd) else: - b_nd = tvm.nd.array(b_np, ctx) + b_nd = tvm.nd.array(b_np, dev) f(a_nd, b_nd, res_nd) np.testing.assert_equal(res_np, res_nd.asnumpy()) @@ -459,11 +459,11 @@ def _run(env, remote): remote.upload(temp.relpath("load_act.o")) f = remote.load_module("load_act.o") # verify - ctx = remote.ext_dev(0) + dev = remote.ext_dev(0) a_np = np.random.randint(-256, 256, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(a.dtype) res_np = np.clip(a_np, 0, (1 << (env.INP_WIDTH - 1)) - 1).astype(res.dtype) - a_nd = tvm.nd.array(a_np, ctx) - res_nd = tvm.nd.array(np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), ctx) + a_nd = tvm.nd.array(a_np, dev) + res_nd = tvm.nd.array(np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), dev) if env.TARGET in ["sim", "tsim"]: simulator.clear_stats() @@ -521,12 +521,12 @@ def _run(env, remote): remote.upload(temp.relpath("load_act.o")) f = remote.load_module("load_act.o") # verify - ctx = remote.ext_dev(0) + dev = remote.ext_dev(0) a_np = np.random.randint(-10, 10, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(a.dtype) res_np = np.right_shift((a_np + imm_shift), imm_scale) res_np = res_np.astype(res.dtype) - a_nd = tvm.nd.array(a_np, ctx) - res_nd = tvm.nd.array(np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), ctx) + a_nd = tvm.nd.array(a_np, dev) + res_nd = tvm.nd.array(np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), dev) if env.TARGET in ["sim", "tsim"]: 
simulator.clear_stats() @@ -547,9 +547,9 @@ def _run(env, remote): def test_runtime_array(): def _run(env, remote): n = 100 - ctx = remote.ext_dev(0) + dev = remote.ext_dev(0) x_np = np.random.randint(1, 10, size=(n, n, env.BATCH, env.BLOCK_OUT)).astype("int8") - x_nd = tvm.nd.array(x_np, ctx) + x_nd = tvm.nd.array(x_np, dev) np.testing.assert_equal(x_np, x_nd.asnumpy()) vta.testing.run(_run) diff --git a/web/emcc/tvmjs_support.cc b/web/emcc/tvmjs_support.cc index 12f930f491a59..101f53ac64a0a 100644 --- a/web/emcc/tvmjs_support.cc +++ b/web/emcc/tvmjs_support.cc @@ -180,14 +180,14 @@ class AsyncLocalSession : public LocalSession { try { DLTensor local_from; local_from.data = local_from_bytes; - local_from.ctx = TVMContext{kDLCPU, 0}; + local_from.device = Device{kDLCPU, 0}; local_from.ndim = remote_to->ndim; local_from.shape = remote_to->shape; local_from.dtype = remote_to->dtype; local_from.strides = nullptr; local_from.byte_offset = 0; - this->GetDeviceAPI(remote_to->ctx)->CopyDataFromTo(&local_from, remote_to, nullptr); - this->AsyncStreamWait(remote_to->ctx, nullptr, on_complete); + this->GetDeviceAPI(remote_to->device)->CopyDataFromTo(&local_from, remote_to, nullptr); + this->AsyncStreamWait(remote_to->device, nullptr, on_complete); } catch (const std::runtime_error& e) { this->SendException(on_complete, e.what()); } @@ -198,27 +198,27 @@ class AsyncLocalSession : public LocalSession { try { DLTensor local_to; local_to.data = local_to_bytes; - local_to.ctx = TVMContext{kDLCPU, 0}; + local_to.device = Device{kDLCPU, 0}; local_to.ndim = remote_from->ndim; local_to.shape = remote_from->shape; local_to.dtype = remote_from->dtype; local_to.strides = nullptr; local_to.byte_offset = 0; - this->GetDeviceAPI(remote_from->ctx)->CopyDataFromTo(&local_to, remote_from, nullptr); - this->AsyncStreamWait(remote_from->ctx, nullptr, on_complete); + this->GetDeviceAPI(remote_from->device)->CopyDataFromTo(&local_to, remote_from, nullptr); + this->AsyncStreamWait(remote_from->device, nullptr, on_complete); } catch (const std::runtime_error& e) { this->SendException(on_complete, e.what()); } } - void AsyncStreamWait(TVMContext ctx, TVMStreamHandle stream, FAsyncCallback on_complete) final { - if (ctx.device_type == kDLCPU) { + void AsyncStreamWait(Device dev, TVMStreamHandle stream, FAsyncCallback on_complete) final { + if (dev.device_type == kDLCPU) { TVMValue value; int32_t tcode = kTVMNullptr; value.v_handle = nullptr; on_complete(RPCCode::kReturn, TVMArgs(&value, &tcode, 1)); } else { - CHECK(ctx.device_type == static_cast(kDLWebGPU)); + CHECK(dev.device_type == static_cast(kDLWebGPU)); if (async_wait_ == nullptr) { async_wait_ = tvm::runtime::Registry::Get("__async.wasm.WebGPUWaitForTasks"); } @@ -242,25 +242,25 @@ class AsyncLocalSession : public LocalSession { // time evaluator PackedFunc GetTimeEvaluator(Optional opt_mod, std::string name, int device_type, int device_id, int number, int repeat, int min_repeat_ms) { - TVMContext ctx; - ctx.device_type = static_cast(device_type); - ctx.device_id = device_id; + Device dev; + dev.device_type = static_cast(device_type); + dev.device_id = device_id; if (opt_mod.defined()) { Module m = opt_mod.value(); std::string tkey = m->type_key(); - return WrapWasmTimeEvaluator(m.GetFunction(name, false), ctx, number, repeat, min_repeat_ms); + return WrapWasmTimeEvaluator(m.GetFunction(name, false), dev, number, repeat, min_repeat_ms); } else { auto* pf = runtime::Registry::Get(name); CHECK(pf != nullptr) << "Cannot find " << name << " in the global function"; - 
return WrapWasmTimeEvaluator(*pf, ctx, number, repeat, min_repeat_ms); + return WrapWasmTimeEvaluator(*pf, dev, number, repeat, min_repeat_ms); } } // time evaluator - PackedFunc WrapWasmTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repeat, + PackedFunc WrapWasmTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat, int min_repeat_ms) { - auto ftimer = [pf, ctx, number, repeat, min_repeat_ms](TVMArgs args, TVMRetValue* rv) { + auto ftimer = [pf, dev, number, repeat, min_repeat_ms](TVMArgs args, TVMRetValue* rv) { // the function is a async function. PackedFunc on_complete = args[args.size() - 1]; // keep argument alive in finvoke so that they @@ -277,7 +277,7 @@ class AsyncLocalSession : public LocalSession { }; auto* time_exec = runtime::Registry::Get("__async.wasm.TimeExecution"); CHECK(time_exec != nullptr) << "Cannot find wasm.GetTimer in the global function"; - (*time_exec)(TypedPackedFunc(finvoke), ctx, number, repeat, min_repeat_ms, + (*time_exec)(TypedPackedFunc(finvoke), dev, number, repeat, min_repeat_ms, on_complete); }; return PackedFunc(ftimer); diff --git a/web/emcc/webgpu_runtime.cc b/web/emcc/webgpu_runtime.cc index 01e42ef3faa85..e98ca5e852197 100644 --- a/web/emcc/webgpu_runtime.cc +++ b/web/emcc/webgpu_runtime.cc @@ -80,35 +80,34 @@ class WebGPUDeviceAPI : public DeviceAPI { copy_within_gpu_ = getter("deviceCopyWithinGPU"); } - void SetDevice(TVMContext ctx) final {} - void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final { + void SetDevice(Device dev) final {} + void GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) final { if (kind == kExist) { *rv = 1; } } - void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, - DLDataType type_hint) final { + void* AllocDataSpace(Device dev, size_t nbytes, size_t alignment, DLDataType type_hint) final { double ptr_number = alloc_space_(nbytes); return reinterpret_cast(static_cast(ptr_number)); } - void FreeDataSpace(TVMContext ctx, void* ptr) final { return free_space_(ptr); } + void FreeDataSpace(Device dev, void* ptr) final { return free_space_(ptr); } protected: void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, - TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, + Device dev_from, Device dev_to, DLDataType type_hint, TVMStreamHandle stream) final { - if (static_cast(ctx_from.device_type) == kDLWebGPU && - static_cast(ctx_to.device_type) == kDLWebGPU) { - CHECK_EQ(ctx_from.device_id, ctx_to.device_id); + if (static_cast(dev_from.device_type) == kDLWebGPU && + static_cast(dev_to.device_type) == kDLWebGPU) { + CHECK_EQ(dev_from.device_id, dev_to.device_id); copy_within_gpu_(const_cast(from), from_offset, to, to_offset, size); - } else if (static_cast(ctx_from.device_type) == kDLWebGPU && - ctx_to.device_type == kDLCPU) { + } else if (static_cast(dev_from.device_type) == kDLWebGPU && + dev_to.device_type == kDLCPU) { void* to_ptr = static_cast(to) + to_offset; copy_from_gpu_(const_cast(from), from_offset, to_ptr, size); - } else if (ctx_from.device_type == kDLCPU && - static_cast(ctx_to.device_type) == kDLWebGPU) { + } else if (dev_from.device_type == kDLCPU && + static_cast(dev_to.device_type) == kDLWebGPU) { void* from_ptr = static_cast(const_cast(from)) + from_offset; copy_to_gpu_(from_ptr, to, to_offset, size); } else { @@ -117,34 +116,34 @@ class WebGPUDeviceAPI : public DeviceAPI { } public: - TVMStreamHandle CreateStream(TVMContext ctx) final { + TVMStreamHandle CreateStream(Device dev) final { 
LOG(FATAL) << "Not implemented"; return nullptr; } - void FreeStream(TVMContext ctx, TVMStreamHandle stream) final { + void FreeStream(Device dev, TVMStreamHandle stream) final { LOG(FATAL) << "Not implemented"; return; } - void SyncStreamFromTo(TVMContext ctx, TVMStreamHandle event_src, TVMStreamHandle event_dst) { + void SyncStreamFromTo(Device dev, TVMStreamHandle event_src, TVMStreamHandle event_dst) { LOG(FATAL) << "Not implemented"; return; } - void StreamSync(TVMContext ctx, TVMStreamHandle stream) final { LOG(FATAL) << "Not implemented"; } + void StreamSync(Device dev, TVMStreamHandle stream) final { LOG(FATAL) << "Not implemented"; } - void SetStream(TVMContext ctx, TVMStreamHandle stream) final { + void SetStream(Device dev, TVMStreamHandle stream) final { LOG(FATAL) << "Not implemented"; return; } - void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final { - return WebGPUThreadEntry::ThreadLocal()->pool.AllocWorkspace(ctx, size); + void* AllocWorkspace(Device dev, size_t size, DLDataType type_hint) final { + return WebGPUThreadEntry::ThreadLocal()->pool.AllocWorkspace(dev, size); } - void FreeWorkspace(TVMContext ctx, void* data) final { - WebGPUThreadEntry::ThreadLocal()->pool.FreeWorkspace(ctx, data); + void FreeWorkspace(Device dev, void* data) final { + WebGPUThreadEntry::ThreadLocal()->pool.FreeWorkspace(dev, data); } static WebGPUDeviceAPI* Global() { diff --git a/web/src/ctypes.ts b/web/src/ctypes.ts index 66c46fe7ed91a..4a6d25ae62706 100644 --- a/web/src/ctypes.ts +++ b/web/src/ctypes.ts @@ -204,7 +204,7 @@ export const enum SizeOf { F64 = 8, TVMValue = 8, DLDataType = I32, - DLContext = I32 + I32, + DLDevice = I32 + I32, } /** @@ -217,7 +217,7 @@ export const enum ArgTypeCode { TVMOpaqueHandle = 3, Null = 4, TVMDataType = 5, - TVMContext = 6, + DLDevice = 6, TVMDLTensorHandle = 7, TVMObjectHandle = 8, TVMModuleHandle = 9, diff --git a/web/src/index.ts b/web/src/index.ts index 2d99fc9106ccc..ed84ce7fbea12 100644 --- a/web/src/index.ts +++ b/web/src/index.ts @@ -18,7 +18,7 @@ */ export { - Scalar, DLContext, DLDataType, + Scalar, DLDevice, DLDataType, PackedFunc, Module, NDArray, Instance, instantiate } from "./runtime"; diff --git a/web/src/runtime.ts b/web/src/runtime.ts index 80e7d71f06ad3..968dd9acbb5be 100644 --- a/web/src/runtime.ts +++ b/web/src/runtime.ts @@ -194,8 +194,8 @@ const DeviceStrToEnum: Record = { /** * Represent a runtime context where a NDArray can reside. */ -export class DLContext { - /** The device type code of the context. */ +export class DLDevice { + /** The device type code of the device. */ deviceType: number; /** The device index. */ deviceId: number; @@ -219,7 +219,7 @@ export class DLContext { } /** - * Synchronize the context + * Synchronize the device */ async sync(): Promise { if (this.deviceType == DeviceStrToEnum.webgpu) { @@ -294,8 +294,8 @@ export class NDArray implements Disposable { dtype: string; /** Shape of the array. */ shape: Array; - /** Context of the array. */ - context: DLContext; + /** Device of the array. */ + device: DLDevice; /** Whether it is a temporary view that can become invalid after the call. 
*/ private isView: boolean; private byteOffset: number; @@ -319,7 +319,7 @@ export class NDArray implements Disposable { const arrayOffsetContext = arrayOffsetData + this.lib.sizeofPtr(); const arrayOffsetDevType = arrayOffsetContext; const arrayOffsetDevId = arrayOffsetContext + SizeOf.I32; - const arrayOffsetNdim = arrayOffsetContext + SizeOf.DLContext; + const arrayOffsetNdim = arrayOffsetContext + SizeOf.DLDevice; const arrayOffsetDtype = arrayOffsetNdim + SizeOf.I32; const arrayOffsetDtypeCode = arrayOffsetDtype; const arrayOffsetDtypeBits = arrayOffsetDtype + SizeOf.U8; @@ -344,10 +344,10 @@ export class NDArray implements Disposable { this.dlDataType = new DLDataType(code, bits, lanes); this.dtype = this.dlDataType.toString(); - // ctx + // device const deviceType = lib.memory.loadI32(this.dltensor + arrayOffsetDevType); const deviceId = lib.memory.loadI32(this.dltensor + arrayOffsetDevId); - this.context = new DLContext(deviceType, deviceId, lib); + this.device = new DLDevice(deviceType, deviceId, lib); // byte_offset this.byteOffset = lib.memory.loadI64(this.dltensor + arrayOffsetByteOffset); @@ -442,7 +442,7 @@ export class NDArray implements Disposable { * @returns The result array. */ toRawBytes(): Uint8Array { - if (this.context.deviceType != DeviceStrToEnum.cpu) { + if (this.device.deviceType != DeviceStrToEnum.cpu) { throw new Error("Can only synchronize copy for GPU array, use copyfrom instead."); } const size = this.shape.reduce((a, b) => { @@ -648,22 +648,22 @@ class GraphRuntime implements Disposable { /** * Benchmark stable execution of the graph(without data copy). - * @params ctx The context to sync during each run. + * @params dev The device to sync during each run. * @number The number of times to compute the average. * @repeat The number of times to repeat the run. */ - async benchmarkRuns(ctx: DLContext, number=10, repeat=4): Promise { + async benchmarkRuns(dev: DLDevice, number=10, repeat=4): Promise { // Skip first run as it can involve GPU warmup and module loading time. const perf = compact.getPeformance(); const results = []; this.run(); - await ctx.sync(); + await dev.sync(); for (let k = 0; k < repeat; ++k) { const tstart = perf.now(); for (let i = 0; i < number; ++i) { this.run(); } - await ctx.sync(); + await dev.sync(); const tend = perf.now(); results.push((tend - tstart) / number); } @@ -917,29 +917,29 @@ export class Instance implements Disposable { } /** - * Create a new {@link DLContext} + * Create a new {@link DLDevice} * @param deviceType The device type. * @param deviceId The device index. - * @returns The created context. + * @returns The created device. */ - context(deviceType: number | string, deviceId = 0): DLContext { - return new DLContext(deviceType, deviceId, this.lib); + device(deviceType: number | string, deviceId = 0): DLDevice { + return new DLDevice(deviceType, deviceId, this.lib); } /** - * Create a new cpu {@link DLContext} + * Create a new cpu {@link DLDevice} * @param deviceId The device index. */ - cpu(deviceId = 0): DLContext { - return this.context("cpu", deviceId); + cpu(deviceId = 0): DLDevice { + return this.device("cpu", deviceId); } /** - * Create a new webgpu {@link DLContext} + * Create a new webgpu {@link DLDevice} * @param deviceId The device index. */ - webgpu(deviceId = 0): DLContext { - return this.context("webgpu", deviceId); + webgpu(deviceId = 0): DLDevice { + return this.device("webgpu", deviceId); } /** @@ -947,13 +947,13 @@ export class Instance implements Disposable { * * @param shape The shape of the array. 
* @param dtype The data type of the array. - * @param ctx The context of the ndarray. + * @param dev The device of the ndarray. * @returns The created ndarray. */ empty( shape: Array | number, dtype: string | DLDataType = "float32", - ctx: DLContext = this.context("cpu", 0) + dev: DLDevice = this.device("cpu", 0) ): NDArray { dtype = this.toDLDataType(dtype); shape = typeof shape == "number" ? [shape] : shape; @@ -975,8 +975,8 @@ export class Instance implements Disposable { dtype.code, dtype.bits, dtype.lanes, - ctx.deviceType, - ctx.deviceId, + dev.deviceType, + dev.deviceId, outPtr ) ); @@ -990,19 +990,19 @@ export class Instance implements Disposable { * * @param graphJson The graph runtime json file. * @param lib The underlying library. - * @param ctx The execution context of the graph. + * @param dev The execution device of the graph. */ createGraphRuntime( graphJson: string, lib: Module, - ctx: DLContext + dev: DLDevice ): GraphRuntime { const fcreate = this.getGlobalFunc("tvm.graph_runtime.create"); const module = fcreate( graphJson, lib, - this.scalar(ctx.deviceType, "int32"), - this.scalar(ctx.deviceId, "int32")) as Module; + this.scalar(dev.deviceType, "int32"), + this.scalar(dev.deviceId, "int32")) as Module; return new GraphRuntime(module); } @@ -1059,13 +1059,13 @@ export class Instance implements Disposable { // Helper function to time the finvoke const timeExecution = async ( finvoke: PackedFunc, - ctx: DLContext, + dev: DLDevice, nstep: number, repeat: number, minRepeatMs: number ): Promise => { finvoke(this.scalar(1, "int32")); - await ctx.sync(); + await dev.sync(); const result = []; let setupNumber: number = nstep; @@ -1079,7 +1079,7 @@ export class Instance implements Disposable { } const tstart: number = perf.now(); finvoke(this.scalar(setupNumber, "int32")); - await ctx.sync(); + await dev.sync(); const tend: number = perf.now(); durationMs = tend - tstart; @@ -1162,10 +1162,10 @@ export class Instance implements Disposable { stack.storePtr(valueOffset, val.value); stack.storeI32(codeOffset, ArgTypeCode.TVMOpaqueHandle); } - } else if (val instanceof DLContext) { + } else if (val instanceof DLDevice) { stack.storeI32(valueOffset, val.deviceType); stack.storeI32(valueOffset + SizeOf.I32, val.deviceType); - stack.storeI32(codeOffset, ArgTypeCode.TVMContext); + stack.storeI32(codeOffset, ArgTypeCode.DLDevice); } else if (tp == "number") { stack.storeF64(valueOffset, val); stack.storeI32(codeOffset, ArgTypeCode.Float); @@ -1328,10 +1328,10 @@ export class Instance implements Disposable { ); } case ArgTypeCode.Null: return undefined; - case ArgTypeCode.TVMContext: { + case ArgTypeCode.DLDevice: { const deviceType = this.memory.loadI32(rvaluePtr); const deviceId = this.memory.loadI32(rvaluePtr + SizeOf.I32); - return this.context(deviceType, deviceId); + return this.device(deviceType, deviceId); } case ArgTypeCode.TVMStr: { const ret = this.memory.loadCString(this.memory.loadPointer(rvaluePtr)); diff --git a/web/tests/node/test_ndarray.js b/web/tests/node/test_ndarray.js index eb0a8f446d4c0..9e50557e2a13e 100644 --- a/web/tests/node/test_ndarray.js +++ b/web/tests/node/test_ndarray.js @@ -36,7 +36,7 @@ function testArrayCopy(dtype, arrayType) { let data = [1, 2, 3, 4, 5, 6]; let a = tvm.empty([2, 3], dtype).copyFrom(data); - assert(a.context.toString() == "cpu(0)"); + assert(a.device.toString() == "cpu(0)"); assert(a.shape[0] == 2 && a.shape[1] == 3); let ret = a.toArray(); diff --git a/web/tests/python/webgpu_rpc_test.py b/web/tests/python/webgpu_rpc_test.py index 
5efc85cf5e324..80e358b564982 100644 --- a/web/tests/python/webgpu_rpc_test.py +++ b/web/tests/python/webgpu_rpc_test.py @@ -65,10 +65,10 @@ def test_rpc(): def check(remote): # basic function checks. - ctx = remote.webgpu(0) + dev = remote.webgpu(0) adata = np.random.uniform(size=n).astype(A.dtype) - a = tvm.nd.array(adata, ctx) - b = tvm.nd.array(np.zeros(n, dtype=A.dtype), ctx) + a = tvm.nd.array(adata, dev) + b = tvm.nd.array(np.zeros(n, dtype=A.dtype), dev) np.testing.assert_equal(a.asnumpy(), adata) f1 = remote.system_lib() diff --git a/web/tests/python/websock_rpc_test.py b/web/tests/python/websock_rpc_test.py index 48603e86b7f79..9c1876a182e35 100644 --- a/web/tests/python/websock_rpc_test.py +++ b/web/tests/python/websock_rpc_test.py @@ -70,15 +70,15 @@ def check(remote): # run the generated library. f1 = remote.system_lib() - ctx = remote.cpu(0) - a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) + dev = remote.cpu(0) + a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev) # invoke the function addone = f1.get_function("addone") addone(a, b) # time evaluator - time_f = f1.time_evaluator("addone", ctx, number=100, repeat=10) + time_f = f1.time_evaluator("addone", dev, number=100, repeat=10) time_f(a, b) cost = time_f(a, b).mean print("%g secs/op" % cost) From f4490708a559930c6554eec26c377d49bfb46d84 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Fri, 26 Mar 2021 04:57:27 -0700 Subject: [PATCH 72/84] [Bugfix] Fix usages of logging-related macros (#7748) --- apps/android_camera/app/src/main/jni/Application.mk | 2 +- apps/android_camera/app/src/main/jni/tvm_runtime.h | 2 +- apps/android_deploy/app/src/main/jni/Application.mk | 2 +- apps/android_deploy/app/src/main/jni/tvm_runtime.h | 2 +- apps/android_rpc/app/src/main/jni/Application.mk | 2 +- apps/android_rpc/app/src/main/jni/tvm_runtime.h | 2 +- apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj | 4 ++-- src/runtime/object.cc | 2 +- tests/lint/check_file_type.py | 2 -- web/emcc/tvmjs_support.cc | 9 +++++++-- web/emcc/wasm_runtime.cc | 9 +++++++-- web/emcc/webgpu_runtime.cc | 5 ++++- 12 files changed, 27 insertions(+), 16 deletions(-) diff --git a/apps/android_camera/app/src/main/jni/Application.mk b/apps/android_camera/app/src/main/jni/Application.mk index 5c8774889685e..b6124a86df873 100644 --- a/apps/android_camera/app/src/main/jni/Application.mk +++ b/apps/android_camera/app/src/main/jni/Application.mk @@ -31,7 +31,7 @@ include $(config) APP_ABI ?= all APP_STL := c++_shared -APP_CPPFLAGS += -DTVM4J_ANDROID=1 -std=c++14 -Oz -frtti +APP_CPPFLAGS += -DDMLC_LOG_STACK_TRACE=0 -DTVM4J_ANDROID=1 -std=c++14 -Oz -frtti ifeq ($(USE_OPENCL), 1) APP_CPPFLAGS += -DTVM_OPENCL_RUNTIME=1 endif diff --git a/apps/android_camera/app/src/main/jni/tvm_runtime.h b/apps/android_camera/app/src/main/jni/tvm_runtime.h index 47a3a3de6bba3..1fd19b29e20bf 100644 --- a/apps/android_camera/app/src/main/jni/tvm_runtime.h +++ b/apps/android_camera/app/src/main/jni/tvm_runtime.h @@ -26,7 +26,7 @@ #include #define DMLC_USE_LOGGING_LIBRARY -#define TVM_BACKTRACE_DISABLED 1 +#define TVM_USE_LIBBACKTRACE 0 /* Enable custom logging - this will cause TVM to use a custom implementation * of tvm::runtime::detail::LogMessage. We use this to pass TVM log messages to * Android logcat. 
diff --git a/apps/android_deploy/app/src/main/jni/Application.mk b/apps/android_deploy/app/src/main/jni/Application.mk index 42c4f232a5536..a50a40bf5cd19 100644 --- a/apps/android_deploy/app/src/main/jni/Application.mk +++ b/apps/android_deploy/app/src/main/jni/Application.mk @@ -27,7 +27,7 @@ include $(config) APP_STL := c++_static -APP_CPPFLAGS += -DTVM4J_ANDROID=1 -std=c++14 -Oz -frtti +APP_CPPFLAGS += -DDMLC_LOG_STACK_TRACE=0 -DTVM4J_ANDROID=1 -std=c++14 -Oz -frtti ifeq ($(USE_OPENCL), 1) APP_CPPFLAGS += -DTVM_OPENCL_RUNTIME=1 endif diff --git a/apps/android_deploy/app/src/main/jni/tvm_runtime.h b/apps/android_deploy/app/src/main/jni/tvm_runtime.h index 4412e9c62e9d5..6fe8c98756bb8 100644 --- a/apps/android_deploy/app/src/main/jni/tvm_runtime.h +++ b/apps/android_deploy/app/src/main/jni/tvm_runtime.h @@ -26,7 +26,7 @@ #include #define DMLC_USE_LOGGING_LIBRARY -#define TVM_BACKTRACE_DISABLED 1 +#define TVM_USE_LIBBACKTRACE 0 #include "../src/runtime/c_runtime_api.cc" #include "../src/runtime/cpu_device_api.cc" diff --git a/apps/android_rpc/app/src/main/jni/Application.mk b/apps/android_rpc/app/src/main/jni/Application.mk index 088eeed750b89..5f885f1c6f144 100644 --- a/apps/android_rpc/app/src/main/jni/Application.mk +++ b/apps/android_rpc/app/src/main/jni/Application.mk @@ -31,7 +31,7 @@ include $(config) APP_ABI ?= armeabi-v7a arm64-v8a x86 x86_64 mips APP_STL := c++_shared -APP_CPPFLAGS += -DTVM4J_ANDROID=1 -std=c++14 -Oz -frtti +APP_CPPFLAGS += -DDMLC_LOG_STACK_TRACE=0 -DTVM4J_ANDROID=1 -std=c++14 -Oz -frtti ifeq ($(USE_OPENCL), 1) APP_CPPFLAGS += -DTVM_OPENCL_RUNTIME=1 endif diff --git a/apps/android_rpc/app/src/main/jni/tvm_runtime.h b/apps/android_rpc/app/src/main/jni/tvm_runtime.h index 40e6279fb386d..0745c30ffb1d4 100644 --- a/apps/android_rpc/app/src/main/jni/tvm_runtime.h +++ b/apps/android_rpc/app/src/main/jni/tvm_runtime.h @@ -26,7 +26,7 @@ #include #define DMLC_USE_LOGGING_LIBRARY -#define TVM_BACKTRACE_DISABLED 1 +#define TVM_USE_LIBBACKTRACE 0 /* Enable custom logging - this will cause TVM to use a custom implementation * of tvm::runtime::detail::LogMessage. We use this to pass TVM log messages to * Android logcat. diff --git a/apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj b/apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj index 28079e710a386..6f00c03a26d74 100644 --- a/apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj +++ b/apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj @@ -350,7 +350,7 @@ "DEBUG=1", "$(inherited)", "DMLC_USE_LOGGING_LIBRARY=", - "TVM_BACKTRACE_DISABLED=1", + "TVM_USE_LIBBACKTRACE=0", ); GCC_WARN_64_TO_32_BIT_CONVERSION = YES; GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; @@ -397,7 +397,7 @@ GCC_NO_COMMON_BLOCKS = YES; GCC_PREPROCESSOR_DEFINITIONS = ( "DMLC_USE_LOGGING_LIBRARY=", - "TVM_BACKTRACE_DISABLED=1", + "TVM_USE_LIBBACKTRACE=0", ); GCC_WARN_64_TO_32_BIT_CONVERSION = YES; GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; diff --git a/src/runtime/object.cc b/src/runtime/object.cc index c9a9669671e62..23594b5d7d8af 100644 --- a/src/runtime/object.cc +++ b/src/runtime/object.cc @@ -88,7 +88,7 @@ class TypeContext { } // try to allocate from parent's type table. 
ICHECK_LT(parent_tindex, type_table_.size()) - << " skey= " << skey << "static_index=" << static_tindex; + << " skey=" << skey << ", static_index=" << static_tindex; TypeInfo& pinfo = type_table_[parent_tindex]; ICHECK_EQ(pinfo.index, parent_tindex); diff --git a/tests/lint/check_file_type.py b/tests/lint/check_file_type.py index f5c0de0a50b02..ab51b6c79c83b 100644 --- a/tests/lint/check_file_type.py +++ b/tests/lint/check_file_type.py @@ -131,8 +131,6 @@ # microTVM Virtual Machines "apps/microtvm/reference-vm/zephyr/Vagrantfile", "apps/microtvm/reference-vm/zephyr/base-box/Vagrantfile.packer-template", - # patch file for libbacktrace - "cmake/modules/libbacktrace_macos.patch", } diff --git a/web/emcc/tvmjs_support.cc b/web/emcc/tvmjs_support.cc index 101f53ac64a0a..6f4d2e5932daf 100644 --- a/web/emcc/tvmjs_support.cc +++ b/web/emcc/tvmjs_support.cc @@ -24,10 +24,15 @@ * We do not need to link this file in standalone wasm. */ -// configurations for the dmlc log. +// configurations for tvm logging +#define DMLC_LOG_CUSTOMIZE 0 +#define DMLC_LOG_STACK_TRACE 0 +#define DMLC_LOG_DEBUG 0 +#define DMLC_LOG_NODATE 1 +#define DMLC_LOG_FATAL_THROW 0 #define TVM_LOG_DEBUG 0 #define DMLC_USE_LOGGING_LIBRARY -#define TVM_BACKTRACE_DISABLED 1 +#define TVM_USE_LIBBACKTRACE 0 #include #include diff --git a/web/emcc/wasm_runtime.cc b/web/emcc/wasm_runtime.cc index 0b14ef6476d20..287604e110a9b 100644 --- a/web/emcc/wasm_runtime.cc +++ b/web/emcc/wasm_runtime.cc @@ -22,10 +22,15 @@ * \brief TVM wasm runtime library pack. */ -// configurations for the dmlc log. +// configurations for tvm logging +#define DMLC_LOG_CUSTOMIZE 0 +#define DMLC_LOG_STACK_TRACE 0 +#define DMLC_LOG_DEBUG 0 +#define DMLC_LOG_NODATE 1 +#define DMLC_LOG_FATAL_THROW 0 #define TVM_LOG_DEBUG 0 #define DMLC_USE_LOGGING_LIBRARY -#define TVM_BACKTRACE_DISABLED 1 +#define TVM_USE_LIBBACKTRACE 0 #include #include diff --git a/web/emcc/webgpu_runtime.cc b/web/emcc/webgpu_runtime.cc index e98ca5e852197..3dc8be251f99c 100644 --- a/web/emcc/webgpu_runtime.cc +++ b/web/emcc/webgpu_runtime.cc @@ -22,10 +22,13 @@ * \brief WebGPU runtime based on the TVM JS. */ -// configurations for tvm logging. +// configurations for tvm logging #define TVM_LOG_DEBUG 0 #define DMLC_USE_LOGGING_LIBRARY #define TVM_BACKTRACE_DISABLED 1 +#define TVM_LOG_DEBUG 0 +#define DMLC_USE_LOGGING_LIBRARY +#define TVM_USE_LIBBACKTRACE 0 #include #include From f74dac76b2fc68b2c6f844fd2a0277629a92b1d7 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Fri, 26 Mar 2021 10:00:38 -0700 Subject: [PATCH 73/84] Bump ci-cpu and ci-arm container versions. (#7745) --- Jenkinsfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 8f11bbaa568d2..f7fc6e4e21787 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -46,11 +46,11 @@ // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. --> ci_lint = "tlcpack/ci-lint:v0.62" ci_gpu = "tlcpack/ci-gpu:v0.72" -ci_cpu = "tlcpack/ci-cpu:v0.72-t0" +ci_cpu = "tlcpack/ci-cpu:v0.73" ci_wasm = "tlcpack/ci-wasm:v0.70" ci_i386 = "tlcpack/ci-i386:v0.72-t0" ci_qemu = "tlcpack/ci-qemu:v0.02" -ci_arm = "tlcpack/ci-arm:v0.02" +ci_arm = "tlcpack/ci-arm:v0.03" // <--- End of regex-scanned config. 
// tvm libraries From 14f829a948498467f41a3e69eaaf88b8d347b3a9 Mon Sep 17 00:00:00 2001 From: Chris Hoge Date: Fri, 26 Mar 2021 15:53:20 -0700 Subject: [PATCH 74/84] [docs] Getting Started with TVM: AutoTVM and Matrix Multiply (#7643) * [docs] Getting Started with TVM: AutoTVM and Matrix Multiply This patch moves the matrix multiplication example tuning with AutoTVM to the tutorial directory, and expands on the content. This follows and builds on the section on TE. * Applying lint style * Fix license * Apply suggestions from code review Co-authored-by: Siyuan Feng * Change comparison tolerance to smaller value Co-authored-by: Siyuan Feng --- docs/conf.py | 1 + tutorials/autotvm/tune_simple_template.py | 336 ------------------- tutorials/get_started/autotvm_matmul.py | 376 ++++++++++++++++++++++ 3 files changed, 377 insertions(+), 336 deletions(-) delete mode 100644 tutorials/autotvm/tune_simple_template.py create mode 100644 tutorials/get_started/autotvm_matmul.py diff --git a/docs/conf.py b/docs/conf.py index c9c68706998bd..5ed244e585d0e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -212,6 +212,7 @@ "get_started": [ "tvmc_command_line_driver.py", "tensor_expr_get_started.py", + "autotvm_matmul.py", "autoschedule_matmul.py", "cross_compilation_and_rpc.py", "relay_quick_start.py", diff --git a/tutorials/autotvm/tune_simple_template.py b/tutorials/autotvm/tune_simple_template.py deleted file mode 100644 index bd2dcf3cfd1e3..0000000000000 --- a/tutorials/autotvm/tune_simple_template.py +++ /dev/null @@ -1,336 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Writing Tunable Templates and Using the Auto-tuner -================================================== -**Author**: `Lianmin Zheng `_ - -This is an introduction tutorial to the auto-tuning module in TVM. - -There are two steps in auto-tuning. -The first step is defining a search space. -The second step is running a search algorithm to explore through this space. -In this tutorial, you can learn how to perform these two steps in TVM. -The whole workflow is illustrated by a matrix multiplication example. - -Note that this tutorial will not run on Windows or recent versions of macOS. To -get it to run, you will need to wrap the body of this tutorial in a :code:`if -__name__ == "__main__":` block. -""" - -###################################################################### -# Install dependencies -# -------------------- -# To use autotvm package in TVM, we need to install some extra dependencies. -# This step (installing xgboost) can be skipped as it doesn't need XGBoost -# (change "3" to "2" if you use python2): -# -# .. code-block:: bash -# -# pip3 install --user psutil xgboost cloudpickle -# -# To make TVM run faster in tuning, it is recommended to use cython -# as FFI of TVM. 
In the root directory of TVM, execute -# (change "3" to "2" if you use python2): -# -# .. code-block:: bash -# -# pip3 install --user cython -# sudo make cython3 -# -# Now return to python code. Import packages. - -import logging -import sys - -import numpy as np -import tvm -from tvm import te -import tvm.testing - -# the module is called `autotvm` -from tvm import autotvm - -###################################################################### -# Step 1: Define the search space -# -------------------------------- -# In this section, we will rewrite a deterministic TVM schedule code to a -# tunable schedule template. You can regard the process of search space definition -# as the parameterization of our existing schedule code. -# -# To begin with, here is how we implement a blocked matrix multiplication in TVM. - -# Matmul V0: Constant tiling factor -def matmul_v0(N, L, M, dtype): - A = te.placeholder((N, L), name="A", dtype=dtype) - B = te.placeholder((L, M), name="B", dtype=dtype) - - k = te.reduce_axis((0, L), name="k") - C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C") - s = te.create_schedule(C.op) - - # schedule - y, x = s[C].op.axis - k = s[C].op.reduce_axis[0] - - yo, yi = s[C].split(y, 8) - xo, xi = s[C].split(x, 8) - - s[C].reorder(yo, xo, k, yi, xi) - - return s, [A, B, C] - - -##################################################################### -# Parametrize the schedule -# ^^^^^^^^^^^^^^^^^^^^^^^^ -# In the previous schedule code, we use a constant "8" as tiling factor. -# However, it might not be the best one because the best tiling factor depends -# on real hardware environment and input shape. -# -# If you want the schedule code to be portable across a wider range of input shapes -# and target hardware, it is better to define a set of candidate values and -# pick the best one according to the measurement results on target hardware. -# -# In autotvm, we can define a tunable parameter, or a "knob" for such kind of value. - -# Matmul V1: List candidate values -@autotvm.template("tutorial/matmul_v1") # 1. use a decorator -def matmul_v1(N, L, M, dtype): - A = te.placeholder((N, L), name="A", dtype=dtype) - B = te.placeholder((L, M), name="B", dtype=dtype) - - k = te.reduce_axis((0, L), name="k") - C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C") - s = te.create_schedule(C.op) - - # schedule - y, x = s[C].op.axis - k = s[C].op.reduce_axis[0] - - # 2. get the config object - cfg = autotvm.get_config() - - # 3. define search space - cfg.define_knob("tile_y", [1, 2, 4, 8, 16]) - cfg.define_knob("tile_x", [1, 2, 4, 8, 16]) - - # 4. schedule according to config - yo, yi = s[C].split(y, cfg["tile_y"].val) - xo, xi = s[C].split(x, cfg["tile_x"].val) - - s[C].reorder(yo, xo, k, yi, xi) - - return s, [A, B, C] - - -############################################################################### -# Here we make four modifications to the previous schedule code and get -# a tunable "template". We can explain the modifications one by one. -# -# 1. Use a decorator to mark this function as a simple template. -# 2. Get a config object: -# You can regard this :code:`cfg` as an argument of this function but -# we obtain it in a different way. With this argument, this function is no longer -# a deterministic schedule code. Instead, we can pass different configurations to -# this function and get different schedules, so this function is a "template". 
-# -# To make the template function more compact, we do two things in a single function. -# (1) define a search space and (2) schedule according to an entity in this space. -# To achieve this, we make :code:`cfg` be either -# a :any:`ConfigSpace` or a :any:`ConfigEntity` object. -# -# When it is a :any:`ConfigSpace`, it will collect all tunable knobs in this function and -# build the search space. -# When it is a :any:`ConfigEntity`, it will ignore all space definition API -# (namely, :code:`cfg.define_XXXXX(...)`). Instead, it stores deterministic values for -# all tunable knobs, and we schedule according to these values. -# -# During auto-tuning, we will first call this template with a :any:`ConfigSpace` -# object to build the search space. Then we call this template with different :any:`ConfigEntity` -# in the built space to get different schedules. Finally we will measure the code generated by -# different schedules and pick the best one. -# -# 3. Define two tunable knobs. The first one is :code:`tile_y` with -# 5 possible values. The second one is :code:`tile_x` with a same -# list of possible values. These two knobs are independent, so they -# span a search space with size = 5x5 = 25 -# 4. Schedule according to the deterministic values in :code:`cfg` -# - -##################################################################### -# Use better space definition API -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# In the previous template, we manually list all possible values for a knob. -# This is the lowest level API to define the space. -# However, we also provide another set of API to make the space definition -# easier and smarter. It is recommended to use this set of high level API. -# -# In the following example, we use :any:`ConfigSpace.define_split` to define a split -# knob. It will enumerate all the possible ways to split an axis and construct -# the space. -# -# We also have :any:`ConfigSpace.define_reorder` for reorder knob and -# :any:`ConfigSpace.define_annotate` for annotation like unroll, vectorization, -# thread binding. -# When the high level API cannot meet your requirement, you can always fall -# back to use low level API. - - -@autotvm.template("tutorial/matmul") -def matmul(N, L, M, dtype): - A = te.placeholder((N, L), name="A", dtype=dtype) - B = te.placeholder((L, M), name="B", dtype=dtype) - - k = te.reduce_axis((0, L), name="k") - C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C") - s = te.create_schedule(C.op) - - # schedule - y, x = s[C].op.axis - k = s[C].op.reduce_axis[0] - - ##### define space begin ##### - cfg = autotvm.get_config() - cfg.define_split("tile_y", y, num_outputs=2) - cfg.define_split("tile_x", x, num_outputs=2) - ##### define space end ##### - - # schedule according to config - yo, yi = cfg["tile_y"].apply(s, C, y) - xo, xi = cfg["tile_x"].apply(s, C, x) - - s[C].reorder(yo, xo, k, yi, xi) - - return s, [A, B, C] - - -###################################################################### -# .. note:: More Explanation on :code:`cfg.defile_split` -# -# In this template, :code:`cfg.define_split("tile_y", y, num_outputs=2)` will enumerate -# all possible combinations that can split axis y into two axes with factors of the length of y. -# For example, if the length of y is 32 and we want to split it into two axes -# using factors of 32, then there are 6 possible values for -# (length of outer axis, length of inner axis) pair, namely -# (32, 1), (16, 2), (8, 4), (4, 8), (2, 16) or (1, 32). 
-# They are just the 6 possible values of `tile_y`. -# -# During schedule, :code:`cfg["tile_y"]` is a :code:`SplitEntity` object. -# We stores the lengths of outer axes and inner axes in :code:`cfg['tile_y'].size` -# (a tuple with two elements). -# In this template, we apply it by using :code:`yo, yi = cfg['tile_y'].apply(s, C, y)`. -# Actually, this is equivalent to -# :code:`yo, yi = s[C].split(y, cfg["tile_y"].size[1])` -# or :code:`yo, yi = s[C].split(y, nparts=cfg['tile_y"].size[0])` -# -# The advantage of using cfg.apply API is that it makes multi-level split -# (when num_outputs >= 3) easier. - -###################################################################### -# Step 2: Search through the space -# --------------------------------- -# In step 1, we build the search space by extending our old schedule code -# into a template. The next step is to pick a tuner and explore in this space. -# -# Auto-tuners in TVM -# ^^^^^^^^^^^^^^^^^^ -# The job for a tuner can be described by following pseudo code -# -# .. code-block:: c -# -# ct = 0 -# while ct < max_number_of_trials: -# propose a batch of configs -# measure this batch of configs on real hardware and get results -# ct += batch_size -# -# When proposing the next batch of configs, the tuner can take different strategies. We -# provide four tuners with different strategies in autotvm. -# -# * :any:`RandomTuner`: Enumerate the space in a random order -# * :any:`GridSearchTuner`: Enumerate the space in a grid search order -# * :any:`GATuner`: Using genetic algorithm to search through the space -# * :any:`XGBTuner`: Uses a model based method. Train a XGBoost model to predict the speed of lowered IR and pick the next batch according to the prediction. -# -# You can choose the tuner according to the size of your space, your time budget and other factors. -# For example, if your space is very small (less than 1000), a gridsearch tuner or a -# random tuner is good enough. If your space is at the level of 10^9 (this is the space -# size of a conv2d operator on CUDA GPU), XGBoostTuner can explore more efficiently -# and find better configs. - -################################################################ -# Begin tuning -# ^^^^^^^^^^^^ -# Here we continue our matrix multiplication example. -# First we should create a tuning task. -# We can also inspect the initialized search space. -# In this case, for a 512x512 square matrix multiplication, the space size -# is 10x10=100 -N, L, M = 512, 512, 512 -task = autotvm.task.create("tutorial/matmul", args=(N, L, M, "float32"), target="llvm") -print(task.config_space) - -################################################################ -# Then we need to define how to measure the generated code and pick a tuner. -# Since our space is small, a random tuner is just okay. -# -# We only make 10 trials in this tutorial for demonstration. In practice, -# you can do more trials according to your time budget. -# We will log the tuning results into a log file. This file can be -# used to get the best config later. - -# logging config (for printing tuning log to the screen) -logging.getLogger("autotvm").setLevel(logging.DEBUG) -logging.getLogger("autotvm").addHandler(logging.StreamHandler(sys.stdout)) - -# There are two steps for measuring a config: build and run. -# By default, we use all CPU cores to compile program. Then measure them sequentially. -# We measure 5 times and take average to reduce variance. 
-measure_option = autotvm.measure_option(builder="local", runner=autotvm.LocalRunner(number=5))
-
-# Begin tuning with RandomTuner, log records to file `matmul.log`
-# You can use alternatives like XGBTuner.
-tuner = autotvm.tuner.RandomTuner(task)
-tuner.tune(
-    n_trial=10,
-    measure_option=measure_option,
-    callbacks=[autotvm.callback.log_to_file("matmul.log")],
-)
-
-#########################################################################
-# Finally we apply history best from the cache file and check its correctness.
-# We can call the function :code:`matmul` directly under the
-# :any:`autotvm.apply_history_best` context. When we call this function,
-# it will query the dispatch context with its argument and get the best config
-# with the same argument.
-
-# apply history best from log file
-with autotvm.apply_history_best("matmul.log"):
-    with tvm.target.Target("llvm"):
-        s, arg_bufs = matmul(N, L, M, "float32")
-        func = tvm.build(s, arg_bufs)
-
-# check correctness
-a_np = np.random.uniform(size=(N, L)).astype(np.float32)
-b_np = np.random.uniform(size=(L, M)).astype(np.float32)
-c_np = a_np.dot(b_np)
-
-c_tvm = tvm.nd.empty(c_np.shape)
-func(tvm.nd.array(a_np), tvm.nd.array(b_np), c_tvm)
-
-tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)
diff --git a/tutorials/get_started/autotvm_matmul.py b/tutorials/get_started/autotvm_matmul.py
new file mode 100644
index 0000000000000..930e003fe450c
--- /dev/null
+++ b/tutorials/get_started/autotvm_matmul.py
@@ -0,0 +1,376 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Optimizing Operators with Templates and AutoTVM
+===============================================
+**Authors**:
+`Lianmin Zheng `_,
+`Chris Hoge `_
+
+In this tutorial, we will now show how the TVM Tensor Expression (TE) language
+can be used to write scheduling templates that can be searched by AutoTVM to
+find optimal configurations of scheduling variables. This process is called
+Auto-Tuning, and builds on TE to help automate the process of optimizing
+operations.
+
+This tutorial builds on the previous `tutorial on how to write a matrix
+multiplication using TE `.
+
+There are two steps in auto-tuning.
+
+- The first step is defining a search space.
+- The second step is running a search algorithm to explore through this space.
+
+In this tutorial, you can learn how to perform these two steps in TVM. The whole
+workflow is illustrated by a matrix multiplication example.
+
+.. note::
+  Note that this tutorial will not run on Windows or recent versions of macOS.
+  To get it to run, you will need to wrap the body of this tutorial in a
+  :code:`if __name__ == "__main__":` block.
+""" + +################################################################################ +# Install dependencies +# -------------------- +# To use autotvm package in TVM, we need to install some extra dependencies. +# +# .. code-block:: bash +# +# pip3 install --user psutil xgboost cloudpickle +# +# To make TVM run faster in tuning, it is recommended to use cython as FFI of +# TVM. In the root directory of TVM, execute: +# +# .. code-block:: bash +# +# pip3 install --user cython +# sudo make cython3 +# +# Now return to python code. Begin by importing the required packages. + +import logging +import sys + +import numpy as np +import tvm +from tvm import te +import tvm.testing + +# the module is called `autotvm` +from tvm import autotvm + +################################################################################ +# Basic Matrix Multiplication with TE +# ----------------------------------- +# Recall the basic implementation of matrix multiplication using TE. We write +# it down here with a few changes. We will wrap the multiplication in a python +# function definition. For simplicity, we will focus our attention on a split +# optimization, using a fixed value that defines the block size of the +# reordering. + + +def matmul_basic(N, L, M, dtype): + + a = te.placeholder((n, l), name="a", dtype=dtype) + B = te.placeholder((L, M), name="B", dtype=dtype) + + k = te.reduce_axis((0, L), name="k") + C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C") + s = te.create_schedule(C.op) + + # schedule + y, x = s[C].op.axis + k = s[C].op.reduce_axis[0] + + yo, yi = s[C].split(y, 8) + xo, xi = s[C].split(x, 8) + + s[C].reorder(yo, xo, k, yi, xi) + + return s, [A, B, C] + + +################################################################################ +# Matrix Multiplication with AutoTVM +# ---------------------------------- +# In the previous schedule code, we use a constant "8" as the tiling factor. +# However, it might not be the best one because the best tiling factor depends +# on real hardware environment and input shape. +# +# If you want the schedule code to be portable across a wider range of input +# shapes and target hardware, it is better to define a set of candidate values +# and pick the best one according to the measurement results on target +# hardware. +# +# In autotvm, we can define a tunable parameter, or a "knob" for such kind of +# value. + +################################################################################ +# A Basic Matrix Multiplication Template +# -------------------------------------- +# We begin with an example of how to create a tunable parameter set for the +# block size of the `split` scheduling operation. + +# Matmul V1: List candidate values +@autotvm.template("tutorial/matmul_v1") # 1. use a decorator +def matmul_v1(N, L, M, dtype): + A = te.placeholder((N, L), name="A", dtype=dtype) + B = te.placeholder((L, M), name="B", dtype=dtype) + + k = te.reduce_axis((0, L), name="k") + C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C") + s = te.create_schedule(C.op) + + # schedule + y, x = s[C].op.axis + k = s[C].op.reduce_axis[0] + + # 2. get the config object + cfg = autotvm.get_config() + + # 3. define search space + cfg.define_knob("tile_y", [1, 2, 4, 8, 16]) + cfg.define_knob("tile_x", [1, 2, 4, 8, 16]) + + # 4. 
schedule according to config
+    yo, yi = s[C].split(y, cfg["tile_y"].val)
+    xo, xi = s[C].split(x, cfg["tile_x"].val)
+
+    s[C].reorder(yo, xo, k, yi, xi)
+
+    return s, [A, B, C]
+
+
+################################################################################
+# Here we make four modifications to the previous schedule code and get a
+# tunable "template". We can explain the modifications one by one.
+#
+# 1. Use a decorator to mark this function as a simple template.
+# 2. Get a config object: You can regard this :code:`cfg` as an argument of
+#    this function, but we obtain it in a different way. With this argument, this
+#    function is no longer a deterministic schedule. Instead, we can pass
+#    different configurations to this function and get different schedules. A
+#    function that uses a configuration object like this is called a "template".
+#
+#    To make the template function more compact, we can do two things to define
+#    the parameter search space within a single function.
+#
+#    1. Define a search space across a set of values. This is done by making
+#       :code:`cfg` a :any:`ConfigSpace` object. It will collect all of the
+#       tunable knobs in this function and build a search space from it.
+#    2. Schedule according to an entity in this space. This is done by making
+#       :code:`cfg` a :any:`ConfigEntity` object. When it is a
+#       :any:`ConfigEntity`, it will ignore all space definition API (namely,
+#       :code:`cfg.define_XXXXX(...)`). Instead, it will store deterministic
+#       values for all tunable knobs, and we schedule according to these values.
+#
+#    During auto-tuning, we will first call this template with a
+#    :any:`ConfigSpace` object to build the search space. Then we call this
+#    template with different :any:`ConfigEntity` in the built space to get
+#    different schedules. Finally we will measure the code generated by
+#    different schedules and pick the best one.
+#
+# 3. Define two tunable knobs. The first one is :code:`tile_y` with 5 possible
+#    values. The second one is :code:`tile_x` with the same list of possible
+#    values. These two knobs are independent, so they span a search space with
+#    size 25 = 5x5.
+# 4. The configuration knobs are passed to the :code:`split` schedule
+#    operation, allowing us to schedule according to the 5x5 deterministic values
+#    we previously defined in :code:`cfg`.
+
+################################################################################
+# A Matrix Multiplication Template with the Advanced Parameter API
+# ----------------------------------------------------------------
+# In the previous template, we manually listed all of the possible values for a
+# knob. This is the lowest level API to define the space, and gives an explicit
+# enumeration of the parameter space to search. However, we also provide
+# another set of APIs that can make the definition of the search space easier
+# and smarter. Where possible, we recommend you use this higher-level API.
+#
+# In the following example, we use :any:`ConfigSpace.define_split` to define a
+# split knob. It will enumerate all the possible ways to split an axis and
+# construct the space.
+#
+# We also have :any:`ConfigSpace.define_reorder` for reorder knobs and
+# :any:`ConfigSpace.define_annotate` for annotations like unroll, vectorization,
+# and thread binding. When the high level API cannot meet your requirements, you
+# can always fall back to using the low level API.
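+
+################################################################################
+# As a brief editorial illustration of those higher-level APIs (a sketch based
+# on our reading of the :any:`ConfigSpace` API, not code from this tutorial;
+# the knob names "reorder_yxk" and "annotate_yi" and the policy values shown
+# are assumptions made for this example), a template could declare reorder and
+# annotation knobs like this:
+#
+# .. code-block:: python
+#
+#     # inside a template body, after cfg = autotvm.get_config():
+#     # let the tuner try different orderings of the computation axes
+#     cfg.define_reorder("reorder_yxk", [yo, xo, yi, xi], policy="all")
+#     # let the tuner decide whether to unroll the inner y axis
+#     cfg.define_annotate("annotate_yi", [yi], policy="try_unroll")
+#     # apply whichever candidates the tuner proposes
+#     cfg["reorder_yxk"].apply(s, C, [yo, xo, yi, xi])
+#     cfg["annotate_yi"].apply(s, C, [yi])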
+
+
+@autotvm.template("tutorial/matmul")
+def matmul(N, L, M, dtype):
+    A = te.placeholder((N, L), name="A", dtype=dtype)
+    B = te.placeholder((L, M), name="B", dtype=dtype)
+
+    k = te.reduce_axis((0, L), name="k")
+    C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C")
+    s = te.create_schedule(C.op)
+
+    # schedule
+    y, x = s[C].op.axis
+    k = s[C].op.reduce_axis[0]
+
+    ##### define space begin #####
+    cfg = autotvm.get_config()
+    cfg.define_split("tile_y", y, num_outputs=2)
+    cfg.define_split("tile_x", x, num_outputs=2)
+    ##### define space end #####
+
+    # schedule according to config
+    yo, yi = cfg["tile_y"].apply(s, C, y)
+    xo, xi = cfg["tile_x"].apply(s, C, x)
+
+    s[C].reorder(yo, xo, k, yi, xi)
+
+    return s, [A, B, C]
+
+
+################################################################################
+# .. note:: More Explanation on :code:`cfg.define_split`
+#
+#  In this template, :code:`cfg.define_split("tile_y", y, num_outputs=2)` will
+#  enumerate all possible combinations that can split axis y into two axes with
+#  factors of the length of y. For example, if the length of y is 32 and we
+#  want to split it into two axes using factors of 32, then there are 6
+#  possible values for the (length of outer axis, length of inner axis) pair,
+#  namely (32, 1), (16, 2), (8, 4), (4, 8), (2, 16) or (1, 32). These are all 6
+#  possible values of `tile_y`.
+#
+#  During scheduling, :code:`cfg["tile_y"]` is a :code:`SplitEntity` object.
+#  We store the lengths of outer axes and inner axes in
+#  :code:`cfg['tile_y'].size` (a tuple with two elements). In this template,
+#  we apply it by using :code:`yo, yi = cfg['tile_y'].apply(s, C, y)`.
+#  Actually, this is equivalent to :code:`yo, yi = s[C].split(y,
+#  cfg["tile_y"].size[1])` or :code:`yo, yi = s[C].split(y,
+#  nparts=cfg['tile_y'].size[0])`
+#
+#  The advantage of using the cfg.apply API is that it makes multi-level splits
+#  (that is, when num_outputs >= 3) easier.
+
+################################################################################
+# Step 2: Use AutoTVM to Optimize the Matrix Multiplication
+# ---------------------------------------------------------
+# In Step 1, we wrote a matrix multiplication template that allowed us to
+# parameterize the block size used in the `split` schedule. We can now conduct
+# a search over this parameter space. The next step is to pick a tuner to guide
+# the exploration of this space.
+#
+# Auto-tuners in TVM
+# ~~~~~~~~~~~~~~~~~~
+# The job for a tuner can be described by the following pseudo code
+#
+# .. code-block:: c
+#
+#    ct = 0
+#    while ct < max_number_of_trials:
+#        propose a batch of configs
+#        measure this batch of configs on real hardware and get results
+#        ct += batch_size
+#
+# When proposing the next batch of configs, the tuner can take different
+# strategies. Some of the tuner strategies provided by TVM include:
+#
+# * :any:`RandomTuner`: Enumerate the space in a random order
+# * :any:`GridSearchTuner`: Enumerate the space in a grid search order
+# * :any:`GATuner`: Uses a genetic algorithm to search through the space
+# * :any:`XGBTuner`: Uses a model based method. Trains an XGBoost model to
+#   predict the speed of lowered IR and picks the next batch according to the
+#   prediction.
+#
+# You can choose the tuner according to the size of your space, your time
+# budget and other factors. For example, if your space is very small (less
+# than 1000), a gridsearch tuner or a random tuner is good enough. If your
+# space is at the level of 10^9 (this is the space size of a conv2d operator on
+# CUDA GPU), XGBoostTuner can explore more efficiently and find better configs.
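+
+################################################################################
+# As an editorial aside, switching strategies only changes which tuner object
+# is constructed once a task is defined. For example (illustrative only, not
+# part of the tuning flow below):
+#
+# .. code-block:: python
+#
+#     # hypothetical alternatives to the RandomTuner used below
+#     tuner = autotvm.tuner.GridSearchTuner(task)
+#     tuner = autotvm.tuner.XGBTuner(task, feature_type="knob")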
+
+################################################################################
+# Begin tuning
+# ~~~~~~~~~~~~
+# Here we continue our matrix multiplication example. First we create a tuning
+# task. We can also inspect the initialized search space. In this case, for a
+# 512x512 square matrix multiplication, the space size is 10x10=100. Note that
+# the task and search space are independent of the tuner picked.
+
+N, L, M = 512, 512, 512
+task = autotvm.task.create("tutorial/matmul", args=(N, L, M, "float32"), target="llvm")
+print(task.config_space)
+
+################################################################################
+# Then we need to define how to measure the generated code and pick a tuner.
+# Since our space is small, a random tuner is just okay.
+#
+# We only make 10 trials in this tutorial for demonstration. In practice, you
+# can do more trials according to your time budget. We will log the tuning
+# results into a log file. This file can be used to choose the best
+# configuration discovered by the tuner later.
+
+# logging config (for printing tuning log to the screen)
+logging.getLogger("autotvm").setLevel(logging.DEBUG)
+logging.getLogger("autotvm").addHandler(logging.StreamHandler(sys.stdout))
+
+################################################################################
+# There are two steps for measuring a config: build and run. By default, we use
+# all CPU cores to compile the program. We then measure them sequentially. To
+# help reduce variance, we take 5 measurements and average them.
+measure_option = autotvm.measure_option(builder="local", runner=autotvm.LocalRunner(number=5))
+
+# Begin tuning with RandomTuner, log records to file `matmul.log`
+# You can use alternatives like XGBTuner.
+tuner = autotvm.tuner.RandomTuner(task)
+tuner.tune(
+    n_trial=10,
+    measure_option=measure_option,
+    callbacks=[autotvm.callback.log_to_file("matmul.log")],
+)
+
+################################################################################
+# With tuning completed, we can choose the configuration from the log file that
+# has the best measured performance and compile the schedule with the
+# corresponding parameters. We also do a quick verification that the schedule is
+# producing correct answers. We can call the function :code:`matmul` directly
+# under the :any:`autotvm.apply_history_best` context. When we call this
+# function, it will query the dispatch context with its argument and get the
+# best config with the same argument.
+
+# apply history best from log file
+with autotvm.apply_history_best("matmul.log"):
+    with tvm.target.Target("llvm"):
+        s, arg_bufs = matmul(N, L, M, "float32")
+        func = tvm.build(s, arg_bufs)
+
+# check correctness
+a_np = np.random.uniform(size=(N, L)).astype(np.float32)
+b_np = np.random.uniform(size=(L, M)).astype(np.float32)
+c_np = a_np.dot(b_np)
+
+c_tvm = tvm.nd.empty(c_np.shape)
+func(tvm.nd.array(a_np), tvm.nd.array(b_np), c_tvm)
+
+tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-4)
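+
+################################################################################
+# As an optional extra step (an editorial addition that assumes the
+# `matmul.log` file produced above), we can also query the dispatch context
+# directly and print the best configuration the tuner found:
+
+dispatch_context = autotvm.apply_history_best("matmul.log")
+best_config = dispatch_context.query(task.target, task.workload)
+print("\nBest config:")
+print(best_config)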
+
+################################################################################
+# Final Notes and Summary
+# -----------------------
+# In this tutorial, we have shown how to build operator templates that allow
+# TVM to search a parameter space and choose optimized schedule configurations.
+# To gain a deeper understanding of how this works, we recommend expanding on
+# this example by adding new search parameters to the schedule based on
+# schedule operations demonstrated in the `Getting Started With Tensor
+# Expressions`_ tutorial. In the upcoming sections, we
+# will demonstrate the AutoScheduler, a method for TVM to optimize common
+# operators without the need for the user to provide a user-defined template.

From 474bc4e761e3bc87d69419c376042c19b6b7dbbe Mon Sep 17 00:00:00 2001
From: Tianqi Chen
Date: Fri, 26 Mar 2021 20:30:29 -0400
Subject: [PATCH 75/84] [RUNTIME][WEB] Cleanup logging for web runtime. (#7750)

* [RUNTIME][WEB] Cleanup logging for web runtime.

The log(info) won't work for web runtime due to the usage of time
function.

- Introduce TVM_LOG_CUSTOMIZE and TVM_LOG_STACK_TRACE.
- Reorganize the log customization code to wasm_runtime (so the non-gpu
  use case can apply).
- Update the testcase to cover the logging.

* Fix windows build and address comment.
---
 Makefile                                      |   2 +-
 .../app/src/main/jni/Application.mk           |   2 +-
 .../app/src/main/jni/Application.mk           |   2 +-
 .../app/src/main/jni/Application.mk           |   2 +-
 include/tvm/runtime/logging.h                 | 101 +++++++++++++-----
 src/runtime/logging.cc                        |  19 +++-
 web/apps/node/example.js                      |   3 +-
 web/emcc/tvmjs_support.cc                     |   8 +-
 web/emcc/wasm_runtime.cc                      |  30 ++++--
 web/emcc/webgpu_runtime.cc                    |  18 +---
 web/tests/node/test_packed_func.js            |   6 ++
 11 files changed, 127 insertions(+), 66 deletions(-)

diff --git a/Makefile b/Makefile
index 011dc5c31bf5b..c143c69d0a2c0 100644
--- a/Makefile
+++ b/Makefile
@@ -55,7 +55,7 @@ crttest:
	@mkdir -p build && cd build && cmake .. && $(MAKE) crttest

 # EMCC; Web related scripts
-EMCC_FLAGS= -std=c++11 -DDMLC_LOG_STACK_TRACE=0\
+EMCC_FLAGS= -std=c++11\
	-Oz -s RESERVED_FUNCTION_POINTERS=2 -s MAIN_MODULE=1 -s NO_EXIT_RUNTIME=1\
	-s TOTAL_MEMORY=1073741824\
	-s EXTRA_EXPORTED_RUNTIME_METHODS="['addFunction','cwrap','getValue','setValue']"\
diff --git a/apps/android_camera/app/src/main/jni/Application.mk b/apps/android_camera/app/src/main/jni/Application.mk
index b6124a86df873..6ac3271f49ac5 100644
--- a/apps/android_camera/app/src/main/jni/Application.mk
+++ b/apps/android_camera/app/src/main/jni/Application.mk
@@ -31,7 +31,7 @@ include $(config)
 APP_ABI ?= all
 APP_STL := c++_shared

-APP_CPPFLAGS += -DDMLC_LOG_STACK_TRACE=0 -DTVM4J_ANDROID=1 -std=c++14 -Oz -frtti
+APP_CPPFLAGS += -DTVM_LOG_STACK_TRACE=0 -DTVM4J_ANDROID=1 -std=c++14 -Oz -frtti
 ifeq ($(USE_OPENCL), 1)
 APP_CPPFLAGS += -DTVM_OPENCL_RUNTIME=1
 endif
diff --git a/apps/android_deploy/app/src/main/jni/Application.mk b/apps/android_deploy/app/src/main/jni/Application.mk
index a50a40bf5cd19..220c6af3bc7e5 100644
--- a/apps/android_deploy/app/src/main/jni/Application.mk
+++ b/apps/android_deploy/app/src/main/jni/Application.mk
@@ -27,7 +27,7 @@ include $(config)

 APP_STL := c++_static

-APP_CPPFLAGS += -DDMLC_LOG_STACK_TRACE=0 -DTVM4J_ANDROID=1 -std=c++14 -Oz -frtti
+APP_CPPFLAGS += -DTVM_LOG_STACK_TRACE=0 -DTVM4J_ANDROID=1 -std=c++14 -Oz -frtti
 ifeq ($(USE_OPENCL), 1)
 APP_CPPFLAGS += -DTVM_OPENCL_RUNTIME=1
 endif
diff --git a/apps/android_rpc/app/src/main/jni/Application.mk b/apps/android_rpc/app/src/main/jni/Application.mk
index 5f885f1c6f144..e3078906ff552 100644
--- a/apps/android_rpc/app/src/main/jni/Application.mk
+++ b/apps/android_rpc/app/src/main/jni/Application.mk
@@ -31,7 +31,7 @@ include $(config)
 APP_ABI ?= armeabi-v7a arm64-v8a x86 x86_64 mips
 APP_STL := c++_shared

-APP_CPPFLAGS += -DDMLC_LOG_STACK_TRACE=0 -DTVM4J_ANDROID=1 -std=c++14 -Oz -frtti
+APP_CPPFLAGS += -DTVM_LOG_STACK_TRACE=0 -DTVM4J_ANDROID=1 -std=c++14 -Oz -frtti ifeq ($(USE_OPENCL), 1) APP_CPPFLAGS += -DTVM_OPENCL_RUNTIME=1 endif diff --git a/include/tvm/runtime/logging.h b/include/tvm/runtime/logging.h index c5986a9baa151..08c6ff61252bd 100644 --- a/include/tvm/runtime/logging.h +++ b/include/tvm/runtime/logging.h @@ -47,6 +47,17 @@ #define TVM_THROW_EXCEPTION noexcept(false) #endif +/*! + * \brief Whether or not enable backtrace logging during a + * fatal error. + * + * \note TVM won't depend on LIBBACKTRACE or other exec_info + * library when this option is disabled. + */ +#ifndef TVM_LOG_STACK_TRACE +#define TVM_LOG_STACK_TRACE 1 +#endif + /*! * \brief Whether or not use libbacktrace library * for getting backtrace information @@ -55,6 +66,15 @@ #define TVM_USE_LIBBACKTRACE 0 #endif +/*! + * \brief Whether or not customize the logging output. + * If log customize is enabled, the user must implement + * tvm::runtime::detail::LogFatalImpl and tvm::runtime::detail::LogMessageImpl. + */ +#ifndef TVM_LOG_CUSTOMIZE +#define TVM_LOG_CUSTOMIZE 0 +#endif + // a technique that enables overriding macro names on the number of parameters. This is used // to define other macros below #define GET_MACRO(_1, _2, _3, _4, _5, NAME, ...) NAME @@ -152,13 +172,15 @@ TVM_DLL std::string Backtrace(); /*! \brief Base error type for TVM. Wraps a string message. */ class Error : public ::dmlc::Error { // for backwards compatibility public: - /*! \brief Construct an error. + /*! + * \brief Construct an error. * \param s The message to be displayed with the error. */ explicit Error(const std::string& s) : ::dmlc::Error(s) {} }; -/*! \brief Error type for errors from CHECK, ICHECK, and LOG(FATAL). This error +/*! + * \brief Error type for errors from CHECK, ICHECK, and LOG(FATAL). This error * contains a backtrace of where it occured. */ class InternalError : public Error { @@ -214,23 +236,32 @@ class InternalError : public Error { std::string full_message_; // holds the full error string }; +/*! \brief Internal implementation */ namespace detail { -#ifndef TVM_LOG_CUSTOMIZE +// Provide support for customized logging. +#if TVM_LOG_CUSTOMIZE +/*! + * \brief Custom implementations of LogFatal. + * + * \sa TVM_LOG_CUSTOMIZE + */ +TVM_DLL void LogFatalImpl(const std::string& file, int lineno, const std::string& message); + +/*! + * \brief Custom implementations of LogMessage. + * + * \sa TVM_LOG_CUSTOMIZE + */ +TVM_DLL void LogMessageImpl(const std::string& file, int lineno, const std::string& message); -/*! \brief Class to accumulate an error message and throw it. Do not use +/*! + * \brief Class to accumulate an error message and throw it. Do not use * directly, instead use LOG(FATAL). */ class LogFatal { public: LogFatal(const std::string& file, int lineno) : file_(file), lineno_(lineno) {} -#ifdef _MSC_VER -#pragma disagnostic push -#pragma warning(disable : 4722) -#endif - ~LogFatal() noexcept(false) { throw InternalError(file_, lineno_, stream_.str()); } -#ifdef _MSC_VER -#pragma disagnostic pop -#endif + ~LogFatal() TVM_THROW_EXCEPTION { LogFatalImpl(file_, lineno_, stream_.str()); } std::ostringstream& stream() { return stream_; } private: @@ -239,30 +270,39 @@ class LogFatal { int lineno_; }; -/*! \brief Class to accumulate an log message. Do not use directly, instead use +/*! + * \brief Class to accumulate an log message. Do not use directly, instead use * LOG(INFO), LOG(WARNING), LOG(ERROR). 
*/ class LogMessage { public: - LogMessage(const std::string& file, int lineno) { - std::time_t t = std::time(nullptr); - stream_ << "[" << std::put_time(std::localtime(&t), "%H:%M:%S") << "] " << file << ":" << lineno - << ": "; - } - ~LogMessage() { std::cerr << stream_.str() << std::endl; } + LogMessage(const std::string& file, int lineno) : file_(file), lineno_(lineno) {} + ~LogMessage() { LogMessageImpl(file_, lineno_, stream_.str()); } std::ostringstream& stream() { return stream_; } private: + std::string file_; + int lineno_; std::ostringstream stream_; }; + #else -// Custom implementations of LogFatal and LogMessage that allow the user to -// override handling of the message. The user must implement LogFatalImpl and LogMessageImpl -void LogFatalImpl(const std::string& file, int lineno, const std::string& message); + +/*! + * \brief Class to accumulate an error message and throw it. Do not use + * directly, instead use LOG(FATAL). + */ class LogFatal { public: LogFatal(const std::string& file, int lineno) : file_(file), lineno_(lineno) {} - ~LogFatal() TVM_THROW_EXCEPTION { LogFatalImpl(file_, lineno_, stream_.str()); } +#ifdef _MSC_VER +#pragma disagnostic push +#pragma warning(disable : 4722) +#endif + ~LogFatal() noexcept(false) { throw InternalError(file_, lineno_, stream_.str()); } +#ifdef _MSC_VER +#pragma disagnostic pop +#endif std::ostringstream& stream() { return stream_; } private: @@ -271,16 +311,21 @@ class LogFatal { int lineno_; }; -void LogMessageImpl(const std::string& file, int lineno, const std::string& message); +/*! + * \brief Class to accumulate an log message. Do not use directly, instead use + * LOG(INFO), LOG(WARNING), LOG(ERROR). + */ class LogMessage { public: - LogMessage(const std::string& file, int lineno) : file_(file), lineno_(lineno) {} - ~LogMessage() { LogMessageImpl(file_, lineno_, stream_.str()); } + LogMessage(const std::string& file, int lineno) { + std::time_t t = std::time(nullptr); + stream_ << "[" << std::put_time(std::localtime(&t), "%H:%M:%S") << "] " << file << ":" << lineno + << ": "; + } + ~LogMessage() { std::cerr << stream_.str() << std::endl; } std::ostringstream& stream() { return stream_; } private: - std::string file_; - int lineno_; std::ostringstream stream_; }; #endif diff --git a/src/runtime/logging.cc b/src/runtime/logging.cc index 0d8577a4ccdf8..227814bcddd48 100644 --- a/src/runtime/logging.cc +++ b/src/runtime/logging.cc @@ -16,17 +16,19 @@ * specific language governing permissions and limitations * under the License. */ +#include + +#include +#if TVM_LOG_STACK_TRACE #if TVM_USE_LIBBACKTRACE #include #include -#include #include #include #include -#include #include namespace tvm { @@ -141,8 +143,6 @@ std::string Backtrace() { #include -#include - namespace tvm { namespace runtime { // Fallback to the dmlc implementation when backtrace is not available. @@ -150,4 +150,13 @@ std::string Backtrace() { return dmlc::StackTrace(); } } // namespace runtime } // namespace tvm -#endif +#endif // TVM_USE_LIBBACKTRACE +#else + +namespace tvm { +namespace runtime { +// stacktrace logging is completely disabled +std::string Backtrace() { return ""; } +} // namespace runtime +} // namespace tvm +#endif // TVM_LOG_STACK_TRACE diff --git a/web/apps/node/example.js b/web/apps/node/example.js index f81a9c903e5d8..cff76d8a067ed 100644 --- a/web/apps/node/example.js +++ b/web/apps/node/example.js @@ -31,7 +31,8 @@ const wasmSource = fs.readFileSync(path.join(wasmPath, "tvmjs_runtime.wasm")); // the async version of the API. 
tvmjs.instantiate(wasmSource, new EmccWASI()) .then((tvm) => { + const log_info = tvm.getGlobalFunc("testing.log_info_str"); + log_info("hello world"); // List all the global functions from the runtime. console.log("Runtime functions using EmccWASI\n", tvm.listGlobalFuncNames()); }); - diff --git a/web/emcc/tvmjs_support.cc b/web/emcc/tvmjs_support.cc index 6f4d2e5932daf..77ce6be66e630 100644 --- a/web/emcc/tvmjs_support.cc +++ b/web/emcc/tvmjs_support.cc @@ -25,14 +25,10 @@ */ // configurations for tvm logging -#define DMLC_LOG_CUSTOMIZE 0 -#define DMLC_LOG_STACK_TRACE 0 -#define DMLC_LOG_DEBUG 0 -#define DMLC_LOG_NODATE 1 -#define DMLC_LOG_FATAL_THROW 0 +#define TVM_LOG_STACK_TRACE 0 #define TVM_LOG_DEBUG 0 +#define TVM_LOG_CUSTOMIZE 1 #define DMLC_USE_LOGGING_LIBRARY -#define TVM_USE_LIBBACKTRACE 0 #include #include diff --git a/web/emcc/wasm_runtime.cc b/web/emcc/wasm_runtime.cc index 287604e110a9b..ba880e7b5c0ae 100644 --- a/web/emcc/wasm_runtime.cc +++ b/web/emcc/wasm_runtime.cc @@ -23,14 +23,10 @@ */ // configurations for tvm logging -#define DMLC_LOG_CUSTOMIZE 0 -#define DMLC_LOG_STACK_TRACE 0 -#define DMLC_LOG_DEBUG 0 -#define DMLC_LOG_NODATE 1 -#define DMLC_LOG_FATAL_THROW 0 +#define TVM_LOG_STACK_TRACE 0 #define TVM_LOG_DEBUG 0 +#define TVM_LOG_CUSTOMIZE 1 #define DMLC_USE_LOGGING_LIBRARY -#define TVM_USE_LIBBACKTRACE 0 #include #include @@ -40,9 +36,11 @@ #include "src/runtime/file_utils.cc" #include "src/runtime/graph/graph_runtime.cc" #include "src/runtime/library_module.cc" +#include "src/runtime/logging.cc" #include "src/runtime/module.cc" #include "src/runtime/ndarray.cc" #include "src/runtime/object.cc" +#include "src/runtime/profiling.cc" #include "src/runtime/registry.cc" #include "src/runtime/rpc/rpc_channel.cc" #include "src/runtime/rpc/rpc_endpoint.cc" @@ -67,11 +65,31 @@ int TVMBackendParallelBarrier(int task_id, TVMParallelGroupEnv* penv) { return 0 // --- Environment PackedFuncs for testing --- namespace tvm { namespace runtime { +namespace detail { +// Override logging mechanism +void LogFatalImpl(const std::string& file, int lineno, const std::string& message) { + std::cerr << "[FATAL] " << file << ":" << lineno << ": " << message << std::endl; + abort(); +} + +void LogMessageImpl(const std::string& file, int lineno, const std::string& message) { + std::cout << "[INFO] " << file << ":" << lineno << ": " << message << std::endl; +} + +} // namespace detail TVM_REGISTER_GLOBAL("testing.echo").set_body([](TVMArgs args, TVMRetValue* ret) { *ret = args[0]; }); +TVM_REGISTER_GLOBAL("testing.log_info_str").set_body([](TVMArgs args, TVMRetValue* ret) { + LOG(INFO) << args[0].operator String(); +}); + +TVM_REGISTER_GLOBAL("testing.log_fatal_str").set_body([](TVMArgs args, TVMRetValue* ret) { + LOG(FATAL) << args[0].operator String(); +}); + TVM_REGISTER_GLOBAL("testing.add_one").set_body_typed([](int x) { return x + 1; }); TVM_REGISTER_GLOBAL("testing.wrap_callback").set_body([](TVMArgs args, TVMRetValue* ret) { diff --git a/web/emcc/webgpu_runtime.cc b/web/emcc/webgpu_runtime.cc index 3dc8be251f99c..073c613bd2c22 100644 --- a/web/emcc/webgpu_runtime.cc +++ b/web/emcc/webgpu_runtime.cc @@ -23,12 +23,10 @@ */ // configurations for tvm logging +#define TVM_LOG_STACK_TRACE 0 #define TVM_LOG_DEBUG 0 +#define TVM_LOG_CUSTOMIZE 1 #define DMLC_USE_LOGGING_LIBRARY -#define TVM_BACKTRACE_DISABLED 1 -#define TVM_LOG_DEBUG 0 -#define DMLC_USE_LOGGING_LIBRARY -#define TVM_USE_LIBBACKTRACE 0 #include #include @@ -45,18 +43,6 @@ namespace tvm { namespace runtime { -namespace detail { 
-// Override logging mechanism -void LogFatalImpl(const std::string& file, int lineno, const std::string& message) { - std::cerr << file << ":" << lineno << ": " << message << std::endl; - abort(); -} - -void LogMessageImpl(const std::string& file, int lineno, const std::string& message) { - std::cerr << file << ":" << lineno << ": " << message << std::endl; -} - -} // namespace detail /*! \brief Thread local workspace */ class WebGPUThreadEntry { diff --git a/web/tests/node/test_packed_func.js b/web/tests/node/test_packed_func.js index 87b48df3d67a4..6e0546f39df15 100644 --- a/web/tests/node/test_packed_func.js +++ b/web/tests/node/test_packed_func.js @@ -122,3 +122,9 @@ test("NDArrayCbArg", () => { fcheck(x); assert(use_count(x) == 1); }); + +test("Logging", () => { + const log_info = tvm.getGlobalFunc("testing.log_info_str"); + log_info("helow world") + log_info.dispose(); +}); From dc81767f9e59fe8ed832fd9b52ed39ac61e869e6 Mon Sep 17 00:00:00 2001 From: Bohan Hou <32121147+spectrometerHBH@users.noreply.github.com> Date: Sat, 27 Mar 2021 20:26:05 +0800 Subject: [PATCH 76/84] [ARITH] detect iter affine map with predicate (#7752) --- include/tvm/arith/analyzer.h | 10 + include/tvm/arith/iter_affine_map.h | 5 +- include/tvm/tir/analysis.h | 6 + python/tvm/arith/iter_affine_map.py | 14 +- src/arith/analyzer.cc | 7 + src/arith/expr_complexity.cc | 53 ++ src/arith/iter_affine_map.cc | 453 ++++++++++++++---- src/arith/solve_linear_inequality.cc | 51 +- .../unittest/test_arith_iter_affine_map.py | 104 +++- 9 files changed, 547 insertions(+), 156 deletions(-) create mode 100644 src/arith/expr_complexity.cc diff --git a/include/tvm/arith/analyzer.h b/include/tvm/arith/analyzer.h index cd20bdcf4d1a8..adb037bfd0507 100644 --- a/include/tvm/arith/analyzer.h +++ b/include/tvm/arith/analyzer.h @@ -458,6 +458,16 @@ class TVM_DLL Analyzer { * \note Analyzer will call into sub-analyzers to get the result. */ bool CanProveLess(const PrimExpr& expr, int64_t upper_bound); + /*! + * \brief Whether can we prove lhs == rhs. + * + * \param lhs The input lhs. + * \param rhs The input rhs. + * \return Whether we can prove lhs == rhs. + * + * \note Analyzer will call into sub-analyzers to get the result. + */ + bool CanProveEqual(const PrimExpr& lhs, const PrimExpr& rhs); /*! * \brief Whether can we prove condition. * diff --git a/include/tvm/arith/iter_affine_map.h b/include/tvm/arith/iter_affine_map.h index e2e081d2be89c..f786c013443ca 100644 --- a/include/tvm/arith/iter_affine_map.h +++ b/include/tvm/arith/iter_affine_map.h @@ -136,6 +136,7 @@ class IterMark : public ObjectRef { TVM_DLL IterMark(PrimExpr source, PrimExpr extent); TVM_DEFINE_OBJECT_REF_METHODS(IterMark, ObjectRef, IterMarkNode); + TVM_DEFINE_OBJECT_REF_COW_METHOD(IterMarkNode); }; /*! @@ -259,7 +260,6 @@ class IterSumExpr : public IterMapExpr { /*! * \brief Detect if indices can be written as - * * [y_0 + c_0, y_1 + c_1, ..., y_n + c_n] * * Here y = some-quasi-affine-iter-map(input_iters) @@ -272,12 +272,15 @@ class IterSumExpr : public IterMapExpr { * * \param indices The indices to detect pattern for. * \param input_iters Map from variable to iterator's range. + * \param predicate The predicate constraints on the input iterators + * \param require_bijective A boolean flag that indicates whether the mapping should be bijective. * \param analyzer Analyzer used to get context information. * * \return The detected pattern if a match exists, * otherwise return an empty array. 
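 *
 * \note Editorial illustration (an assumption for exposition, not an excerpt
 *       from this patch): given x in [0, 8) and y in [0, 4), the indices
 *       [(x * 4 + y) / 4, (x * 4 + y) % 4] would be detected as the mapped
 *       iterators [x, y], while indices such as [x, x + 1, y] violate the
 *       independence requirement and yield an empty array.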
 */
Array<IterSumExpr> DetectIterMap(const Array<PrimExpr>& indices, const Map<Var, Range>& input_iters,
+                                 const PrimExpr& predicate, bool require_bijective,
                                  arith::Analyzer* analyzer);

}  // namespace arith
}  // namespace tvm
diff --git a/include/tvm/tir/analysis.h b/include/tvm/tir/analysis.h
index 1692a8cdacf31..250a84e782a2c 100644
--- a/include/tvm/tir/analysis.h
+++ b/include/tvm/tir/analysis.h
@@ -172,6 +172,12 @@ TVM_DLL bool VerifyGPUCode(const PrimFunc& func, Map<String, PrimExpr> constrain
 Array<Array<BufferRegion>> GetBlockAccessRegion(const Block& block,
                                                 const Map<Var, Buffer>& buffer_var_map);

+/*!
+ * \brief Calculate the expression complexity based on the number of symbols it contains.
+ * \param expr The expr to be calculated.
+ */
+TVM_DLL size_t CalculateExprComplexity(const PrimExpr& expr);
+
 // Pass variants of verification analysis
 // directly throws RuntimeError when verification fails.
 namespace transform {
diff --git a/python/tvm/arith/iter_affine_map.py b/python/tvm/arith/iter_affine_map.py
index 123d9b85480a8..4033d797dff89 100644
--- a/python/tvm/arith/iter_affine_map.py
+++ b/python/tvm/arith/iter_affine_map.py
@@ -88,21 +88,27 @@ def __init__(self, args, base):
         self.__init_handle_by_constructor__(_ffi_api.IterSumExpr, args, base)


-def detect_iter_map(indices, input_iters):
-    """Detect if indices can be written mapped iters from input_iters.
+def detect_iter_map(indices, input_iters, predicate=True, require_bijective=False):
+    """Detect if indices can be written as mapped iters from input iters

     Parameters
     ----------
     indices : List[PrimExpr]
-        The input indices.
+        The input indices

     input_iters : Map[Var, Range]
         The domain of each input iterator.

+    predicate : PrimExpr
+        The predicate constraints on the input iterators
+
+    require_bijective : bool
+        A boolean flag that indicates whether the mapping should be bijective
+
     Returns
     -------
     results : List[IterSumExpr]
         The iter map matching result.
         Empty array if no match can be found.
     """
-    return _ffi_api.DetectIterMap(indices, input_iters)
+    return _ffi_api.DetectIterMap(indices, input_iters, predicate, require_bijective)
diff --git a/src/arith/analyzer.cc b/src/arith/analyzer.cc
index 9737b53703fd6..08e32f576299d 100644
--- a/src/arith/analyzer.cc
+++ b/src/arith/analyzer.cc
@@ -100,6 +100,13 @@ bool Analyzer::CanProveLess(const PrimExpr& expr, int64_t upper_bound) {
   return false;
 }

+bool Analyzer::CanProveEqual(const PrimExpr& lhs, const PrimExpr& rhs) {
+  const auto* clhs = lhs.as<IntImmNode>();
+  const auto* crhs = rhs.as<IntImmNode>();
+  if (clhs && crhs) return clhs->value == crhs->value;
+  return CanProve(lhs - rhs == 0);
+}
+
 bool Analyzer::CanProve(const PrimExpr& expr) {
   if (const auto* ptr = expr.as<IntImmNode>()) {
     return ptr->value != 0;
diff --git a/src/arith/expr_complexity.cc b/src/arith/expr_complexity.cc
new file mode 100644
index 0000000000000..e809668bb624d
--- /dev/null
+++ b/src/arith/expr_complexity.cc
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file tir/analysis/expr_complexity.cc + * \brief Calculate expr complexity. + */ +#include +#include + +namespace tvm { +namespace tir { + +/*! \brief Count the size of the PrimExpr. */ +class PrimExprSizeCounter : public ExprVisitor { + public: + PrimExprSizeCounter() = default; + + static size_t Count(const PrimExpr& expr) { + PrimExprSizeCounter prim_expr_size_counter; + prim_expr_size_counter.VisitExpr(expr); + return prim_expr_size_counter.counter_; + } + + private: + void VisitExpr(const PrimExpr& expr) final { + counter_++; + ExprVisitor::VisitExpr(expr); + } + + size_t counter_{0}; +}; + +size_t CalculateExprComplexity(const PrimExpr& expr) { return PrimExprSizeCounter::Count(expr); } + +} // namespace tir +} // namespace tvm diff --git a/src/arith/iter_affine_map.cc b/src/arith/iter_affine_map.cc index 7efdd03fa11eb..3757b5eb0d51e 100644 --- a/src/arith/iter_affine_map.cc +++ b/src/arith/iter_affine_map.cc @@ -29,6 +29,7 @@ #include "../support/utils.h" #include "const_fold.h" +#include "pattern_match.h" namespace tvm { namespace arith { @@ -123,11 +124,9 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) }); /*! - * \brief Collector that collects - * the outgoing split reference of each IterMark. + * \brief Collector that collects the outgoing split reference of each IterMark. * - * These out-going splits can then be used to - * check if the iterators are independent. + * These out-going splits can then be used to check if the iterators are independent. */ class IterMarkSplitCollector { public: @@ -161,8 +160,7 @@ class IterMarkSplitCollector { } }; -// Rewriter to rewrite PrimExpr to IterMapExpr -// when possible +/*! \brief Rewriter to rewrite PrimExpr to IterMapExpr when possible */ class IterMapRewriter : public ExprMutator { public: using Parent = ExprMutator; @@ -170,16 +168,19 @@ class IterMapRewriter : public ExprMutator { explicit IterMapRewriter(Analyzer* analyzer, const Map& input_iters) : analyzer_(analyzer) { for (auto kv : input_iters) { - const auto& vrng = kv.second; - if (is_zero(vrng->min)) { - IterMark mark(kv.first, vrng->extent); - var_map_[kv.first] = IterSplitExpr(mark); + const Var& var = kv.first; + const Range& vrng = kv.second; + if (is_one(vrng->extent)) { + var_map_[var] = IterSumExpr({}, vrng->min); + } else if (is_zero(vrng->min)) { + IterMark mark(var, vrng->extent); + var_map_[var] = IterSplitExpr(mark); input_marks_.push_back(mark); } else { - IterMark mark(kv.first - vrng->min, vrng->extent); - auto sum_expr = ToIterSumExpr(IterSplitExpr(mark)); + IterMark mark(var - vrng->min, vrng->extent); + IterSumExpr sum_expr = ToIterSumExpr(IterSplitExpr(mark)); sum_expr.CopyOnWrite()->base = vrng->min; - var_map_[kv.first] = sum_expr; + var_map_[var] = sum_expr; input_marks_.push_back(mark); } } @@ -187,33 +188,88 @@ class IterMapRewriter : public ExprMutator { size_t unresolved_count() const { return unresolved_count_; } - IterSumExpr Rewrite(PrimExpr expr) { + IterSumExpr Rewrite(const PrimExpr& expr) { return NormalizeToIterWithOffset(ToIterSumExpr(DirectMutate(expr))); } - bool CheckBijective(const Array& indices) { - // This function checks two conditions: - // - C0: Each iter mark should be fully covered by non-overlapping splits. - // - C1: All of the input iterators are used. 
- // - // Example: given x in [0, 8) y in [0, 6) - // - indices = [x, x+1, y] won't pass because x and x+1 contribute - // two splits that overlaps with each other. - // - indices = [x / 4, x % 4, y] will pass because x / 4 and x % 4 - // contribute two non-overlapping splits that covers x. - // - indices = [x / 4, x % 4] won't pass because y is not used. - // + IterSumExpr RewriteIterConstraint(const PrimExpr& expr, + const PrimExpr& predicate_induced_extent) { + return NormalizeToIterOnBoundExpr(ToIterSumExpr(DirectMutate(expr)), predicate_induced_extent); + } + + /*! + * \brief If require_bijective is true, this function checks two conditions: + * - C0: Each iter mark should be fully covered by non-overlapping splits. + * - C1: All of the input iterators are used. + * Example: given x in [0, 8) y in [0, 6) + * - bindings = [x, x + 1, y] won't pass because x and x+1 contribute + * two splits that overlaps with each other. + * - bindings = [x / 4, x % 4, y] will pass because x / 4 and x % 4 + * contribute two non-overlapping splits that covers x. + * - bindings = [x / 4, x % 4] won't pass because y is not used. + * + * If require_bijective is false, this function checks one condition: + * - C0: Each iter mark has a chance to be fully covered by non-overlapping splits. + * Example: given x in [0, 8) y in [0, 6) + * - bindings = [x / 4] will pass because x / 4 can be one split of x + * - bindings = [x / 4, x % 4] will pass because x / 4 and x % 4 + * contribute two non-overlapping splits that covers x. + * - bindings = [x / 3] will not pass because x / 3 can not be one split of x + * \return whether the bindings are valid + */ + bool CheckMapping(const Array& bindings, bool require_bijective) { IterMarkSplitCollector collector; // We can check that for each iter mark: - // All the splits that refers to the itermark covers its extent. + // All the splits that refers to the iter_mark covers its extent. // The splits do not overlap with each other. - collector.Collect(indices); + collector.Collect(bindings); for (const IterMark& mark : collector.visited_) { - if (TryNormalizeSplits(mark, collector.mark2splits_[mark]).empty()) return false; + if (TryNormalizeSplits(mark, collector.mark2splits_[mark], require_bijective).empty()) + return false; } - // all input marks must be visited - for (const auto& mark : input_marks_) { - if (collector.visited_.count(mark) == 0) return false; + if (require_bijective) { + // all input marks must be visited + for (const IterMark& mark : input_marks_) { + if (collector.visited_.count(mark) == 0) return false; + } + } + return true; + } + + /*! + * \brief Check the validity of iterator constraints + * The flattened forms of two different iterator constraints + * either 1) follow inclusion relation or 2) have no intersection + * + * For Example, x = i0*30 + i1*15 + i2*3 + i3, + * 1) [i0*2 + i1 < 3, i2*3 + i3 < 5] is valid, since {i0, i1} \intersect {i2, i3} = empty set. 
+ * 2) [i0*2 + i1 < 3, i1*5 + i2 < 5] is not valid, + * since {i0, i1} \intersect {i1, i2} = {i1}, i0 \in {i0, i1}, i0 \notin {i1, i2} + * \return whether the predicates are valid; + */ + bool CheckConstraints() const { + // the constrained_iters_flattened_ are in the order of shorter to longer + // since we visit the predicates in the order of size + for (size_t i = 0; i < constrained_iters_flattened_.size(); ++i) { + for (size_t j = i + 1; j < constrained_iters_flattened_.size(); ++j) { + // state: 0(start), -1(no intersection), 1(inclusion) + int state = 0; + for (const IterSplitExpr& arg1 : constrained_iters_flattened_[i]->args) { + bool found = false; + for (const IterSplitExpr& arg2 : constrained_iters_flattened_[j]->args) { + if (IterSplitEqual(arg1, arg2)) { + found = true; + break; + } + } + // Check either it is inclusion or intersection, but not both + if (state == 0) { + state = found ? 1 : -1; + } else if ((state == -1 && found) || (state == 1 && !found)) { + return false; + } + } + } } return true; } @@ -243,25 +299,30 @@ class IterMapRewriter : public ExprMutator { size_t operator()(const IterSumExpr& value) const { // for now only hash on source index. size_t hash = value->args.size(); - for (const auto& arg : value->args) { + for (const IterSplitExpr& arg : value->args) { hash = support::HashCombine(hash, std::hash()(arg->source.get())); } return hash; } }; + static bool IterSplitEqual(const IterSplitExpr& lhs, const IterSplitExpr& rhs, + bool check_scale = true) { + tir::ExprDeepEqual equal; + if (!lhs->source.same_as(rhs->source)) return false; + if (!equal(lhs->lower_factor, rhs->lower_factor)) return false; + if (check_scale && !equal(lhs->scale, rhs->scale)) return false; + if (!equal(lhs->extent, rhs->extent)) return false; + return true; + } + struct IterSumEqual { bool operator()(const IterSumExpr& lhs, const IterSumExpr& rhs) const { tir::ExprDeepEqual equal; if (lhs->args.size() != rhs->args.size()) return false; if (!equal(lhs->base, rhs->base)) return false; for (size_t i = 0; i < lhs->args.size(); ++i) { - auto lvalue = lhs->args[i]; - auto rvalue = rhs->args[i]; - if (!lvalue->source.same_as(rvalue->source)) return false; - if (!equal(lvalue->lower_factor, rvalue->lower_factor)) return false; - if (!equal(lvalue->scale, rvalue->scale)) return false; - if (!equal(lvalue->extent, rvalue->extent)) return false; + if (!IterSplitEqual(lhs->args[i], rhs->args[i])) return false; } return true; } @@ -275,19 +336,64 @@ class IterMapRewriter : public ExprMutator { std::unordered_map var_map_; // input iter marks std::vector input_marks_; - // The canonical map for sum - std::unordered_map sum_fuse_map_; + // The map for sum that maps flattened form to IterMark with normal form and extent + // Example: expr = i*9 + j*2 + k, i in [0, 4) j in [0, 5) k in [0, 2) + // predicate: j*2 + k < 9 + // Then, flattened form = IterSum(IterSplit(i, scale=9), + // IterSplit(j, scale=2), + // IterSplit(k, scale=1)) + // normal form = IterSum(IterSplit(i, scale=9), + // IterSplit(IterMark(IterSum(IterSplit(j, scale=2), + // IterSplit(k, scale=1)), + // extent=9) + // scale=1)) + std::unordered_map sum_fuse_map_; + // The map for sum that maps normal form to flattened form + std::unordered_map flattened_map_; + // The flattened forms of constrained iters + std::vector constrained_iters_flattened_; /*! - * \brief Verify that splits fully covers mark in a non-overlapping fashion. - * If verification passes, return splits from outermost to inner most order. 
- * If not, return an empty array + * \brief Look for a split in splits that is not used such that its lower_factor is smallest. + * Note that here we use division to compare lower_factor. + * \param splits the split array to search in. + * \param used the input used array. + * \param expected_lower_factor the skipped lower factor. + * \return the index of the expected split, split.size() if not found. + */ + size_t SearchSkipLowerFactor(const std::vector& splits, + const std::vector& used, + const PrimExpr& expected_lower_factor) { + size_t res = splits.size(); + for (size_t i = 0; i < splits.size(); ++i) { + if (used[i]) continue; + if (!used[i] && !CanProveDivisible(splits[i]->lower_factor, expected_lower_factor)) { + // all the remaining unused splits should have their lower factor divisible + return splits.size(); + } + if (res == splits.size() || + CanProveDivisible(splits[res]->lower_factor, splits[i]->lower_factor)) { + // note down the split with smaller lower factor + res = i; + } + } + return res; + } + + /*! + * \brief If bijective is required, verify that splits fully covers mark in a non-overlapping + * fashion, If not, verify that splits are valid and compatible for the mark. + * If verification passes, return splits from outermost to innermost order. + * If not, return an empty array. * \param mark The iterator of interest. * \param splits The splits to be verified. + * \param require_bijective A boolean flag that indicates whether the bindings should be + * bijective. * \return The normalized splits. */ Array TryNormalizeSplits(const IterMark& mark, - const std::vector& splits) { + const std::vector& splits, + bool require_bijective) { std::vector used(splits.size(), false); std::vector iters; PrimExpr expected_lower_factor = make_const(mark->source->dtype, 1); @@ -296,31 +402,83 @@ class IterMapRewriter : public ExprMutator { size_t j = 0; for (; j < splits.size(); ++j) { if (used[j]) continue; - if (!used[j] && CanProveEqual(splits[j]->lower_factor, expected_lower_factor)) break; + if (!used[j] && analyzer_->CanProveEqual(splits[j]->lower_factor, expected_lower_factor)) + break; } if (j == splits.size()) { - return Array(); + // we do not allow incomplete split if the bindings should be bijective + if (require_bijective) return Array(); + // look for the next split skipping this lower factor + // For example, y \in [0, 24) has 3 splits [y / 6, (y / 2) % 6, y % 2] + // It is valid to only have [y / 6, y % 2] if bijective is not required + // We can skip (y / 2) % 6 + j = SearchSkipLowerFactor(splits, used, expected_lower_factor); + // split not found + if (j == splits.size()) return Array(); } used[j] = true; iters.push_back(splits[j]); - expected_lower_factor *= splits[j]->extent; + expected_lower_factor = splits[j]->lower_factor * splits[j]->extent; + } + // Case 1. bijective is required. + // We check the extent we calculate is consistent with the extent of the mark + // Case 2. bijective is not required. + // We check the extent we calculate is a factor of the extent of the mark + // For example, y \in [0, 24) [(y / 2) % 6, y % 2] is valid, but y \in [0, 25) is not. + if ((require_bijective && !analyzer_->CanProveEqual(expected_lower_factor, mark->extent)) || + (!require_bijective && !CanProveDivisible(mark->extent, expected_lower_factor))) { + return Array(); } - if (!CanProveEqual(expected_lower_factor, mark->extent)) return Array(); return Array(iters.rbegin(), iters.rend()); } + /*! 
+ * \brief Normalize the left hand side of an iter constraint (expr < predicate_induced_extent)
+ * \param expr The left hand side of the iter constraint.
+ * \param predicate_induced_extent The extent induced by the iter constraint.
+ * \return The normalized expression.
+ */
+ IterSumExpr NormalizeToIterOnBoundExpr(IterSumExpr expr,
+ const PrimExpr& predicate_induced_extent) {
+ // We are normalizing the left hand side of an iter constraint (iter < predicate_induced_extent)
+ Optional<IterSplitExpr> opt = TryFuseIters(expr);
+ // scale should be 1
+ if (opt.defined() && is_one(opt.value()->scale)) {
+ IterSumExpr sum = Downcast<IterSumExpr>(opt.value()->source->source);
+ // get the flattened form
+ auto it = flattened_map_.find(sum);
+ ICHECK(it != flattened_map_.end());
+ IterSumExpr flattened_form = it->second;
+ // get the mark
+ auto it_mark = sum_fuse_map_.find(flattened_form);
+ ICHECK(it_mark != sum_fuse_map_.end());
+ IterMark mark = it_mark->second;
+ mark.CopyOnWrite()->extent = min(predicate_induced_extent, mark->extent);
+ // update the bound of the lhs based on predicate_induced_extent
+ sum_fuse_map_[flattened_form] = mark;
+ // we need to note down the flattened form of constrained iterators
+ // to check the validity of constraints, see also CheckConstraints()
+ constrained_iters_flattened_.push_back(flattened_form);
+ expr.CopyOnWrite()->args = Array<IterSplitExpr>({opt.value()});
+ return expr;
+ }
+ ++unresolved_count_;
+ return expr;
+ }
+
 /*!
 * \brief Normalize expr to an iterator + offset.
 * \param expr The input expression.
 * \return The Normalized expression.
 */
 IterSumExpr NormalizeToIterWithOffset(IterSumExpr expr) {
+ // We are normalizing a regular iter
 if (expr->args.size() <= 1) return expr;
 PrimExpr base = expr->base;
 expr.CopyOnWrite()->base = make_zero(expr->dtype);
- auto opt = TryFuseIters(expr);
+ Optional<IterSplitExpr> opt = TryFuseIters(expr);
 expr.CopyOnWrite()->base = base;
- if (opt) {
+ if (opt.defined()) {
 expr.CopyOnWrite()->args = Array<IterSplitExpr>({opt.value()});
 return expr;
 } else {
@@ -329,13 +487,6 @@ class IterMapRewriter : public ExprMutator {
 }
 }

- bool CanProveEqual(PrimExpr lhs, PrimExpr rhs) {
- const auto* clhs = lhs.as<IntImmNode>();
- const auto* crhs = rhs.as<IntImmNode>();
- if (clhs && crhs) return clhs->value == crhs->value;
- return analyzer_->CanProve(lhs - rhs == 0);
- }
-
 /*!
 * \brief Create a IterSumExpr from expr.
 * \param expr The input expr.
@@ -352,22 +503,24 @@ class IterMapRewriter : public ExprMutator {
 }
 }

- // Try to normalize IterSum into a fused IterMark
- // return a corresponding splitexpr if needed.
- // IterSum = x1*c1 + x2*c2 + ... + xn*cn
- // = (x1*s1 + x2*s2 + ... + xn)*cn
- // = y*cn (IterMark y => x1*s1 + x2*s2 + ... + xn)
- // = [IterSplit(IterMark(y), scale=cn)]
- // return a corresponding IterSplitExpr if needed.
+ /*!
+ * \brief Try to normalize an IterSum into a fused IterMark
+ * IterSum = x1*c1 + x2*c2 + ... + xn*cn
+ * = (x1*s1 + x2*s2 + ... + xn)*cn
+ * = y*cn (IterMark y => x1*s1 + x2*s2 + ... + xn)
+ * = [IterSplit(IterMark(y), scale=cn)]
+ * and return the corresponding IterSplitExpr if needed.
+ * \param expr The input sum.
+ * \return The split with the fused IterMark if fusion succeeds.
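+ * Example (illustrative): expr = x*4 + y with x in [0, 3), y in [0, 4)
+ * fuses into a single IterMark over x*4 + y with extent 12, and the
+ * returned split is IterSplitExpr(mark, scale=1).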
+ */ Optional TryFuseIters(IterSumExpr expr) { if (!is_zero(expr->base)) return NullOpt; if (expr->args.size() == 1) return expr->args[0]; // select the iterators in order std::vector visited(expr->args.size(), false); - std::vector iters; - iters.reserve(expr->args.size()); - // canonicalize the expression - // find the base scale first + std::vector flattened_iters, grouped_iters; + // canonicalize the expression into two different forms: flattened form and structured form + // step0. check if find the base scale first Optional base_scale = NullOpt; size_t base_index = 0; for (size_t i = 0; i < expr->args.size(); ++i) { @@ -381,35 +534,87 @@ class IterMapRewriter : public ExprMutator { if (!base_scale) return NullOpt; // check if it can be remapped into a fused pattern. PrimExpr expected_scale = base_scale.value(); - for (size_t i = 0; i < expr->args.size(); ++i) { + for (size_t i = 0; i < expr->args.size();) { + // find j such that expr->args[j] has expected scale size_t j = i == 0 ? base_index : 0; for (; j < expr->args.size(); ++j) { - if (!visited[j] && CanProveEqual(expr->args[j]->scale, expected_scale)) break; + if (!visited[j] && analyzer_->CanProveEqual(expr->args[j]->scale, expected_scale)) break; + } + if (j == expr->args.size()) return NullOpt; + // look for the longest constrained iter started from expr->args[j] + // Example: expr = i*9 + j*2 + k, i in [0, 4) j in [0, 5) k in [0, 2) + // predicate: j*2 + k < 9 + // We need to match the predicate in expr and adjust the expected scale, + // otherwise we expect the scale of i to be 2*5=10 + Optional constraint_to_match; + for (const IterSumExpr& iter : constrained_iters_flattened_) { + if (IterSplitEqual(expr->args[j], iter->args.back(), false)) { + // find a predicate started from expr->args[j] + if (!constraint_to_match || + constraint_to_match.value()->args.size() < iter->args.size()) { + constraint_to_match = iter; + } + } } - if (j == expr->args.size()) { - return NullOpt; + if (constraint_to_match) { + // match the predicate and mark the iterators in the constraint_to_match as visited + // Example: expr = i*9 + j*2 + k, i in [0, 4) j in [0, 5) k in [0, 2) + // predicate = j*2 + k < 9 + // then j*2 + k matches the lower two splits of expr + for (auto it = constraint_to_match.value()->args.rbegin(); + it != constraint_to_match.value()->args.rend(); ++it) { + size_t k = 0; + for (; k < expr->args.size(); ++k) { + if (!visited[k] && IterSplitEqual(expr->args[k], *it, false)) { + if (analyzer_->CanProveEqual((*it)->scale * expected_scale, expr->args[k]->scale)) + break; + } + } + if (k == expr->args.size()) return NullOpt; + visited[k] = true; + flattened_iters.push_back(expr->args[k]); + } + auto iter = sum_fuse_map_.find(constraint_to_match.value()); + ICHECK(iter != sum_fuse_map_.end()); + IterMark iter_matched = iter->second; + grouped_iters.emplace_back(iter_matched, expected_scale); + expected_scale *= iter_matched->extent; + // move forward + i += constraint_to_match.value()->args.size(); + } else { + // constraint_to_match not found, skip this iterator + visited[j] = true; + flattened_iters.push_back(expr->args[j]); + grouped_iters.push_back(expr->args[j]); + expected_scale *= expr->args[j]->extent; + ++i; } - visited[j] = true; - auto arg = expr->args[j]; - arg.CopyOnWrite()->scale = div(expr->args[j]->scale, base_scale.value()); - iters.push_back(arg); - expected_scale *= expr->args[j]->extent; - } - // update the iterator to use the canonicalized form - expr.CopyOnWrite()->args = Array(iters.rbegin(), iters.rend()); 
- auto it = sum_fuse_map_.find(expr); - if (it != sum_fuse_map_.end()) return it->second; - auto mark = IterMark(expr, div(expected_scale, base_scale.value())); - IterSplitExpr split(mark, base_scale.value()); - sum_fuse_map_[expr] = split; - return split; + } + // Get the flattened form and structured form + // both forms have splits from outermost to innermost + IterSumExpr structured_form = expr, flattened_form = expr; + flattened_form.CopyOnWrite()->args = + Array(flattened_iters.rbegin(), flattened_iters.rend()); + structured_form.CopyOnWrite()->args = + Array(grouped_iters.rbegin(), grouped_iters.rend()); + auto it = sum_fuse_map_.find(flattened_form); + if (it != sum_fuse_map_.end()) { + // old iter + return IterSplitExpr(it->second, base_scale.value()); + } else { + // new iter, form a new mark + IterMark mark = IterMark(structured_form, div(expected_scale, base_scale.value())); + sum_fuse_map_[flattened_form] = mark; + flattened_map_[structured_form] = flattened_form; + return IterSplitExpr(mark, base_scale.value()); + } } bool CanProveDivisible(const PrimExpr& lhs, const PrimExpr& rhs) { const auto* clhs = lhs.as(); const auto* crhs = rhs.as(); if (clhs && crhs) return clhs->value % crhs->value == 0; - return analyzer_->CanProve(floormod(lhs, rhs) == 0); + return analyzer_->CanProveEqual(lhs, rhs) || analyzer_->CanProve(floormod(lhs, rhs) == 0); } PrimExpr SplitFloorDivConst(IterSplitExpr lhs, PrimExpr rhs, const PrimExpr& orig); @@ -459,27 +664,87 @@ class IterMapRewriter : public ExprMutator { } }; +/*! \brief An internal struct to represent range extent on iterators(iter < upper_bound). */ +struct IterConstraint { + // The expr of the iter + PrimExpr iter; + // The expr of the upper_bound + PrimExpr upper_bound; + // The size of the iter, which is the number of nodes + size_t expr_size = 0; + + IterConstraint(PrimExpr iter, PrimExpr upper_bound, size_t size) + : iter(std::move(iter)), upper_bound(std::move(upper_bound)), expr_size(size) {} +}; + +/*! + * \brief Split the predicate into `(a < b) && (c < d) && ...` + * \param pred The predicate to be split. + * \return A list of pairs, each element of which are lhs and rhs of the '<' sign, + * empty if the split failed. + */ +std::vector MatchUpperBoundConstraints(PrimExpr pred) { + std::vector result; + arith::PVar lhs, rhs, rest; + for (;;) { + if ((rest && (lhs < rhs)).Match(pred)) { + result.emplace_back(lhs.Eval(), rhs.Eval(), 0); + pred = rest.Eval(); + } else if ((lhs < rhs).Match(pred)) { + result.emplace_back(lhs.Eval(), rhs.Eval(), 0); + break; + } else { + return std::vector(); + } + } + return result; +} + Array DetectIterMap(const Array& indices, const Map& input_iters, + const PrimExpr& predicate, bool require_bijective, arith::Analyzer* analyzer) { // Overall detection algorithm is divided into two steps: // - Step0: IterMapRewriter rewrites the expression to use IterMapExpr patterns. // - Step1: IterIndependenceChecker checks if the iterator are independent. + + std::vector constraints = MatchUpperBoundConstraints(predicate); + if (!is_one(predicate) && constraints.empty()) return Array(); + + // We have to make sure when we visit an iterator, all the constraints related with its successors + // in the iter var graph has been visited, where the expression of this iterator will contain the + // expression of its successor, so we sort them by their sizes. 
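+ // For example, with predicates [i3*3 + i4 < 10, i1*20 + i2*10 + i3*3 + i4 < 128]
+ // (as in test_predicate above), the shorter constraint i3*3 + i4 < 10 is rewritten
+ // first, so that the longer one can then match the fused iterator it produced.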
+ for (IterConstraint& constraint : constraints) { + constraint.expr_size = CalculateExprComplexity(constraint.iter); + } + + std::sort( + constraints.begin(), constraints.end(), + [](const IterConstraint& a, const IterConstraint& b) { return a.expr_size < b.expr_size; }); + IterMapRewriter rewriter(analyzer, input_iters); + // Step0.0: rewrite constraints in the order from size-small ones to size-big ones + for (const IterConstraint& constraint : constraints) { + PrimExpr res = rewriter.RewriteIterConstraint(constraint.iter, constraint.upper_bound); + if (rewriter.unresolved_count() != 0) return Array(); + } + if (!rewriter.CheckConstraints()) return Array(); + // Step0.1: rewrite indices Array results; - for (PrimExpr value : indices) { results.push_back(rewriter.Rewrite(value)); if (rewriter.unresolved_count() != 0) return Array(); } - if (!rewriter.CheckBijective(results)) return Array(); + // Step1: IterIndependenceChecker checks if the iterator are independent. + if (!rewriter.CheckMapping(results, require_bijective)) return Array(); return results; } TVM_REGISTER_GLOBAL("arith.DetectIterMap") - .set_body_typed([](const Array& indices, const Map& input_iters) { + .set_body_typed([](const Array& indices, const Map& input_iters, + const PrimExpr& input_pred, bool is_bijective) { arith::Analyzer ana; - return DetectIterMap(indices, input_iters, &ana); + return DetectIterMap(indices, input_iters, input_pred, is_bijective, &ana); }); PrimExpr IterMapRewriter::VisitExpr_(const VarNode* op) { @@ -675,7 +940,7 @@ PrimExpr IterMapRewriter::VisitExpr_(const FloorDivNode* op) { if (a->IsInstance()) { IterSumExpr ret = Downcast(a); - if (auto opt = TryFuseIters(ret)) { + if (Optional opt = TryFuseIters(ret)) { return SplitFloorDivConst(opt.value(), b, GetRef(op)); } else { ++unresolved_count_; @@ -750,7 +1015,7 @@ PrimExpr IterMapRewriter::VisitExpr_(const FloorModNode* op) { if (a->IsInstance()) { IterSumExpr ret = Downcast(a); - if (auto opt = TryFuseIters(ret)) { + if (Optional opt = TryFuseIters(ret)) { return SplitFloorModConst(opt.value(), b, GetRef(op)); } else { ++unresolved_count_; diff --git a/src/arith/solve_linear_inequality.cc b/src/arith/solve_linear_inequality.cc index dd90448335464..6aad5b7b0a256 100644 --- a/src/arith/solve_linear_inequality.cc +++ b/src/arith/solve_linear_inequality.cc @@ -39,58 +39,9 @@ namespace arith { using namespace tvm::runtime; using namespace tvm::tir; -#define PLUS_ONE(OP) \ - void VisitExpr_(const OP* op) final { num_symbols_++; } - -#define PLUS_ONE_BINARY(OP) \ - void VisitExpr_(const OP* op) final { \ - num_symbols_++; \ - VisitExpr(op->a); \ - VisitExpr(op->b); \ - } - -/*! - * \brief Calculate the expresion complexity based on number of symbols it contains. 
- */ -class ExprComplexity : public ExprVisitor { - public: - size_t Eval(const PrimExpr& expr) { - VisitExpr(expr); - return num_symbols_; - } - - PLUS_ONE_BINARY(AddNode) - PLUS_ONE_BINARY(SubNode) - PLUS_ONE_BINARY(MulNode) - PLUS_ONE_BINARY(DivNode) - PLUS_ONE_BINARY(ModNode) - PLUS_ONE_BINARY(FloorDivNode) - PLUS_ONE_BINARY(FloorModNode) - PLUS_ONE_BINARY(MinNode) - PLUS_ONE_BINARY(MaxNode) - PLUS_ONE_BINARY(EQNode) - PLUS_ONE_BINARY(NENode) - PLUS_ONE_BINARY(LTNode) - PLUS_ONE_BINARY(LENode) - PLUS_ONE_BINARY(GTNode) - PLUS_ONE_BINARY(GENode) - PLUS_ONE_BINARY(AndNode) - PLUS_ONE_BINARY(OrNode) - PLUS_ONE(VarNode) - PLUS_ONE(FloatImmNode) - PLUS_ONE(IntImmNode) - void VisitExpr_(const NotNode* op) final { - num_symbols_++; - VisitExpr(op->a); - } - - private: - size_t num_symbols_{0}; -}; - struct ExprLess { bool operator()(const PrimExpr& l, const PrimExpr& r) const { - return ExprComplexity().Eval(l) < ExprComplexity().Eval(r); + return CalculateExprComplexity(l) < CalculateExprComplexity(r); } }; diff --git a/tests/python/unittest/test_arith_iter_affine_map.py b/tests/python/unittest/test_arith_iter_affine_map.py index 6ab61fdd95927..ac05809449bd0 100644 --- a/tests/python/unittest/test_arith_iter_affine_map.py +++ b/tests/python/unittest/test_arith_iter_affine_map.py @@ -19,13 +19,13 @@ from tvm import te -def ifuse(inputs): +def ifuse(inputs, pred_extent=None): """Fuse iterators""" value, extent = 0, 1 for i, ext in inputs: value = value * ext + i extent = extent * ext - return (value, extent) + return value, extent if pred_extent is None else pred_extent def isplit(axis, factor): @@ -67,7 +67,9 @@ def test_trivial(): assert_iter_sum_pattern(res[2], 1, 3) res = tvm.arith.detect_iter_map([x[0], 3], var_dom([x, y])) - assert len(res) == 0 + assert len(res) == 2 + assert_iter_sum_pattern(res[0], 3, 0) + assert_iter_sum_pattern(res[1], 1, 3) # not independent res = tvm.arith.detect_iter_map([x[0], x[0], 3], var_dom([x, y])) @@ -79,8 +81,6 @@ def test_fuse(): y = tvm.tir.Var("y", "int32") c = tvm.tir.SizeVar("c", "int32") c0 = tvm.tir.SizeVar("c0", "int32") - c1 = tvm.tir.SizeVar("c1", "int32") - c2 = tvm.tir.SizeVar("c1", "int32") res = tvm.arith.detect_iter_map([y * 3 + 1 + c + x], var_dom([(x, 3), (y, 4)])) assert len(res) == 1 @@ -121,10 +121,8 @@ def test_fuse(): def test_split(): x = tvm.tir.Var("x", "int32") y = tvm.tir.Var("y", "int32") - z = tvm.tir.Var("y", "int32") c0 = tvm.tir.SizeVar("c0", "int32") c1 = tvm.tir.SizeVar("c1", "int32") - c2 = tvm.tir.SizeVar("c1", "int32") fld = tvm.tir.floordiv flm = tvm.tir.floormod @@ -196,8 +194,100 @@ def test_compound(): tvm.ir.assert_structural_equal(sz, res[0]) +def test_predicate(): + x = tvm.tir.Var("x", "int32"), 13 + y = tvm.tir.Var("y", "int32"), 10 + + res = tvm.arith.detect_iter_map([x[0] * 10 + y[0]], var_dom([x, y]), x[0] * 10 + y[0] < 128) + + assert len(res) == 1 + assert_iter_sum_pattern(res[0], 128, 0) + + # duplicate constraint + res = tvm.arith.detect_iter_map( + [x[0] * 10 + y[0]], + var_dom([x, y]), + tvm.tir.all(x[0] * 10 + y[0] < 128, x[0] * 10 + y[0] < 64), + ) + + assert len(res) == 1 + assert_iter_sum_pattern(res[0], 64, 0) + + # useless constraint + res = tvm.arith.detect_iter_map([x[0] * 10 + y[0]], var_dom([x, y]), x[0] * 10 + y[0] < 140) + + assert len(res) == 1 + assert_iter_sum_pattern(res[0], 130, 0) + + i1 = tvm.tir.Var("i1", "int32"), 7 + i2 = tvm.tir.Var("i2", "int32"), 2 + i3 = tvm.tir.Var("i3", "int32"), 4 + i4 = tvm.tir.Var("i4", "int32"), 3 + res = tvm.arith.detect_iter_map( + [i1[0] * 20 + i2[0] 
* 10 + i3[0] * 3 + i4[0]], + var_dom([i1, i2, i3, i4]), + ( + tvm.tir.all( + i1[0] * 2 + i2[0] < 13, + i1[0] * 20 + i2[0] * 10 + i3[0] * 3 + i4[0] < 128, + i3[0] * 3 + i4[0] < 10, + ) + ), + ) + assert len(res) == 1 + assert_iter_sum_pattern(res[0], 128, 0) + + i1 = tvm.tir.Var("i1", "int32"), 7 + i2 = tvm.tir.Var("i2", "int32"), 2 + i3 = tvm.tir.Var("i3", "int32"), 4 + i4 = tvm.tir.Var("i4", "int32"), 3 + + # wrong constraint + res = tvm.arith.detect_iter_map( + [i1[0] * 20 + i2[0] * 10 + i3[0] * 3 + i4[0]], + var_dom([i1, i2, i3, i4]), + ( + tvm.tir.all( + i1[0] * 2 + i2[0] < 13, + i1[0] * 20 + i2[0] * 10 + i3[0] * 3 + i4[0] < 128, + i3[0] * 3 + i4[0] < 7, + ) + ), + ) + assert len(res) == 0 + + # incompatible constraint + res = tvm.arith.detect_iter_map( + [i1[0] * 20 + i2[0] * 10 + i3[0] * 3 + i4[0]], + var_dom([i1, i2, i3, i4]), + ( + tvm.tir.all( + i1[0] * 2 + i2[0] < 13, + i1[0] * 20 + i2[0] * 10 + i3[0] * 3 + i4[0] < 128, + i3[0] * 3 + i4[0] < 10, + i1[0] * 4 + i3[0] < 20, + ) + ), + ) + assert len(res) == 0 + + res = tvm.arith.detect_iter_map( + [i1[0] * 20 + i2[0] * 10 + i3[0] * 3 + i4[0]], + var_dom([i1, i2, i3, i4]), + ( + tvm.tir.all( + i1[0] * 2 + i2[0] < 13, + i1[0] * 20 + i2[0] * 10 + i3[0] * 3 + i4[0] < 128, + i1[0] * 4 + i3[0] < 20, + ) + ), + ) + assert len(res) == 0 + + if __name__ == "__main__": test_split() test_trivial() test_fuse() test_compound() + test_predicate() From 2988a08e3ff4a8956ac9b23e662374f6d8f7f4d9 Mon Sep 17 00:00:00 2001 From: Chris Hoge Date: Sat, 27 Mar 2021 05:38:48 -0700 Subject: [PATCH 77/84] [docs] Getting Started: Introduction and Installation (#7638) * Getting Started: Introduction and Installation The first two sections of the "Getting Started with TVM" guide. A high level introduction to TVM, and a slight introduction on installation options. Co-authored-by: Tianqi Chen --- docs/conf.py | 2 + docs/index.rst | 2 +- tutorials/get_started/README.txt | 4 +- tutorials/get_started/install.py | 49 ++++++++++ tutorials/get_started/introduction.py | 132 ++++++++++++++++++++++++++ 5 files changed, 186 insertions(+), 3 deletions(-) create mode 100644 tutorials/get_started/install.py create mode 100644 tutorials/get_started/introduction.py diff --git a/docs/conf.py b/docs/conf.py index 5ed244e585d0e..85ce4a6e0663d 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -210,6 +210,8 @@ # The unlisted files always appear after listed files. within_subsection_order = { "get_started": [ + "introduction.py", + "install.py", "tvmc_command_line_driver.py", "tensor_expr_get_started.py", "autotvm_matmul.py", diff --git a/docs/index.rst b/docs/index.rst index 3131be5381fcc..323fb2a9d3130 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -25,7 +25,7 @@ Get Started ----------- - Follow the :doc:`instructions ` to install TVM. -- Checkout the :doc:`tutorials `. +- Checkout the :doc:`Getting Started with TVM Tutorial `. For Developers -------------- diff --git a/tutorials/get_started/README.txt b/tutorials/get_started/README.txt index 41b1b44055f17..aa6c559c1b385 100644 --- a/tutorials/get_started/README.txt +++ b/tutorials/get_started/README.txt @@ -1,2 +1,2 @@ -Get Started Tutorials ---------------------- +Getting Started With TVM +------------------------ diff --git a/tutorials/get_started/install.py b/tutorials/get_started/install.py new file mode 100644 index 0000000000000..a64d3cfd3b8cc --- /dev/null +++ b/tutorials/get_started/install.py @@ -0,0 +1,49 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Installing TVM
+==============
+**Authors**:
+`Jocelyn Shiue `_,
+`Chris Hoge `_
+
+Depending on your needs and your working environment, there are a few different
+methods for installing TVM. These include:
+ * Installing from source
+ * Installing from third-party binary packages.
+"""
+
+################################################################################
+# Installing from Source
+# ----------------------
+# Installing from source is the recommended method for installing TVM. It will
+# allow you to enable specific features such as GPU support, microcontroller
+# support (uTVM), a debugging runtime, and other features. You will also
+# want to install from source if you want to actively contribute to the TVM
+# project. The full instructions are on the `Install TVM From Source
+# `_ page.

+################################################################################
+# Installing From Binary Packages
+# --------------------------------
+# You may install convenient third-party binary package distributions to
+# quickly try things out. TLCPack is a third-party volunteer community that
+# builds binary packages from TVM source. It offers a support matrix with
+# instructions for installing on different platforms, with different features.
+# Check out `TLCPack `_ to learn more. Note that the
+# third-party binary packages could contain additional licensing terms for
+# the hardware drivers that are bundled with them.
diff --git a/tutorials/get_started/introduction.py b/tutorials/get_started/introduction.py
new file mode 100644
index 0000000000000..0ee79d334c037
--- /dev/null
+++ b/tutorials/get_started/introduction.py
@@ -0,0 +1,132 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Introduction
+============
+**Authors**:
+`Jocelyn Shiue `_,
+`Chris Hoge `_
+
+Apache TVM is an open source machine learning compiler framework for CPUs,
+GPUs, and machine learning accelerators. It aims to enable machine learning
+engineers to optimize and run computations efficiently on any hardware backend.
+The purpose of this tutorial is to take a guided tour through all of the major
+features of TVM by defining and demonstrating key concepts. A new user should
+be able to work through the tutorial from start to finish and be able to
+operate TVM for automatic model optimization, while having a basic
+understanding of the TVM architecture and how it works.
+
+Contents
+--------
+
+#. :doc:`Introduction `
+#. :doc:`Installing TVM `
+#. :doc:`Compiling and Optimizing a Model with TVMC `
+#. :doc:`Compiling and Optimizing a Model with the Python AutoScheduler `
+#. :doc:`Working with Operators Using Tensor Expressions `
+#. :doc:`Optimizing Operators with Templates and AutoTVM `
+#. :doc:`Optimizing Operators with AutoScheduling `
+#. :doc:`Cross Compilation and Remote Procedure Calls (RPC) `
+#. :doc:`Compiling Deep Learning Models for GPUs `
+"""
+
+################################################################################
+# An Overview of TVM and Model Optimization
+# =========================================
+#
+# The diagram below illustrates the steps a machine learning model takes as it
+# is transformed with the TVM optimizing compiler framework.
+#
+# .. image:: https://raw.githubusercontent.com/hogepodge/web-data/c339ebbbae41f3762873147c1e920a53a08963dd/images/getting_started/overview.png
+# :width: 100%
+# :alt: A High Level View of TVM
+#
+# 1. Import the model from a framework like *Tensorflow*, *Pytorch*, or *Onnx*.
+# The importer layer is where TVM can ingest models from other frameworks, like
+# ONNX, Tensorflow, or PyTorch. The level of support that TVM offers for each
+# frontend varies as we are constantly improving the open source project. If
+# you're having issues importing your model into TVM, you may want to try
+# converting it to ONNX.
+#
+# 2. Translate to *Relay*, TVM's high level model language.
+# A model that has been imported into TVM is represented in Relay. Relay is a
+# functional language and intermediate representation (IR) for neural networks.
+# It has support for:
+#
+# - Traditional data flow-style representations
+# - Functional-style scoping and let-binding, which make it a fully featured
+# differentiable language
+# - Ability to allow the user to mix the two programming styles
+#
+# Relay applies several high-level optimizations to the model, after which
+# it runs the Relay Fusion Pass. To aid in the process of converting to
+# Relay, TVM includes a Tensor Operator Inventory (TOPI) that has pre-defined
+# templates of common computations.
+#
+# 3. Lower to *Tensor Expression* (TE) representation. Lowering is when a
+# higher-level representation is transformed into a lower-level
+# representation. In the Relay Fusion Pass, the model is lowered from the
+# higher-level Relay representation into a smaller set of subgraphs, where
+# each node is a task. A task is a collection of computation templates,
+# expressed in TE, where the parameters of the template can control how
+# the computation is carried out on hardware. The specific ordering of computation,
+# defined by parameters to the TE template, is called a schedule.
+#
+# 4. Search for an optimized schedule using *AutoTVM* or *AutoScheduler* for each
+# task through tuning. Tuning is the process of searching the TE parameter
+# space for a schedule that is optimized for the target hardware. There are a
+# couple of optimization options available, each requiring varying levels of
+# user interaction. The optimization options include:
+#
+# - **AutoTVM**: The user specifies a search template for the schedule of a TE task,
+# or TE subgraph. AutoTVM directs the search of the parameter space defined by the
+# template to produce an optimized configuration. AutoTVM requires users to
+# manually define templates for each operator as part of the TOPI.
+# - **Ansor/AutoSchedule**: Using a TVM Operator Inventory (TOPI) of operations,
+# Ansor can automatically search an optimization space with much less
+# intervention and guidance from the end user. Ansor depends on TE templates to
+# guide the search.
+#
+# 5. Choose the optimal configuration for the model. After tuning, an optimal schedule
+# for each task is chosen. Regardless of whether it is AutoTVM or AutoSchedule,
+# schedule records in JSON format are produced that are referred to by this step
+# to build an optimized model.
+#
+# 6. Lower to a hardware-specific compiler. After selecting an optimized configuration
+# based on the tuning step, the model is then lowered to a representation
+# expected by the target compiler for the hardware platform. This is the
+# final code generation phase, with the intention of producing an optimized
+# model that can be deployed into production. TVM supports a number of
+# different compiler backends, including:
+#
+# - LLVM, which can target arbitrary microprocessor architectures, including
+# standard x86 and ARM processors, AMDGPU and NVPTX code generation, and any
+# other platform supported by LLVM.
+# - Specialized compilers, such as NVCC, NVIDIA's compiler.
+# - Embedded and specialized targets, which are implemented through TVM's
+# Bring Your Own Codegen (BYOC) framework.
+#
+# 7. Compile down to machine code. At the end of this process, the
+# compiler-specific generated code can be lowered to machine code.
+#
+# TVM can compile models down to a linkable object module, which can then be
+# run with a lightweight TVM runtime that provides C APIs to dynamically
+# load the model, and entry points for other languages such as Python and
+# Rust. TVM can also build a bundled deployment in which the runtime is
+# combined with the model in a single package.
+#
+# The remainder of the tutorial will cover these aspects of TVM in more detail.
From 18fa4958319b9d7ec579ef68feb0d8ad98bfe17e Mon Sep 17 00:00:00 2001
From: Junru Shao
Date: Sun, 28 Mar 2021 04:53:28 -0700
Subject: [PATCH 78/84] [Logging] Bring back the stack size optimization
 (#7756)

---
 include/tvm/runtime/logging.h     | 103 +++++++++++++++++++++---------
 include/tvm/runtime/packed_func.h |  12 +---
 src/runtime/logging.cc            |  14 ++++
 3 files changed, 90 insertions(+), 39 deletions(-)

diff --git a/include/tvm/runtime/logging.h b/include/tvm/runtime/logging.h
index 08c6ff61252bd..8a69c2ad77490 100644
--- a/include/tvm/runtime/logging.h
+++ b/include/tvm/runtime/logging.h
@@ -35,18 +35,37 @@
 #include
 #include
 #include
+#include
 #include
 #include

 /*!
- * \brief Macro helper for exception throwing.
+ * \brief Macro helper to force a function not to be inlined.
+ * It is only used in places where we know not inlining is desirable,
+ * e.g. some logging functions.
+ */
+#if defined(_MSC_VER)
+#define TVM_NO_INLINE __declspec(noinline)
+#else
+#define TVM_NO_INLINE __attribute__((noinline))
+#endif
+
+/*!
+ * \brief Macro helper to force a function to be inlined.
+ * It is only used in places where we know inlining is important,
+ * e.g. some template expansion cases.
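+ * For example, the PackedFunc argument handling helpers in packed_func.h,
+ * which previously defined this macro for themselves, rely on TVM_ALWAYS_INLINE.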
*/ #ifdef _MSC_VER -#define TVM_THROW_EXCEPTION noexcept(false) __declspec(noreturn) +#define TVM_ALWAYS_INLINE __forceinline #else -#define TVM_THROW_EXCEPTION noexcept(false) +#define TVM_ALWAYS_INLINE inline __attribute__((always_inline)) #endif +/*! + * \brief Macro helper for exception throwing. + */ +#define TVM_THROW_EXCEPTION noexcept(false) + /*! * \brief Whether or not enable backtrace logging during a * fatal error. @@ -181,7 +200,7 @@ class Error : public ::dmlc::Error { // for backwards compatibility /*! * \brief Error type for errors from CHECK, ICHECK, and LOG(FATAL). This error - * contains a backtrace of where it occured. + * contains a backtrace of where it occurred. */ class InternalError : public Error { public: @@ -291,24 +310,36 @@ class LogMessage { /*! * \brief Class to accumulate an error message and throw it. Do not use * directly, instead use LOG(FATAL). + * \note The `LogFatal` class is designed to be an empty class to reduce stack size usage. + * To play this trick, we use the thread-local storage to store its internal data. */ class LogFatal { public: - LogFatal(const std::string& file, int lineno) : file_(file), lineno_(lineno) {} + TVM_NO_INLINE LogFatal(const char* file, int lineno) { GetEntry().Init(file, lineno); } #ifdef _MSC_VER #pragma disagnostic push #pragma warning(disable : 4722) #endif - ~LogFatal() noexcept(false) { throw InternalError(file_, lineno_, stream_.str()); } + ~LogFatal() TVM_THROW_EXCEPTION { GetEntry().Finalize(); } #ifdef _MSC_VER #pragma disagnostic pop #endif - std::ostringstream& stream() { return stream_; } + std::ostringstream& stream() { return GetEntry().stream_; } private: - std::ostringstream stream_; - std::string file_; - int lineno_; + struct Entry { + void Init(const char* file, int lineno) { + this->stream_.str(""); + this->file_ = file; + this->lineno_ = lineno; + } + TVM_NO_INLINE dmlc::Error Finalize() { throw InternalError(file_, lineno_, stream_.str()); } + std::ostringstream stream_; + std::string file_; + int lineno_; + }; + + TVM_DLL TVM_NO_INLINE static Entry& GetEntry(); }; /*! @@ -322,7 +353,7 @@ class LogMessage { stream_ << "[" << std::put_time(std::localtime(&t), "%H:%M:%S") << "] " << file << ":" << lineno << ": "; } - ~LogMessage() { std::cerr << stream_.str() << std::endl; } + TVM_NO_INLINE ~LogMessage() { std::cerr << stream_.str() << std::endl; } std::ostringstream& stream() { return stream_; } private: @@ -361,19 +392,33 @@ inline bool DebugLoggingEnabled() { } constexpr const char* kTVM_INTERNAL_ERROR_MESSAGE = + "\n" "---------------------------------------------------------------\n" "An internal invariant was violated during the execution of TVM.\n" "Please read TVM's error reporting guidelines.\n" "More details can be found here: https://discuss.tvm.ai/t/error-reporting/7793.\n" "---------------------------------------------------------------\n"; -// Inline _Pragma in macros does not work reliably on old version of MVSC and +template +std::unique_ptr LogCheckFormat(const X& x, const Y& y) { + std::ostringstream os; + os << " (" << x << " vs. " << y << ") "; // CHECK_XX(x, y) requires x and y can be serialized to + // string. Use CHECK(x OP y) otherwise. + // no std::make_unique until c++14 + return std::unique_ptr(new std::string(os.str())); +} + +// Inline _Pragma in macros does not work reliably on old version of MSVC and // GCC. We wrap all comparisons in a function so that we can use #pragma to // silence bad comparison warnings. 
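// For example, a failing ICHECK_LT(i, n) calls LogCheck_LT(i, n), which
// returns nullptr on success; only on failure does LogCheckFormat build the
// " (lhs vs. rhs) " detail string that is appended to the fatal message.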
-#define TVM_CHECK_FUNC(name, op) \ - template \ - DMLC_ALWAYS_INLINE bool LogCheck##name(const A& a, const B& b) { \ - return a op b; \ +#define TVM_CHECK_FUNC(name, op) \ + template \ + TVM_ALWAYS_INLINE std::unique_ptr LogCheck##name(const X& x, const Y& y) { \ + if (x op y) return nullptr; \ + return LogCheckFormat(x, y); \ + } \ + TVM_ALWAYS_INLINE std::unique_ptr LogCheck##name(int x, int y) { \ + return LogCheck##name(x, y); \ } #pragma GCC diagnostic push @@ -390,18 +435,18 @@ TVM_CHECK_FUNC(_NE, !=) #define LOG(level) LOG_##level #define LOG_FATAL ::tvm::runtime::detail::LogFatal(__FILE__, __LINE__).stream() #define LOG_INFO ::tvm::runtime::detail::LogMessage(__FILE__, __LINE__).stream() -#define LOG_ERROR (::tvm::runtime::detail::LogMessage(__FILE__, __LINE__).stream() << "error: ") -#define LOG_WARNING (::tvm::runtime::detail::LogMessage(__FILE__, __LINE__).stream() << "warning: ") +#define LOG_ERROR (::tvm::runtime::detail::LogMessage(__FILE__, __LINE__).stream() << "Error: ") +#define LOG_WARNING (::tvm::runtime::detail::LogMessage(__FILE__, __LINE__).stream() << "Warning: ") -#define TVM_CHECK_BINARY_OP(name, op, x, y) \ - if (!::tvm::runtime::detail::LogCheck##name(x, y)) \ - ::tvm::runtime::detail::LogFatal(__FILE__, __LINE__).stream() \ - << "Check failed: " << #x " " #op " " #y << ": " +#define TVM_CHECK_BINARY_OP(name, op, x, y) \ + if (auto __tvm__log__err = ::tvm::runtime::detail::LogCheck##name(x, y)) \ + ::tvm::runtime::detail::LogFatal(__FILE__, __LINE__).stream() \ + << "Check failed: " << #x " " #op " " #y << *__tvm__log__err << ": " #define CHECK(x) \ if (!(x)) \ ::tvm::runtime::detail::LogFatal(__FILE__, __LINE__).stream() \ - << "Check failed: " #x << " == false: " + << "Check failed: (" #x << ") is false: " #define CHECK_LT(x, y) TVM_CHECK_BINARY_OP(_LT, <, x, y) #define CHECK_GT(x, y) TVM_CHECK_BINARY_OP(_GT, >, x, y) @@ -462,17 +507,17 @@ TVM_CHECK_FUNC(_NE, !=) #define TVM_ICHECK_INDENT " " -#define ICHECK_BINARY_OP(name, op, x, y) \ - if (!::tvm::runtime::detail::LogCheck##name(x, y)) \ - ::tvm::runtime::detail::LogFatal(__FILE__, __LINE__).stream() \ - << ::tvm::runtime::detail::kTVM_INTERNAL_ERROR_MESSAGE << std::endl \ - << TVM_ICHECK_INDENT << "Check failed: " << #x " " #op " " #y << ": " +#define ICHECK_BINARY_OP(name, op, x, y) \ + if (auto __tvm__log__err = ::tvm::runtime::detail::LogCheck##name(x, y)) \ + ::tvm::runtime::detail::LogFatal(__FILE__, __LINE__).stream() \ + << ::tvm::runtime::detail::kTVM_INTERNAL_ERROR_MESSAGE << std::endl \ + << TVM_ICHECK_INDENT << "Check failed: " << #x " " #op " " #y << *__tvm__log__err << ": " #define ICHECK(x) \ if (!(x)) \ ::tvm::runtime::detail::LogFatal(__FILE__, __LINE__).stream() \ << ::tvm::runtime::detail::kTVM_INTERNAL_ERROR_MESSAGE << TVM_ICHECK_INDENT \ - << "Check failed: " #x << " == false: " + << "Check failed: (" #x << ") is false: " #define ICHECK_LT(x, y) ICHECK_BINARY_OP(_LT, <, x, y) #define ICHECK_GT(x, y) ICHECK_BINARY_OP(_GT, >, x, y) diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h index 65f7e76162742..58bd2859c10a5 100644 --- a/include/tvm/runtime/packed_func.h +++ b/include/tvm/runtime/packed_func.h @@ -46,14 +46,6 @@ #define TVM_RUNTIME_HEADER_ONLY 0 #endif -// Always inline macro only use in template -// expansion cases where we know inline is important. 
-#ifdef _MSC_VER -#define TVM_ALWAYS_INLINE __forceinline -#else -#define TVM_ALWAYS_INLINE inline __attribute__((always_inline)) -#endif - namespace tvm { namespace runtime { @@ -743,7 +735,7 @@ class TVMRetValue : public TVMPODValue_ { /*! \brief default constructor */ TVMRetValue() {} /*! - * \brief move constructor from anoter return value. + * \brief move constructor from another return value. * \param other The other return value. */ TVMRetValue(TVMRetValue&& other) : TVMPODValue_(other.value_, other.type_code_) { @@ -1119,7 +1111,7 @@ struct PackedFuncValueConverter { * }); * * // The following code will cause compilation error. - * // Because the same Function and ExortName + * // Because the same Function and ExportName * // TVM_DLL_EXPORT_TYPED_FUNC(AddOne_, AddOne_); * * // The following code is OK, assuming the macro diff --git a/src/runtime/logging.cc b/src/runtime/logging.cc index 227814bcddd48..8f0537ad7adca 100644 --- a/src/runtime/logging.cc +++ b/src/runtime/logging.cc @@ -160,3 +160,17 @@ std::string Backtrace() { return ""; } } // namespace runtime } // namespace tvm #endif // TVM_LOG_STACK_TRACE + +#if (TVM_LOG_CUSTOMIZE == 0) +namespace tvm { +namespace runtime { +namespace detail { + +LogFatal::Entry& LogFatal::GetEntry() { + static thread_local LogFatal::Entry result; + return result; +} +} // namespace detail +} // namespace runtime +} // namespace tvm +#endif // TVM_LOG_CUSTOMIZE From c39a6e25d59d5a750c8c2e48da589c3189eb4069 Mon Sep 17 00:00:00 2001 From: "Matt Welsh (OctoML)" <63477620+mdw-octoml@users.noreply.github.com> Date: Sun, 28 Mar 2021 21:06:15 -0700 Subject: [PATCH 79/84] Clean up uTVM demo runtime, add ONNX model test and tutorial (#7557) * Some docstring fixes. * Couple of small fixes: - Use `west attach` instead of `west debug` in commandline to prevent debugger from resetting device. - Fix warning on use of led_pin in zephyr-runtime/src/main.c. * Adding Zephyr demo runtime. * Cleanup of uTVM tests and demo runtime. * Working on QEMU support. Need to add board-specific prj.conf files. * Adding board-specific prj.conf files. * Some cleanup. * Lots of hacking to get ONNX model to run on QEMU and nRF5340. Added test_onnx unit test. Still need to clean up tutorial. * Adding data for unit tests. * Cleanup demo_runtime code. * Fix up tutorial. * Couple of small fixes: - Use `west attach` instead of `west debug` in commandline to prevent debugger from resetting device. - Fix warning on use of led_pin in zephyr-runtime/src/main.c. * Adding Zephyr demo runtime. * Cleanup of uTVM tests and demo runtime. * Working on QEMU support. Need to add board-specific prj.conf files. * Adding board-specific prj.conf files. * Some cleanup. * Lots of hacking to get ONNX model to run on QEMU and nRF5340. Added test_onnx unit test. Still need to clean up tutorial. * Lots of hacking to get ONNX model to run on QEMU and nRF5340. Added test_onnx unit test. Still need to clean up tutorial. * Adding data for unit tests. * Cleanup demo_runtime code. * Fix up tutorial. * Fix tutorial. * Fix tutorial and runtime. * Fix merge conflicts. * Fix merge conflict. * Remove redundant files. * Revert dep. * Fixup * Add new files to check_file_type.py. * Adding missing ONNX file. * Fixup docs. * Fix linting rule. * small fixes * Add missing file to check_file_type.py. * clang-format this file. * Fix formatting. * Black formatting. * Lint comments. * Fix path for test. * Bump CI. * Update from_onnx. 
* fix path * Fixing * Revert dmlc-core to 21cc7de0dc9fd6acb796e1be6181fa8e6b6c8f41 * Fix path again. * Fix tutorial to not use actual Zephyr. * Revert submodule version change * Fix bad merge. * Trying to fix this mess. * Fix formatting. * context -> device * Removing tutorial since I can't get it to pass CI. Co-authored-by: Mehrdad Hessar Co-authored-by: Andrew Reusch --- apps/microtvm/README.md | 17 +- .../{reference-vm/zephyr => }/pyproject.toml | 10 +- apps/microtvm/zephyr/README.md | 19 ++ .../zephyr/demo_runtime}/CMakeLists.txt | 1 - apps/microtvm/zephyr/demo_runtime/README.md | 21 ++ .../boards/nrf5340dk_nrf5340_cpuapp.conf | 31 +++ .../demo_runtime/boards/nucleo_f746zg.conf | 30 +++ .../zephyr/demo_runtime/boards/qemu_x86.conf | 11 +- .../zephyr/demo_runtime}/crt/crt_config.h | 2 +- .../microtvm/zephyr/demo_runtime}/prj.conf | 7 +- .../demo_runtime}/qemu-hack/qemu-system-i386 | 0 .../microtvm/zephyr/demo_runtime}/src/main.c | 222 +++++++++++------- docs/microtvm/index.rst | 2 +- python/tvm/micro/contrib/zephyr.py | 5 +- python/tvm/runtime/module.py | 2 +- python/tvm/target/target.py | 20 +- tests/lint/check_file_type.py | 13 +- tests/micro/qemu/.gitignore | 2 - tests/micro/qemu/zephyr-runtime/.gitignore | 3 - tests/micro/zephyr/README.md | 42 ++++ tests/micro/{qemu => zephyr}/conftest.py | 3 + tests/micro/{qemu => zephyr}/test_zephyr.py | 67 +++++- tests/micro/zephyr/testdata/digit-2.jpg | Bin 0 -> 572 bytes tests/micro/zephyr/testdata/digit-9.jpg | Bin 0 -> 535 bytes tests/micro/zephyr/testdata/mnist-8.onnx | Bin 0 -> 26454 bytes tests/scripts/task_python_microtvm.sh | 2 +- tutorials/micro/micro_tflite.py | 7 +- 27 files changed, 401 insertions(+), 138 deletions(-) rename apps/microtvm/{reference-vm/zephyr => }/pyproject.toml (95%) create mode 100644 apps/microtvm/zephyr/README.md rename {tests/micro/qemu/zephyr-runtime => apps/microtvm/zephyr/demo_runtime}/CMakeLists.txt (99%) create mode 100644 apps/microtvm/zephyr/demo_runtime/README.md create mode 100644 apps/microtvm/zephyr/demo_runtime/boards/nrf5340dk_nrf5340_cpuapp.conf create mode 100644 apps/microtvm/zephyr/demo_runtime/boards/nucleo_f746zg.conf rename tests/micro/qemu/zephyr-runtime/sample.yaml => apps/microtvm/zephyr/demo_runtime/boards/qemu_x86.conf (78%) rename {tests/micro/qemu/zephyr-runtime => apps/microtvm/zephyr/demo_runtime}/crt/crt_config.h (98%) rename {tests/micro/qemu/zephyr-runtime => apps/microtvm/zephyr/demo_runtime}/prj.conf (87%) rename {tests/micro/qemu/zephyr-runtime => apps/microtvm/zephyr/demo_runtime}/qemu-hack/qemu-system-i386 (100%) rename {tests/micro/qemu/zephyr-runtime => apps/microtvm/zephyr/demo_runtime}/src/main.c (56%) delete mode 100644 tests/micro/qemu/.gitignore delete mode 100644 tests/micro/qemu/zephyr-runtime/.gitignore create mode 100644 tests/micro/zephyr/README.md rename tests/micro/{qemu => zephyr}/conftest.py (94%) rename tests/micro/{qemu => zephyr}/test_zephyr.py (81%) create mode 100644 tests/micro/zephyr/testdata/digit-2.jpg create mode 100644 tests/micro/zephyr/testdata/digit-9.jpg create mode 100644 tests/micro/zephyr/testdata/mnist-8.onnx diff --git a/apps/microtvm/README.md b/apps/microtvm/README.md index 97b844a4c01ba..362bc407238e7 100644 --- a/apps/microtvm/README.md +++ b/apps/microtvm/README.md @@ -15,14 +15,17 @@ -# microTVM Reference Virtual Machines +# microTVM +microTVM is the effort that allows TVM to build and execute models on bare-metal microcontrollers. -microTVM is the effort to allow TVM to build and execute models on bare-metal microcontrollers. 
-These Virtual Machines are used to reproduce results and bugs when using microTVM with real -physical hardware. Note that they are not used to run Continuous Integration regression tests-- -those are instead run by the QEMU container (they run against an emulator, rather than real -hardware). +The `pyproject.toml` file in this directory can be used to create a +[Poetry](https://python-poetry.org/) Python environment with all of the required +dependencies installed for running microTVM. To use it, run: + +``` +$ poetry lock && poetry install +$ poetry shell +``` -See the "microTVM Reference Virtual Machines" tutorial for information on how to use these. diff --git a/apps/microtvm/reference-vm/zephyr/pyproject.toml b/apps/microtvm/pyproject.toml similarity index 95% rename from apps/microtvm/reference-vm/zephyr/pyproject.toml rename to apps/microtvm/pyproject.toml index b4cfc544df583..8bfae0a157cdd 100644 --- a/apps/microtvm/reference-vm/zephyr/pyproject.toml +++ b/apps/microtvm/pyproject.toml @@ -15,6 +15,9 @@ # specific language governing permissions and limitations # under the License. +# This `pyproject.toml` file is used to allow MicroTVM +# to run within a Poetry-managed environment. + [tool.black] line-length = 100 target-version = ['py36'] @@ -47,12 +50,12 @@ exclude = ''' ) ''' [tool.poetry] -name = "tvm" +name = "microtvm" version = "0.1.0" description = "" -authors = ["Your Name "] +authors = [] packages = [ - { include = "tvm", from = "../../../../python" }, + { include = "tvm", from = "../../python" }, ] [tool.poetry.dependencies] @@ -67,7 +70,6 @@ typed_ast = "^1.4" pyyaml = "^5.4.1" pyserial = "^3.5" - # AutoTVM xgboost = {version = "^1.1", optional = true} diff --git a/apps/microtvm/zephyr/README.md b/apps/microtvm/zephyr/README.md new file mode 100644 index 0000000000000..ad00393c0805e --- /dev/null +++ b/apps/microtvm/zephyr/README.md @@ -0,0 +1,19 @@ + + + + + + + + + + + + + + + + + +This directory code to interface microTVM with the [Zephyr RTOS](https://zephyrproject.org/). + diff --git a/tests/micro/qemu/zephyr-runtime/CMakeLists.txt b/apps/microtvm/zephyr/demo_runtime/CMakeLists.txt similarity index 99% rename from tests/micro/qemu/zephyr-runtime/CMakeLists.txt rename to apps/microtvm/zephyr/demo_runtime/CMakeLists.txt index ce5605469fcba..a99d5edb07e64 100644 --- a/tests/micro/qemu/zephyr-runtime/CMakeLists.txt +++ b/apps/microtvm/zephyr/demo_runtime/CMakeLists.txt @@ -9,7 +9,6 @@ set(QEMU_PIPE "\${QEMU_PIPE}") # QEMU_PIPE is set by the calling TVM instance. find_package(Zephyr HINTS $ENV{ZEPHYR_BASE}) project(microtvm_zephyr_runtime) - set(CMAKE_VERBOSE_MAKEFILE ON) file(GLOB TVM_SOURCES ${CMAKE_SOURCE_DIR}/__tvm*.c) target_sources(app PRIVATE src/main.c ${TVM_SOURCES}) diff --git a/apps/microtvm/zephyr/demo_runtime/README.md b/apps/microtvm/zephyr/demo_runtime/README.md new file mode 100644 index 0000000000000..eab3f3d241a1f --- /dev/null +++ b/apps/microtvm/zephyr/demo_runtime/README.md @@ -0,0 +1,21 @@ + + + + + + + + + + + + + + + + + +This directory contains a Zephyr-based "demo" runtime environment that +pulls together the microTVM runtime dependencies into a single application +that can communicate with a Python-based host program via the UART, using +TVM's RPC protocol. 
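For orientation, a rough host-side sketch follows, modeled on the flow that
tests/micro/zephyr/test_zephyr.py (added later in this patch) uses to drive this
runtime. The board name, paths, and the exact signatures of `default_options` and
`build_static_runtime` are assumptions from this era of the `tvm.micro` API, not a
definitive recipe:

```python
# Hedged sketch of a host program driving the Zephyr demo runtime over the
# UART via TVM's RPC protocol. Paths and keyword arguments are illustrative
# assumptions; see tests/micro/zephyr/test_zephyr.py for the tested flow.
import os
import numpy as np
import tvm
import tvm.micro
from tvm import te
from tvm.micro.contrib import zephyr

# Build a trivial "add" operator for the C runtime ("host" pairs with qemu_x86).
target = tvm.target.target.micro("host")
A = te.placeholder((2,), dtype="int8", name="A")
B = te.placeholder((2,), dtype="int8", name="B")
C = te.compute(A.shape, lambda i: A[i] + B[i], name="C")
sched = te.create_schedule(C.op)
with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
    mod = tvm.build(sched, [A, B, C], target, target_host=target, name="add")

# Compile the demo runtime plus the generated operator into one firmware image.
project_dir = "apps/microtvm/zephyr/demo_runtime"
compiler = zephyr.ZephyrCompiler(
    project_dir=project_dir, board="qemu_x86", zephyr_toolchain_variant="zephyr"
)
workspace = tvm.micro.Workspace()
opts = tvm.micro.default_options(os.path.join(project_dir, "crt"))
micro_binary = tvm.micro.build_static_runtime(workspace, compiler, mod, opts)

# Flash the device (or spawn QEMU) and talk to it over the UART RPC channel.
with tvm.micro.Session(binary=micro_binary, flasher=compiler.flasher()) as sess:
    add = sess.get_system_lib().get_function("add")
    a = tvm.nd.array(np.array([2, 3], dtype="int8"), device=sess.device)
    b = tvm.nd.array(np.array([4, 7], dtype="int8"), device=sess.device)
    c = tvm.nd.array(np.zeros(2, dtype="int8"), device=sess.device)
    add(a, b, c)
```

Here the `Session` context manager is expected to flash the binary and tear the
RPC link down on exit; the operator is looked up from the system library that the
firmware statically links.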
diff --git a/apps/microtvm/zephyr/demo_runtime/boards/nrf5340dk_nrf5340_cpuapp.conf b/apps/microtvm/zephyr/demo_runtime/boards/nrf5340dk_nrf5340_cpuapp.conf new file mode 100644 index 0000000000000..149a69ea3b5bf --- /dev/null +++ b/apps/microtvm/zephyr/demo_runtime/boards/nrf5340dk_nrf5340_cpuapp.conf @@ -0,0 +1,31 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# This file is specific to the nRF5340 DK board. + +# For intrinsics used by generated optimized operators. +CONFIG_CMSIS_DSP=y + +# Required for Cortex-M33 devices. +CONFIG_MAIN_STACK_SIZE=1536 + +# For random number generation. +CONFIG_ENTROPY_GENERATOR=y +CONFIG_TEST_RANDOM_GENERATOR=y + +# For debugging. +CONFIG_LED=y diff --git a/apps/microtvm/zephyr/demo_runtime/boards/nucleo_f746zg.conf b/apps/microtvm/zephyr/demo_runtime/boards/nucleo_f746zg.conf new file mode 100644 index 0000000000000..5931377d55aee --- /dev/null +++ b/apps/microtvm/zephyr/demo_runtime/boards/nucleo_f746zg.conf @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# This file is specific to the nRF5340 DK board. + +# For intrinsics used by generated optimized operators. +CONFIG_CMSIS_DSP=y + +# Required for Cortex-M33 devices. +CONFIG_MAIN_STACK_SIZE=50 + +# For random number generation. +CONFIG_ENTROPY_GENERATOR=y + +# For debugging. +CONFIG_LED=y diff --git a/tests/micro/qemu/zephyr-runtime/sample.yaml b/apps/microtvm/zephyr/demo_runtime/boards/qemu_x86.conf similarity index 78% rename from tests/micro/qemu/zephyr-runtime/sample.yaml rename to apps/microtvm/zephyr/demo_runtime/boards/qemu_x86.conf index 88616b4acc408..e0e4ae2fb2d37 100644 --- a/tests/micro/qemu/zephyr-runtime/sample.yaml +++ b/apps/microtvm/zephyr/demo_runtime/boards/qemu_x86.conf @@ -15,8 +15,9 @@ # specific language governing permissions and limitations # under the License. -sample: - description: uTVM RPC Server unit test - name: utvm rpc server -common: - tags: introduction +# This file is specific to the QEMU-emulated microTVM board. + +# For TVMPlatformGenerateRandom(). 
Remember, these values do not need to be truly random. +CONFIG_TEST_RANDOM_GENERATOR=y +CONFIG_TIMER_RANDOM_GENERATOR=y + diff --git a/tests/micro/qemu/zephyr-runtime/crt/crt_config.h b/apps/microtvm/zephyr/demo_runtime/crt/crt_config.h similarity index 98% rename from tests/micro/qemu/zephyr-runtime/crt/crt_config.h rename to apps/microtvm/zephyr/demo_runtime/crt/crt_config.h index a7f4f90b05387..f8fc7514a28db 100644 --- a/tests/micro/qemu/zephyr-runtime/crt/crt_config.h +++ b/apps/microtvm/zephyr/demo_runtime/crt/crt_config.h @@ -59,6 +59,6 @@ /*! \brief Number of pages on device. */ #define TVM_CRT_MAX_PAGES 300 -//#define TVM_CRT_FRAMER_ENABLE_LOGS +// #define TVM_CRT_FRAMER_ENABLE_LOGS #endif // TVM_RUNTIME_CRT_CONFIG_H_ diff --git a/tests/micro/qemu/zephyr-runtime/prj.conf b/apps/microtvm/zephyr/demo_runtime/prj.conf similarity index 87% rename from tests/micro/qemu/zephyr-runtime/prj.conf rename to apps/microtvm/zephyr/demo_runtime/prj.conf index 7be42b260bbba..bf2b330e35a67 100644 --- a/tests/micro/qemu/zephyr-runtime/prj.conf +++ b/apps/microtvm/zephyr/demo_runtime/prj.conf @@ -15,6 +15,10 @@ # specific language governing permissions and limitations # under the License. +# The settings in this file are generic for all boards, and are merged +# with the settings in the file boards/.conf by the Zephyr build +# process. + # For UART implementation in main(). CONFIG_RING_BUFFER=y CONFIG_UART_CONSOLE=n @@ -30,6 +34,3 @@ CONFIG_FPU=y # For TVMPlatformAbort(). CONFIG_REBOOT=y -# For TVMPlatformGenerateRandom(). Remember, these values do not need to be truly random. -CONFIG_TEST_RANDOM_GENERATOR=y -CONFIG_TIMER_RANDOM_GENERATOR=y diff --git a/tests/micro/qemu/zephyr-runtime/qemu-hack/qemu-system-i386 b/apps/microtvm/zephyr/demo_runtime/qemu-hack/qemu-system-i386 similarity index 100% rename from tests/micro/qemu/zephyr-runtime/qemu-hack/qemu-system-i386 rename to apps/microtvm/zephyr/demo_runtime/qemu-hack/qemu-system-i386 diff --git a/tests/micro/qemu/zephyr-runtime/src/main.c b/apps/microtvm/zephyr/demo_runtime/src/main.c similarity index 56% rename from tests/micro/qemu/zephyr-runtime/src/main.c rename to apps/microtvm/zephyr/demo_runtime/src/main.c index bcd82b41c1813..e2aa59af7ad93 100644 --- a/tests/micro/qemu/zephyr-runtime/src/main.c +++ b/apps/microtvm/zephyr/demo_runtime/src/main.c @@ -22,8 +22,16 @@ * SPDX-License-Identifier: Apache-2.0 */ +/* + * This is a sample Zephyr-based application that contains the logic + * needed to control a microTVM-based model via the UART. This is only + * intended to be a demonstration, since typically you will want to incorporate + * this logic into your own application. + */ + #include #include +#include #include #include #include @@ -41,65 +49,107 @@ #include "crt_config.h" -K_SEM_DEFINE(tx_sem, 0, 1); - static const struct device* tvm_uart; -int write_hook(int c) { - uart_poll_out(tvm_uart, c); - return 0; -} +#ifdef CONFIG_LED +#define LED0_NODE DT_ALIAS(led0) +#define LED0 DT_GPIO_LABEL(LED0_NODE, gpios) +#define LED0_PIN DT_GPIO_PIN(LED0_NODE, gpios) +#define LED0_FLAGS DT_GPIO_FLAGS(LED0_NODE, gpios) +static const struct device* led0_pin; +#endif // CONFIG_LED +static size_t g_num_bytes_requested = 0; +static size_t g_num_bytes_written = 0; + +// Called by TVM to write serial data to the UART. 
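+// Bytes are emitted one at a time with uart_poll_out; when an LED is
+// configured, it is lit for the duration of the write as an activity
+// indicator.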
 ssize_t write_serial(void* unused_context, const uint8_t* data, size_t size) {
+#ifdef CONFIG_LED
+  gpio_pin_set(led0_pin, LED0_PIN, 1);
+#endif
+  g_num_bytes_requested += size;
+
   for (size_t i = 0; i < size; i++) {
     uart_poll_out(tvm_uart, data[i]);
+    g_num_bytes_written++;
   }
+
+#ifdef CONFIG_LED
+  gpio_pin_set(led0_pin, LED0_PIN, 0);
+#endif
+
   return size;
 }
 
+// Invoked by Zephyr from an exception handler when the device crashes. Here, we
+// turn on the LED and spin.
+void k_sys_fatal_error_handler(unsigned int reason, const z_arch_esf_t* esf) {
+#ifdef CONFIG_LED
+  gpio_pin_set(led0_pin, LED0_PIN, 1);
+#endif
+  for (;;)
+    ;
+}
+
+// Called by TVM when a message needs to be formatted.
 size_t TVMPlatformFormatMessage(char* out_buf, size_t out_buf_size_bytes, const char* fmt,
                                 va_list args) {
   return vsnprintk(out_buf, out_buf_size_bytes, fmt, args);
 }
 
+// Called by TVM when an internal invariant is violated, and execution cannot continue.
 void TVMPlatformAbort(tvm_crt_error_t error) {
   sys_reboot(SYS_REBOOT_COLD);
+#ifdef CONFIG_LED
+  gpio_pin_set(led0_pin, LED0_PIN, 1);
+#endif
   for (;;)
     ;
 }
 
-K_MEM_POOL_DEFINE(tvm_memory_pool, 64, 1024, 120, 4);
+// Called by TVM to generate random data.
+tvm_crt_error_t TVMPlatformGenerateRandom(uint8_t* buffer, size_t num_bytes) {
+  uint32_t random;  // one unit of random data.
+
+  // Fill parts of `buffer` which are as large as `random`.
+  size_t num_full_blocks = num_bytes / sizeof(random);
+  for (int i = 0; i < num_full_blocks; ++i) {
+    random = sys_rand32_get();
+    memcpy(&buffer[i * sizeof(random)], &random, sizeof(random));
+  }
+
+  // Fill any leftover tail which is smaller than `random`.
+  size_t num_tail_bytes = num_bytes % sizeof(random);
+  if (num_tail_bytes > 0) {
+    random = sys_rand32_get();
+    memcpy(&buffer[num_bytes - num_tail_bytes], &random, num_tail_bytes);
+  }
+  return kTvmErrorNoError;
+}
+
+// Memory pool for use by TVMPlatformMemoryAllocate.
+K_MEM_POOL_DEFINE(tvm_memory_pool, 64, 1024, 216, 4);
 
+// Called by TVM to allocate memory.
 tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void** out_ptr) {
   *out_ptr = k_mem_pool_malloc(&tvm_memory_pool, num_bytes);
   return (*out_ptr == NULL) ? kTvmErrorPlatformNoMemory : kTvmErrorNoError;
 }
 
+// Called by TVM to deallocate memory.
 tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLDevice dev) {
   k_free(ptr);
   return kTvmErrorNoError;
 }
 
-uint32_t g_utvm_start_time;
-
 #define MILLIS_TIL_EXPIRY 200
 #define TIME_TIL_EXPIRY (K_MSEC(MILLIS_TIL_EXPIRY))
 K_TIMER_DEFINE(g_utvm_timer, /* expiry func */ NULL, /* stop func */ NULL);
 
+uint32_t g_utvm_start_time;
 int g_utvm_timer_running = 0;
 
-#ifdef CONFIG_LED
-/* The devicetree node identifier for the "led0" alias. */
-#define LED0_NODE DT_ALIAS(led0)
-
-#define LED0 DT_GPIO_LABEL(LED0_NODE, gpios)
-#define PIN DT_GPIO_PIN(LED0_NODE, gpios)
-#define FLAGS DT_GPIO_FLAGS(LED0_NODE, gpios)
-
-static struct device* led_pin;
-#endif  // CONFIG_LED
-
+// Called to start system timer.
 tvm_crt_error_t TVMPlatformTimerStart() {
   if (g_utvm_timer_running) {
     TVMLogf("timer already running");
@@ -107,7 +157,7 @@ tvm_crt_error_t TVMPlatformTimerStart() {
   }
 
 #ifdef CONFIG_LED
-  gpio_pin_set(led_pin, PIN, 1);
+  gpio_pin_set(led0_pin, LED0_PIN, 1);
 #endif
 
   k_timer_start(&g_utvm_timer, TIME_TIL_EXPIRY, TIME_TIL_EXPIRY);
   g_utvm_start_time = k_cycle_get_32();
 
@@ -115,15 +165,16 @@
   return kTvmErrorNoError;
 }
 
+// Called to stop system timer.
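+// The function below measures elapsed time from two sources: the fine-grained
+// 32-bit hardware cycle counter and the coarse 200 ms k_timer expiry count.
+// It reports the coarse value only when the cycle counter risks overflowing.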
tvm_crt_error_t TVMPlatformTimerStop(double* elapsed_time_seconds) { if (!g_utvm_timer_running) { TVMLogf("timer not running"); - return kTvmErrorPlatformTimerBadState; + return kTvmErrorSystemErrorMask | 2; } uint32_t stop_time = k_cycle_get_32(); #ifdef CONFIG_LED - gpio_pin_set(led_pin, PIN, 0); + gpio_pin_set(led0_pin, LED0_PIN, 0); #endif // compute how long the work took @@ -135,7 +186,7 @@ tvm_crt_error_t TVMPlatformTimerStop(double* elapsed_time_seconds) { } uint32_t ns_spent = (uint32_t)k_cyc_to_ns_floor64(cycles_spent); - double hw_clock_elapsed_seconds = ns_spent / 1e9; + double hw_clock_res_us = ns_spent / 1000.0; // need to grab time remaining *before* stopping. when stopped, this function // always returns 0. @@ -144,7 +195,7 @@ tvm_crt_error_t TVMPlatformTimerStop(double* elapsed_time_seconds) { // check *after* stopping to prevent extra expiries on the happy path if (time_remaining_ms < 0) { TVMLogf("negative time remaining"); - return -1; + return kTvmErrorSystemErrorMask | 3; } uint32_t num_expiries = k_timer_status_get(&g_utvm_timer); uint32_t timer_res_ms = ((num_expiries * MILLIS_TIL_EXPIRY) + time_remaining_ms); @@ -153,113 +204,112 @@ tvm_crt_error_t TVMPlatformTimerStop(double* elapsed_time_seconds) { // if we approach the limits of the HW clock datatype (uint32_t), use the // coarse-grained timer result instead if (approx_num_cycles > (0.5 * (~((uint32_t)0)))) { - *elapsed_time_seconds = timer_res_ms / 1e3; + *elapsed_time_seconds = timer_res_ms / 1000.0; } else { - *elapsed_time_seconds = hw_clock_elapsed_seconds; + *elapsed_time_seconds = hw_clock_res_us / 1e6; } g_utvm_timer_running = 0; return kTvmErrorNoError; } -tvm_crt_error_t TVMPlatformGenerateRandom(uint8_t* buffer, size_t num_bytes) { - uint32_t random; // one unit of random data. - - // Fill parts of `buffer` which are as large as `random`. - size_t num_full_blocks = num_bytes / sizeof(random); - for (int i = 0; i < num_full_blocks; ++i) { - random = sys_rand32_get(); - memcpy(&buffer[i * sizeof(random)], &random, sizeof(random)); - } - - // Fill any leftover tail which is smaller than `random`. - size_t num_tail_bytes = num_bytes % sizeof(random); - if (num_tail_bytes > 0) { - random = sys_rand32_get(); - memcpy(&buffer[num_bytes - num_tail_bytes], &random, num_tail_bytes); - } - - return kTvmErrorNoError; -} - -#define RING_BUF_SIZE 512 -struct uart_rx_buf_t { - struct ring_buf buf; - uint32_t buffer[RING_BUF_SIZE]; -}; +// Ring buffer used to store data read from the UART on rx interrupt. +#define RING_BUF_SIZE_BYTES 4 * 1024 +RING_BUF_DECLARE(uart_rx_rbuf, RING_BUF_SIZE_BYTES); -struct uart_rx_buf_t uart_rx_buf; +// Small buffer used to read data from the UART into the ring buffer. +static uint8_t uart_data[32]; +// UART interrupt callback. void uart_irq_cb(const struct device* dev, void* user_data) { while (uart_irq_update(dev) && uart_irq_is_pending(dev)) { - struct uart_rx_buf_t* buf = (struct uart_rx_buf_t*)user_data; - if (uart_irq_rx_ready(dev) == 0) { - continue; - } - - uint8_t data[32]; - for (;;) { - int bytes_read = uart_fifo_read(dev, data, sizeof(data)); - if (bytes_read < 0) { - TVMPlatformAbort(0xbeef); - } else if (bytes_read == 0) { - break; + struct ring_buf* rbuf = (struct ring_buf*)user_data; + if (uart_irq_rx_ready(dev) != 0) { + for (;;) { + // Read a small chunk of data from the UART. 
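+        // uart_fifo_read() drains at most sizeof(uart_data) bytes per call and
+        // returns the number actually read; 0 means the FIFO is empty.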
+ int bytes_read = uart_fifo_read(dev, uart_data, sizeof(uart_data)); + if (bytes_read < 0) { + TVMPlatformAbort((tvm_crt_error_t)0xbeef1); + } else if (bytes_read == 0) { + break; + } + // Write it into the ring buffer. + int bytes_written = ring_buf_put(rbuf, uart_data, bytes_read); + if (bytes_read != bytes_written) { + TVMPlatformAbort((tvm_crt_error_t)0xbeef2); + } + // CHECK_EQ(bytes_read, bytes_written, "bytes_read: %d; bytes_written: %d", bytes_read, + // bytes_written); } - int bytes_written = ring_buf_put(&buf->buf, data, bytes_read); - CHECK_EQ(bytes_read, bytes_written, "bytes_read: %d; bytes_written: %d", bytes_read, - bytes_written); } } } -void uart_rx_init(struct uart_rx_buf_t* buf, const struct device* dev) { - ring_buf_init(&buf->buf, RING_BUF_SIZE, buf->buffer); - uart_irq_callback_user_data_set(dev, uart_irq_cb, (void*)buf); +// Used to initialize the UART receiver. +void uart_rx_init(struct ring_buf* rbuf, const struct device* dev) { + uart_irq_callback_user_data_set(dev, uart_irq_cb, (void*)rbuf); uart_irq_rx_enable(dev); } -int uart_rx_buf_read(struct uart_rx_buf_t* buf, uint8_t* data, size_t data_size_bytes) { +// Used to read data from the UART. +int uart_rx_buf_read(struct ring_buf* rbuf, uint8_t* data, size_t data_size_bytes) { unsigned int key = irq_lock(); - int bytes_read = ring_buf_get(&buf->buf, data, data_size_bytes); + int bytes_read = ring_buf_get(rbuf, data, data_size_bytes); irq_unlock(key); return bytes_read; } +// Buffer used to read from the UART rx ring buffer and feed it to the UTvmRpcServerLoop. +static uint8_t main_rx_buf[RING_BUF_SIZE_BYTES]; + +// The main function of this application. extern void __stdout_hook_install(int (*hook)(int)); void main(void) { #ifdef CONFIG_LED - led_pin = device_get_binding(LED0); - if (led_pin == NULL) { + int ret; + led0_pin = device_get_binding(LED0); + if (led0_pin == NULL) { for (;;) ; } - int ret = gpio_pin_configure(led_pin, PIN, GPIO_OUTPUT_ACTIVE | FLAGS); + ret = gpio_pin_configure(led0_pin, LED0_PIN, GPIO_OUTPUT_ACTIVE | LED0_FLAGS); if (ret < 0) { - for (;;) - ; + TVMPlatformAbort((tvm_crt_error_t)0xbeef4); } - gpio_pin_set(led_pin, PIN, 0); + gpio_pin_set(led0_pin, LED0_PIN, 1); #endif - /* Claim console device */ + // Claim console device. tvm_uart = device_get_binding(DT_LABEL(DT_CHOSEN(zephyr_console))); - uart_rx_init(&uart_rx_buf, tvm_uart); - __stdout_hook_install(&write_hook); + uart_rx_init(&uart_rx_rbuf, tvm_uart); + // Initialize microTVM RPC server, which will receive commands from the UART and execute them. utvm_rpc_server_t server = UTvmRpcServerInit(write_serial, NULL); - TVMLogf("uTVM On-Device Runtime"); + TVMLogf("microTVM Zephyr runtime - running"); +#ifdef CONFIG_LED + gpio_pin_set(led0_pin, LED0_PIN, 0); +#endif + // The main application loop. We continuously read commands from the UART + // and dispatch them to UTvmRpcServerLoop(). while (true) { - uint8_t buf[256]; - int bytes_read = uart_rx_buf_read(&uart_rx_buf, buf, sizeof(buf)); + int bytes_read = uart_rx_buf_read(&uart_rx_rbuf, main_rx_buf, sizeof(main_rx_buf)); if (bytes_read > 0) { size_t bytes_remaining = bytes_read; - uint8_t* cursor = buf; + uint8_t* cursor = main_rx_buf; while (bytes_remaining > 0) { + // Pass the received bytes to the RPC server. 
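+        // UTvmRpcServerLoop() consumes bytes from `cursor`, advancing the
+        // pointer and decrementing `bytes_remaining` as it parses each packet.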
tvm_crt_error_t err = UTvmRpcServerLoop(server, &cursor, &bytes_remaining); if (err != kTvmErrorNoError && err != kTvmErrorFramingShortPacket) { TVMPlatformAbort(err); } + if (g_num_bytes_written != 0 || g_num_bytes_requested != 0) { + if (g_num_bytes_written != g_num_bytes_requested) { + TVMPlatformAbort((tvm_crt_error_t)0xbeef5); + } + g_num_bytes_written = 0; + g_num_bytes_requested = 0; + } } } } diff --git a/docs/microtvm/index.rst b/docs/microtvm/index.rst index 2371219af27fa..a67b1547d2290 100644 --- a/docs/microtvm/index.rst +++ b/docs/microtvm/index.rst @@ -43,7 +43,7 @@ demos run against QEMU and the following hardware: * `STM Nucleo-F746ZG `_ * `STM STM32F746 Discovery `_ -* `nRF 5340 Preview Development Kit `_ +* `nRF 5340 Development Kit `_ Getting Started with microTVM diff --git a/python/tvm/micro/contrib/zephyr.py b/python/tvm/micro/contrib/zephyr.py index cd9c23cd2f9de..104d955835a1f 100644 --- a/python/tvm/micro/contrib/zephyr.py +++ b/python/tvm/micro/contrib/zephyr.py @@ -650,10 +650,10 @@ def popen_kwargs(self): env = dict(os.environ) env["ZEPHYR_BASE"] = self._zephyr_base - return dict( + args = dict( args=self._west_cmd + [ - "debug", + "attach", "--skip-rebuild", "--build-dir", self._build_dir, @@ -662,3 +662,4 @@ def popen_kwargs(self): ], env=env, ) + return args diff --git a/python/tvm/runtime/module.py b/python/tvm/runtime/module.py index 41d60683aa3b8..5165ae0854faa 100644 --- a/python/tvm/runtime/module.py +++ b/python/tvm/runtime/module.py @@ -454,7 +454,7 @@ def load_module(path, fmt=""): files = [tar_temp.relpath(x) for x in tar_temp.listdir()] _cc.create_shared(path + ".so", files, cc=cc) path += ".so" - # TODO(weberlo): we should probably use a more distinctive suffix for uTVM object files + # TODO(weberlo): we should probably use a more distinctive suffix for microTVM object files elif path.endswith(".obj"): fmt = "micro_dev" # Redirect to the load API diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py index 8c60260e640ad..e3ef51158c5a7 100644 --- a/python/tvm/target/target.py +++ b/python/tvm/target/target.py @@ -237,26 +237,28 @@ def intel_graphics(model="unknown", options=None): return Target(" ".join(["opencl"] + opts)) +MICRO_SUPPORTED_MODELS = { + "host": [], + "stm32f746xx": ["-mcpu=cortex-m7", "-march=armv7e-m"], + "nrf5340dk": ["-mcpu=cortex-m33"], +} + + def micro(model="unknown", options=None): """Returns a microTVM target. Parameters ---------- model : str - Canonically identifies the target device. This is typically a CPU or board level name (other - flags such as -mcpu identify the ISA). + Canonically identifies the target device. This is typically a device board level name. + The allowed values are MICRO_SUPPORTED_MODELS.keys(). 
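+        For example, model="nrf5340dk" selects -mcpu=cortex-m33 from the table
+        above.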
     options : str or list of str
         Additional options
     """
-    trans_table = {
-        "host": [],
-        "stm32f746xx": ["-mcpu=cortex-m7", "-march=armv7e-m"],
-        "nrf5340dk": ["-mcpu=cortex-m33"],
-    }
-    if model not in trans_table:
+    if model not in MICRO_SUPPORTED_MODELS:
         raise ValueError(f"Model {model} not supported by tvm.target.micro.")
     opts = _merge_opts(
-        trans_table[model] + ["-runtime=c", "--system-lib", f"-model={model}"],
+        MICRO_SUPPORTED_MODELS[model] + ["-runtime=c", "--system-lib", f"-model={model}"],
         options,
     )
diff --git a/tests/lint/check_file_type.py b/tests/lint/check_file_type.py
index ab51b6c79c83b..8d8b34322de30 100644
--- a/tests/lint/check_file_type.py
+++ b/tests/lint/check_file_type.py
@@ -125,9 +125,16 @@
     "docs/_static/img/tvm-logo-square.png",
     # pytest config
     "pytest.ini",
-    # Zephyr tests
-    "tests/micro/qemu/zephyr-runtime/prj.conf",
-    "tests/micro/qemu/zephyr-runtime/qemu-hack/qemu-system-i386",
+    # microTVM tests
+    "tests/micro/zephyr/testdata/digit-2.jpg",
+    "tests/micro/zephyr/testdata/digit-9.jpg",
+    "tests/micro/zephyr/testdata/mnist-8.onnx",
+    # microTVM Zephyr runtime
+    "apps/microtvm/zephyr/demo_runtime/prj.conf",
+    "apps/microtvm/zephyr/demo_runtime/boards/nrf5340dk_nrf5340_cpuapp.conf",
+    "apps/microtvm/zephyr/demo_runtime/boards/nucleo_f746zg.conf",
+    "apps/microtvm/zephyr/demo_runtime/boards/qemu_x86.conf",
+    "apps/microtvm/zephyr/demo_runtime/qemu-hack/qemu-system-i386",
     # microTVM Virtual Machines
     "apps/microtvm/reference-vm/zephyr/Vagrantfile",
     "apps/microtvm/reference-vm/zephyr/base-box/Vagrantfile.packer-template",
diff --git a/tests/micro/qemu/.gitignore b/tests/micro/qemu/.gitignore
deleted file mode 100644
index c920d8f93ff8d..0000000000000
--- a/tests/micro/qemu/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-/test_zephyr*-workspace
-/*.micro-binary
diff --git a/tests/micro/qemu/zephyr-runtime/.gitignore b/tests/micro/qemu/zephyr-runtime/.gitignore
deleted file mode 100644
index 64be5d3a487cd..0000000000000
--- a/tests/micro/qemu/zephyr-runtime/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-__tvm*
-libtvm__*
-/build
diff --git a/tests/micro/zephyr/README.md b/tests/micro/zephyr/README.md
new file mode 100644
index 0000000000000..9769cae2b53b2
--- /dev/null
+++ b/tests/micro/zephyr/README.md
@@ -0,0 +1,42 @@
+<!--- Licensed to the Apache Software Foundation (ASF) under one -->
+<!--- or more contributor license agreements.  See the NOTICE file -->
+<!--- distributed with this work for additional information -->
+<!--- regarding copyright ownership.  The ASF licenses this file -->
+<!--- to you under the Apache License, Version 2.0 (the -->
+<!--- "License"); you may not use this file except in compliance -->
+<!--- with the License.  You may obtain a copy of the License at -->
+<!--- -->
+<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
+<!--- -->
+<!--- Unless required by applicable law or agreed to in writing, -->
+<!--- software distributed under the License is distributed on an -->
+<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
+<!--- KIND, either express or implied.  See the License for the -->
+<!--- specific language governing permissions and limitations -->
+<!--- under the License. -->
+
+This directory contains tests for microTVM's integration with Zephyr.
+
+To run the test, you first need to be running in a Python environment with
+all of the appropriate TVM dependencies installed. If you have [Poetry](https://python-poetry.org/)
+installed, you can do the following to get an appropriately-configured Python
+environment:
+
+```
+$ cd tvm/apps/microtvm/
+$ poetry lock && poetry install && poetry shell
+```
+
+You can then run this test (either on real hardware or on a QEMU-emulated
+device) using:
+
+```
+$ cd tvm/tests/micro/zephyr
+$ pytest test_zephyr.py --microtvm-platforms=host       # For QEMU emulation
+$ pytest test_zephyr.py --microtvm-platforms=nrf5340dk  # For nRF5340DK
+```
+
+To see the list of supported values for `--microtvm-platforms`, run:
+```
+$ pytest test_zephyr.py --help
+```
diff --git a/tests/micro/qemu/conftest.py b/tests/micro/zephyr/conftest.py
similarity index 94%
rename from tests/micro/qemu/conftest.py
rename to tests/micro/zephyr/conftest.py
index 3fc54df020631..e8ce443adfaf7 100644
--- a/tests/micro/qemu/conftest.py
+++ b/tests/micro/zephyr/conftest.py
@@ -16,11 +16,14 @@
 # under the License.
import pytest +import tvm.target.target + def pytest_addoption(parser): parser.addoption( "--microtvm-platforms", default="host", + choices=tvm.target.target.MICRO_SUPPORTED_MODELS.keys(), help=( "Specify a comma-separated list of test models (i.e. as passed to tvm.target.micro()) " "for microTVM tests." diff --git a/tests/micro/qemu/test_zephyr.py b/tests/micro/zephyr/test_zephyr.py similarity index 81% rename from tests/micro/qemu/test_zephyr.py rename to tests/micro/zephyr/test_zephyr.py index 1e7eed7b906a5..003cd54bba90a 100644 --- a/tests/micro/qemu/test_zephyr.py +++ b/tests/micro/zephyr/test_zephyr.py @@ -19,12 +19,15 @@ import copy import datetime import glob +import logging import os import subprocess import sys import pytest import numpy as np +import onnx +from PIL import Image import tvm import tvm.rpc @@ -36,11 +39,14 @@ from tvm.relay.expr_functor import ExprMutator from tvm.relay.op.annotation import compiler_begin, compiler_end +# If set, build the uTVM binary from scratch on each test. +# Otherwise, reuses the build from the previous test run. BUILD = True -DEBUG = False - -TARGET = None +# If set, enable a debug session while the test is running. +# Before running the test, in a separate shell, you should run: +# python -m tvm.exec.microtvm_debug_shell +DEBUG = False def _make_sess_from_op(model, zephyr_board, west_cmd, op_name, sched, arg_bufs): @@ -62,15 +68,17 @@ def _make_session(model, target, zephyr_board, west_cmd, mod): os.makedirs(workspace_parent) workspace = tvm.micro.Workspace(debug=True, root=workspace_root) - project_dir = os.path.join(os.path.dirname(__file__) or ".", "zephyr-runtime") + test_dir = os.path.dirname(os.path.realpath(os.path.expanduser(__file__))) + tvm_source_dir = os.path.join(test_dir, "..", "..", "..") + runtime_path = os.path.join(tvm_source_dir, "apps", "microtvm", "zephyr", "demo_runtime") compiler = zephyr.ZephyrCompiler( - project_dir=project_dir, + project_dir=runtime_path, board=zephyr_board, zephyr_toolchain_variant="zephyr", west_cmd=west_cmd, ) - opts = tvm.micro.default_options(f"{project_dir}/crt") + opts = tvm.micro.default_options(os.path.join(runtime_path, "crt")) # TODO(weberlo) verify this is necessary opts["bin_opts"]["ccflags"] = ["-std=gnu++14"] opts["lib_opts"]["ccflags"] = ["-std=gnu++14"] @@ -201,6 +209,53 @@ def test_relay(platform, west_cmd): tvm.testing.assert_allclose(result, x_in * x_in + 1) +def test_onnx(platform, west_cmd): + """Testing a simple ONNX model.""" + model, zephyr_board = PLATFORMS[platform] + + # Load test images. + this_dir = os.path.dirname(__file__) + digit_2 = Image.open(f"{this_dir}/testdata/digit-2.jpg").resize((28, 28)) + digit_2 = np.asarray(digit_2).astype("float32") + digit_2 = np.expand_dims(digit_2, axis=0) + + digit_9 = Image.open(f"{this_dir}/testdata/digit-9.jpg").resize((28, 28)) + digit_9 = np.asarray(digit_9).astype("float32") + digit_9 = np.expand_dims(digit_9, axis=0) + + # Load ONNX model and convert to Relay. + onnx_model = onnx.load(f"{this_dir}/testdata/mnist-8.onnx") + shape = {"Input3": (1, 1, 28, 28)} + relay_mod, params = relay.frontend.from_onnx(onnx_model, shape=shape, freeze_params=True) + relay_mod = relay.transform.DynamicToStatic()(relay_mod) + + # We add the -link-params=1 option to ensure the model parameters are compiled in. + # There is currently a bug preventing the demo_runtime environment from receiving + # the model weights when set using graph_mod.set_input(). 
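+    # (With -link-params=1, the weights are compiled directly into the operator
+    # library, so only the input tensor needs to be set at runtime.)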
+ # See: https://github.com/apache/tvm/issues/7567 + target = tvm.target.target.micro(model, options=["-link-params=1"]) + with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): + lowered = relay.build(relay_mod, target, params=params) + graph = lowered.get_json() + + with _make_session(model, target, zephyr_board, west_cmd, lowered.lib) as session: + graph_mod = tvm.micro.create_local_graph_runtime( + graph, session.get_system_lib(), session.device + ) + + # Send the digit-2 image and confirm that the correct result is returned. + graph_mod.set_input("Input3", tvm.nd.array(digit_2)) + graph_mod.run() + result = graph_mod.get_output(0).asnumpy() + assert np.argmax(result) == 2 + + # Send the digit-9 image and confirm that the correct result is returned. + graph_mod.set_input("Input3", tvm.nd.array(digit_9)) + graph_mod.run() + result = graph_mod.get_output(0).asnumpy() + assert np.argmax(result) == 9 + + class CcompilerAnnotator(ExprMutator): """ This is used to create external functions for ccompiler. diff --git a/tests/micro/zephyr/testdata/digit-2.jpg b/tests/micro/zephyr/testdata/digit-2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..b709a206b8d776215dcaa78643b22fe628b3c43a GIT binary patch literal 572 zcmV-C0>l0P*#F=F5K2Z#MgRc;000310RRC1+Wgv=4-_35A08bV92_7dE+-%&EF&BoC^soAFflYVG#@89JvcHvE;BST|G)qX2ml-a z9036l0RO}Q9{>OW1pxs80RaI300000000010s{mE1_uZU3Jd?l0JRVR0s#X90t5pE z1q1{D00Dgg0s{a95d{(Xb($mz{*4NnC+Tr5k)}+G+9pM!N9a z+Fgtq;Ufks4k5G6O=vOe_>A$lT&9{hIqq>&me#H&a?UfhKQ?v%>I1(TC zPo;Q8dSAiM82D?!`n~+#AhvSzq6>Sai_G#i?k&erpO=siPeWdhbMfE9x=i;{+Ft7# z+e`yT6p%#>F!kUap}qMH^{zG#jmxgXeWv)|M1xS$?n4;X*+RSr2LNRA&OU5{I`h)J Keu^lfzyI08i0m@} literal 0 HcmV?d00001 diff --git a/tests/micro/zephyr/testdata/digit-9.jpg b/tests/micro/zephyr/testdata/digit-9.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6ce9cde3b322b351847da179555aba35358dbf23 GIT binary patch literal 535 zcmV+y0_gq!*#F=F5K2Z#MgRc;000310RRC1+Wgv=4-_35A08bV92_7dE+-%&EF&BoC^soAFflYVG#@89JvcHvE;BST|G)qX2ml-a z9036l0RO}Q9{>OW1pxs80RaI300000000010s{mE1_uZU3Jd?l0JRVR0s#X90t5pE z1q1{D00Dgg0s{a95d{(Xb($mz{*4NnC+Tr5k`=;qQwz?HTobGeo$#u}uE)N`lzywMpFn z0PqlNo02z*Mvg>ebR}6>V1hb=DYU8qU<(BV@=t2}8cXd%P4Nbe;f*80*GEW!Bi*PZ z?Hn;69XR{sV~|fd_pc58pmf`NABf335rk{YDCd_0tL|ja5+s?$04fGV5X?$Wf+%7_5d(^VqN1We zS7AU5Ac{Go7%*eb%3J%KbNAUN+&9Mi^G5$zqpNGqRb4f^W`%F6$ZIPM@eLYjsMz1m z$jL2UkZmqkp{%jSi2QH8m>SA@tvV zwDtbihlX8(BIbq1^mpk0PkuAyfPWv^U&c*FtxQ%;X?SE*>_SI1wSUgwq}_GM{K&ZZ zb7L1q%#ZBv66zPbP|6b8&#?1fXKhXYxv?>h{r*#GrGF=w$qxz(`?q-psQq^y zZOwr2QNfP=|7Fm>6U-C>!so{R+o-j&YX6-}TibvB{J9a4Go79PbEN7&X=chkQy2f^ z?tkb=&O`3M>F7TwUDOn$>WT4?`SZJqzHpo8GIx*M?cVy!JLGj){T?LALDrfOyhfPkv={3RXO>g;zgp#IPQ7arwSBKK#^N z5<(-z+tza6*SM7(%hn6uU)Brlrt7gsH+yuLxS5vKtQGj|Ejp1^OuvT2!V&!L>g>>+ z7k2sv7nk3MXECSXROcM}9DY@_if@7Tv8Ci>S^{U@II_clKAanP3%pkK=Z#7GsNmCE z?DXuLP;G6_5uu}SaeEU*5BS5+YxkkqSto4TaTVuA{f0mG*=)$~_=7%k%kMJrfVMoF zwRwXWaRx7~p2x;Fui)E)D#0wS5c;pPt!-Yvl>V-92c1_+G`pk=zS}L&R_Ct^`gPi3 zcEv!f?5s*7mpv3`&Q(O`5I0h2nE+OQD}>q0&(yq`KaxgPo3Y!=hj7Evj{E3~>P&RJ?A8@-s zU$K6t-=>xPxL09>N^g@)8-h^J??_V?e`(##4t`6BQNaUod82pqJ?;2Jvp@Ah1GAD zvdq&Zf?PovZyu#YrBBRQW38V1mDL+*iL!$GUO7wL=i(cp4r*G^~WeKio z*Ts-=ThQy_WcF~~3tN_^qknE5Z&;O%C*Sz-xtzbWZpj(Rq}p(t-K>tqTb_v^sOF^ApN;>uga_NmWm=f#4Zf+-Vde9vHRrgo& z=+rpA+TKhj*H**McbPaQRhOp}SHQ-G91^;zNu8%cj8@#7gO@SN>Ho)L*~oAa{Re(G;c#b9c+?y z@3GMmH8oVBU;iLc+v_phPjlt=t|2_4(GJ(d24iH)btrmng~2C-==ZYC_;K17vRv)N 
z(MsCf?uj+#M|$wj%}c4itAL#?v(PT>1btiZk(9k!ggt%Au)a=*_q48OwRMU7%HN=dbJPnc_qGjGeH)|&HV;Nh9%P2@#}bSngo??|KP0lO?b;-HN%IWHGU)h z@)PkOzN$G51Mg?EPqi10{<4@?xQC(j7Z;55E$71c3ql{wuXOobDNZ>S!)0@S3H_T( zh1K2W(w8wKMSPXOBe(45r#t@Q&i73?B>jXy`r+JtyqqL7a5o!Od=`f1IpW846VU2( zI{q2?2nN4j%Qk0=ak_Rke$YO}`#1Ng?c{Wa$31rB^jXvK<8~!?YwW=JUnI3NXXeq~ zo^llI+}-`)=PWFK>&hlq`w4*_YtUoZQ23PPiH)UQc-gg+AitvvU*A528sa-+s@zMk zD(X@*`;)gYSYMvcwvEB@trwu>!~2>m<~7u|2D$D|Ex21w#fO)s3U^vI2}SOgU~%Rg z$nK~Bt7FNmQ8*gE2k3 z-D8LQhN-%T`;21skgA%{mqWRy%wNi!kqHNP{UEc*P)O*R&pIzOt9NX-$4_=%?%R)K zvYzh(Q2nIM+k8)nrrwp};IjGrwqG{)4QzrdwkoXkL5eu#0@J8SO=rzrO^djn>>WtAW0bbYR^Z=fOWR1pHoW!_0wcuK7x>q#$z7OKl)80Y7mq7mFcaR+V0HdE(;x^SmtoTfsEe1?N`3vjl z((^c~kDM)TdVU_(20x&fmm%;2idI?`@V7&e6D4z^3m)z^-LOPlA z;i!8m-<3p@if1KXc^8OXCuHK5eG^eh<~YCC(H3)j8rh+8is1S5wxG4S3(lMB$D2#m zpj^*;RT?H9M#-Cxfr8S|G_j$u zzr=Q>o@iM68d64O!TxL4DEsSM7+SDcI5_38FfIIpSb6UV#Eh}SJN9?!Mao;UE$GZ{ zMt)*IM+#i?I}m$zVci=X>3W2yoEFl>}1#a zO0l?6hYQm3DAdM`SC8+>im%d0bJQ{XuKidt`idb$r{!>(;uhiPs1jk>`7F?i_r-p; zXJJCw2O7HgE4-MOEf%fwgRG(!2pwlf`fs0$=ciA{5rK-Rsy&wKJUSrt^dT7jcLbQ! zcw?Uh#T*O)?D0E>)y`$J{=OwJ^Yl>c+haE*Me1|sLSMG;*g=Y#2gIv^rtY3<@~q`m zO=x(nS@SH@5GWOzn&lfbu+grS`<{oJ0Zp$-nRzlcRZMgHMlI;D3n(E=IylLrNR{CuUMni1*>kDUm=6H%K zo41JNft@hFqlo9;n2d%$OYlJQCq8HK49D**;Ik8qc~G_+Ui*EFo+w#kZf_eHq%6Vm zCW)l>_f{UBV~EdlGby*;hL8R1#BL51P_WmM>{sZMx2G$2wzPuHxqqRmxGzO6Pozmt z)L2|{0=Cc6!QDHD$F7MAfXP9uitt&pXIgY6_9ch<6quh};{!{OctAsIP)m&{3^Vg~E9i+hHVja-v z%^6&-+#N5EGUs`p^g&l|jTkq*GhI}6WP50V9-Fs-i8M~SCTC5?O_{KoP59vA-dGS2 zfvcao!}al>MdLr$0foVQ^{X0soIgP(jj6amT*o;tE!|(tR%bt-WVn;{Rmi`aja|k} zXNwmWIDbPd_iPB~@bU<;r_M_>)t@astRK#%L$bK*GAC4hxgJg){7e;trMZoN3yhrN z$c^JF39j0s+UFSBn(`7h+gsxn#iQIq=uDbX@9FOZRi5?5ob}z(FuPw1e6%=D+vYg1 z)9T4&zeWO=hZe&rzjDa)E`o&YMJ%_OQAuo~-~~OnG}9lFa!28Zx~1Hrp@zZ*O^H)b zhZvUm9bUYX<;p4m5 z2&8F~esH|CC8W(g00UFx-AlcsG4?A1T$F7=o&zTF*mFzq(_l3|8hlflce%K~ag^hA zwrTu){4a>Lnj!h@Wm?;KF%f*5-wRzd_T$IQ4`Q!jEpXzZG%s`X<7w9xV|+vl29NB1#@aoU<(?zAui zf9wyzYcGfJ1T4YOu&tc=+q$O6LWa)-$Kd+PWf;@l2NiXn(~+SG?CkV{nmzXM;6dYg zkz+o+7=0F(L_~AIt83_6@sVb}w1lY>q9A?DJuJskZauJBd=s&SSDKw5`MT>g#cs63 zw@QY0yfCJ7%bI!EOcu^K4rHzTBJ4Pv$fHI+frmpb!JFmsh;bt%9&z#*BzKKIde1?H zw}q_tQV(Sug5b^^SB&btjn?QU2~}20_{H+A=-&4(YbCG7b!Nt}*E>O|n6ne7C6tR= z*M@=8#SqbTQZ`&WRwv1M<;E8;9|uYD8$n0GfE+@9LGs0w(ECLT9GzKBDph@P(X>50 z;*J)#HYeee;tojqHG#gUKa`A+yc4=as)(%(5}csmCn&VWQ0Uz@iHf%+Iu{3mdxBC((0C*i%m5?QpA!>cn~Gw5nIR*hhq{-76x-$^FP z`wWkAG)IZgqsJbVCGn#ngwNDYA@YAiqWmc{5Wqt%-dCtvnI!jdp>Oy zR!w_M{`H!y?zIn07w*I#?{?wwPP#Q!YRceTzmZlfoyV)G1ZZFhDx7Gib-z0E#l`QT z$D@yAaBm4-jJ6fFf9}PpSq(H!zL{bVcOkRuBYE{iK)K7dZev~y5I+SXJ<^x1VWVLF zt6?whUHBN(9~ZGo@>FX2n1}sF`d}$-;=Ir6>Df$Ae8k`QviEwt@i7*z<#izce;7lT*#l;mIjSIAvx7S=cq>-p9SsB{2%4w(X+OchgU>&{P_ipU~v}JA?4+FM(gqFvD{;9nf^;j#%yQ1Jik* zP-;C7_TNk*uSKiGv3cdV z=XUABvfgJvQknpJ-`?TKcTByjGNJ39>tH$Y80wF=7JQ5M(9{urr1r{(#p-h5>RbWW zhV|vYD=UTamPd4T{$?8MtW#Um;}{;%`UVST1>k@~^0Z~MGA^)+=Cj63d)F#(X}5>) zCgCIZSnomV{g-j%r#H0an}jW{ZE_d=$MA;bow#S$1N7eR3jR3R2d6B13C3@JgUQQd z{C$QeX2k&?^-Bd!Egv*aDyCf_p1fP}03AABPM!TLYx?yI;i|z0!Q)VXBV2)tzo zF<-vZ?7uhQV$YL8^MuDVWonePW-y(W^zg0mm%BjgZD&I1=u%4lu#R#RO4#n_SoCP? 
zOr}4F;A(Y6OwGLmx09o}i|s>5om0q9&-6vJbK7a-AvyQWmLFhdlmv(DwCC@m#-P`W z&1_zH8b>S*5)aPKgAlt6u0QOHS0@ajojv}-h(BqBoC(7xq|*E!;h=R~g^KzD#awwx zzQMIM>n{uzUcW9Ae-DmF)wD2BRJZ>4;g9$HvB$9HD5-Paaj$xoZCQy=G><@H^*VG^^;L@vm9#U7IFO3 zmBNG=J?w}|!0u2@je88SY>+cSi7LuA$#JieVLTwd7ze*6ZrFE%zu)MMN;M<+D-q*pN~P^6AB2-K%Jj;b6G&;xEvm zBjUb?RrFZ8#|&+}NI$g&w3fz#lI)ed?~Jq-`$3+}7OLQx!;2`8w?g*|4s6)fRa)cO z#-r-)!^ZFqiK|})Oia$#Gc&A@)Q3AtB~Y@?8CsXz z5+~4F65}6$zNwP1M7JMTmssHEi@V6oJp>jVjKKQ2+c>c4J~*{afd_6^QSZWNp=TJd zdqNXfefuQrj0@wUZFlL|fn08D>@R6Puth8heJac~XcL7YPUvd8p*k(RFF*U}2GN6L zd3IAhEqi;Hwk=A8h3ke>3aPMiksQYlsS|Dvsu28A3gPTz8%_!d$LO&hxMIk6TB08d z&5E+T{&yrV{ zl;hDtIXoN@&ILou!Ew$&+I-fEHumf!j#z)6a@t>#{CP88+dm9F%HB}M=YE_RvPE=D zN)&x%e1)#I=Y^#|^ij3S26I=8Wt|gILd^nio;=Bz3r^VK1H~;6SNn+)9UEw7vL5IU zDMVpa9TliFQ=0a3JUr2xPsT6AS#}CstA1a2J}n6cri|pZ2jqEPK_&pwL$wJ=hk0g*eZLWVwtpGwDJw7cM-71jKjj2U%Tm->S|%l zFekDawH0>Pc1Eq*`8aCQZgG8U1}wO-hH{ts!k>Fa;N@pXxc1GX+3A}hy zcid)BOz$o@HT&7hqsX zcWfFoho4^ttlu^bp6a%WfqPHV=W(s{EV+~pth2_&(O!7wy(N!6v5tddrq{&%ZXvA~ zX_#?JnQFV&(nDJfPPSSH&7D&4-hzHuoSVo;+-A|hrVU^o8cP$BMY?fo2Jf&k;p>`P zs%`eAW17A!3crg`X_*3#?`s9NS;l;QMmuJYjOV@c_K?`Q8^`-@g6s8rS#L}xjZgh8 zNoa}}m5ieCmeE`6Q@@6zZg=Bl&IMFAdI2u92_(1mGVZHKD%K7&>Lxf=^%D*A>QJ-1 zCy(~IClp4+bLd+YG#xur7_u=!+H*UNUuyQjJM^W#mN^`mXMkgk6!_?cE0A=klC)cQ z(-gDEk}s2V`H=9Dp2!@AYem0F_0>6aN*gXYT{V(sXiVXn3&yy`Ko#dLdjreX$kS=V zTlDpK2u`(nBbM%*$D83g)+&KvIHAnEy zob&MD(*touR5xy1cUhcrcOi~jHc9ldX`~|+)s%Q7Nr?QEjYCJ>BAa=2v?8@HB-#5> z>6vltZ2AEfgpLEnrTT2{b5GQEt%YCrE>o|JdQst;516@{i2Y^nK$oH-FzsQ$sU_>7 z)3kliH=n8aC&P2K9L~4xAgQ{uhIcGEaFY1Cey&ed?H3zX@$PIeg z;~9)`_QJm{-yk!5r9>8O$-cilFFAQx@*?3c8MscA#%nQjUsQwsJ+sguk z9!e7yho+EfR425SB=CzHi|~0$5MI!9q{-8hvFU&l8a-9T=NY@X%aBH~x8Dlx{UU-p z`)ovYX&p4Te_oAm!g}a`qzD@|a-nv0*P07`?$KLm9GKc3Tm5Tsrt}-VM=n1zc>J(h zQgl4VHv0LT5_}X*Zl+Od-Y$ON7{SMEr&0E!PdIwSPW(jelIyu=d5BvRjp}TGy)OI} zOrDm*>~F`xV8vze&VlpPRGNaFt_{a1{8i)r>o~TMmm6fLO z;*t^kyvGmttgsq}_teFf@l)A!8d7skiFj#ev0F-J7n~Fu&(;bngv;U~%ndorzx$O5 z^&bi_*f9!Zt{}J^ohHq}{?c*tLh1gng*La{Bh%!~RJhBKR;eaIuKNqveYF$YEwsVs zV-=w)@;-!m_om5R%xYSV3Q&J~AFiLfj-P+2#J7D+QSPrBo;ak3?Ymvr&a>UX!+IEWR+6 z)_PuwuzYe6efyG0-!~WWpX{;ttU!gI9*g3MX#@Gj*d>r~(4N+hpNu5~19@nx500I- z2NYL@W2BjG?R*(8vJQ)8@3Q@vq@|6eYMb%U-wc-d;|I(8#){>S)wpVAys&uE457E5 zn){-&ci4@Rxnt-&=>0bpGxLi;ZqH(lmH!5ldc{(+`bPV0Bju_nJ+^ZX;WThjugQ z%zQH(_CSY+hFRgRy~eDlWy_~`*uu@JBF+7K22?KG5<5K_KqlG!`Ch_0+@0{92NZ7P z;d4?^Bl9R7SU8?b&$eQKS`+EDUqIikoofx17DCEzW1d{Bg2nb9u~<(6^3H=<(yH!W z-dMq5EaR?hn$G)Tc2S4TS@ylP4Y}P0lU81YmXfYw&Xq7YD~&7f&DDaf$)V(Fx2r}q zTbbWIxk8Vh{1rYpe;1wg4p7`QBTBh?ooYSnNwWDex!3MNmy8TfzC02_w%-Bw?xm8C z4=wp%|0XC1pF~GyjN!khK7+M#Dt7Cv&g0(83Awwfarl`$cD`eX)^>jcy`j}G?Z+F= z|2mnEI~Uh@ch^IuwG;6Ej{vOOuEk29dZO7S4YBt1d6GT73l7R_vcuiwuy1Y#tnkYe z8)sgG(NDr?;ksqQ!!|Y4Fpy!#vkEjSXEn%n4}|9nj+3*VHyh8ifKI(-ab`f7SpCQt zy3Eetq$*oB4O77x?}wt{jR#`T+$mzevTj&>;W1grXYi3CJFI)Uoo7TT^5%ELA@Qvl z_xZF+RKGMDR+?*~!I@ihbgVw#Z|uVoodODw=4cw%J!=*WapnyZhY5KpbI8rOmASP@6;zCI}3aII!O!GS3fEjOZQf%icF1mh&eP^q< zkL%vb(_BB|!RomRiVBw>|l0&vUf9ekKh&wVw?9b7@SwDUNvJij!<@`EThd zYMcIz8vXNyZ<)RDXnQ7^zMT)DKK*Fp>n`Xi+lxcGAuptbVBI4FU5-2v4Gt@DcCaHZ z4>V+dTxUpjrh8!J^p}F)LPN3t=PF^E zxAgmJbK#05?Rb84FYf5q1anMxh>5U{rY82G{rx^l#GJ7t>++TqhRCwPl2{nIPn!?4 z-=OWCt)biNc3Ab|3&nU0z`aS1Z1yz?&s+Dw4Xy>^uOFUl;H%HQRrK-9%xJ#0@d4y} zkA*>B5_tK;3%D%iQ;m%0HwskH;IALgk&MDnaP#TMkM2%@&Q=-Vn}19g_@%IFpr<@3 zC@rK3iRrN2(g?JZ*VC_xdE(bY7s+H%B(*PHL(`wFA`8z)bfiHY4PV}Y#p-tKf5DTZ zM;7wbt?A@DA_F(%A@9j=z`QQC(!Tp#Shr#_PT7=#gQlbd58NyH^i!_3WUekX4O7G= zlh)wffeP?Qqn54he1&6Hd8{9thSgt&@NG#F?7ZHBGm}qB`$iw=z?V$?nX>~9ExJR= z8)n0g+)JW!h$00_eb=`=_SCGE0b4#42qV@ipwV_E)>pm)?G_jDZP%&nH>8Aj-%e)T 
zM_0uhdk>Dtw}6sl1^0v@3bbg0GW@ulMoXLt6Wb4gY{6unaJ-lo4)({)92=;+-i?fh zCSl^f6qXO!j8UvC?Q2)y;v0<|KIe>RnQe^k6OXXsx=Yy0t*U1GuSC}U@=J70{!Xv2 zYw=OtXo%WUK^{3(;B-S*uW>MxT^PeNr8>dO1E#`z#o436bWTeBG$zVi?rA`(b> znIZog_MF@eav&pZxLd~YQt%3MWt$@($!pCdNkVBn8VvEqZ-JrYG*F%&7~f&{WXYZznRJnXC`sCw8K>NHW?0_PUhA3u9NDrQZ9U~%C?2F zw78;8csu7fs1!dWnfLB2YZuP9M_+@Pe+QF+?j2FkJtX>ee=ST^_JmcV8>;cxb?_c_ zl=$})$}%c|{xfe2^1qV!dD%KP+ZoIU*X9Z)4>CbG(TRS3-pX(H%kr!y8!>t5d(isU z7p=x*kip8S@K?nV75b#oywMtbNp3rTQ@lxPO9;&Jw&2YfOCe`SIN$j;nT1VoFtlT`AWeFy|~b!&6>?tLAtLnWxTwCZ+RuyYbv!`$xQ&KN&7>w`2PWd0bd*O-?z> zxpQa(1iTw6-8nB%?uty-pJc$*gAS7Y23bjZ`XpGo)E!QJu;9B#Ss1r21y)~QCiNtX zA<8X482RHASs%Sk7jDnS_5q{un$ZceUbcca)aan!)gUnNs1rjkHA4FL2;#9@DPYtb z64s7}&jC{OqwXp&b~53uPc(Vq_C11OM5|z7eud&EzZS0R%5z@yES_JG!M;%{*wI=D zzpno#gD1MIt>M6p5nEvJqX}4dce7|X7RA8qo_zgd57PQNTin-ElT@yBLb+%0IHb2D zq^e)wN7;oGzTN?BQ1I$za(Sx@$7grK(`g1Y*sPXV zuT`???j9jMat>av)e)}^vu7*Og|!<=5XA26*)2qD`lC#jlgnwp`*dEcpTMWAfS-(< zL2X;kK-1XkU{dS}rY;&WSm@Dc}5@aC7S| ziK&bR5A=Tl`W7+L4oe0d^_OGqCG}#>)5&;bL>yL@N3iO`PNMmfVSMgsCAl7V<}EKq za%}W=PCe2}^|L>dVx0}x8VbU*v?@xOXvwK7EqTeQTDswR8D4aoi{@HoxIe>;BfOu9 zH{Kax&2?qa_~ws`$K8i*E^?Thd0X)P0qi<)BByB^v9-!j9wb~Lo4e_B?~^M(Xqmxf z4_8uD_!xFwP(e#?EAhqmpD5JUfd3w7ge%uS!^AfcSY!7}?67bGk5N5wZeI~(+Gb(d zo(l2Xa8Go5y%9Fd7zX#Z^q_Us9wZYnn}bz!sjS=@M*Y}MZHH5M`MGMU(s5vk)w)2qYZANB2_m?5m}i$M|cg0aB>S5z}4RLC~BJt|Sfo!Z9MRVnMQTmUm zoFdhKlBZnl!u{Jh=4KbZtsEvy8>T|ZYnRfm=m>WF(uaqJzo&uYtLeQ`3heh@z>Urs z;{GY^!pvhK*eAxG&5rKCVkxTdhCvLgJhbC1w@`jGXe6rt>Vv+0mxJB}dpa4B!MTlT zurqc8UTrjht>bKQ`sF=5CNr7GAJ3!hI>R{5XdSGZSs)CH-9VZ1rqbGxw?*%G54Q1O z*nV@6_;SC%@Z!6erLdQ-kNgN#I_mI6iuee3?I5i|CvZdVR&3j5jvJS}ht=}^(R$%z z^geb$T)QHOu9Ovt{vCJVXWSdoc;5_mT@wkmzN5~fw4(bukH#ABfHK9Mg5B~va5AU_ z)OAd7YnM%2Q0>Y7=4jD;EO}APGjHTTn@N!fmb*vujunJc{dz7otI3`DI(j_B-r3{H+E;XD=4k9X3jwF*flYd?)2lT3c; zGW^wSKY5&(B`nwI0#7$t@!j~PEHNiOcf*B0)o5VKsjC3BjkH|flxx>q;={T{Fw_4X zJp4TrI<*Gk+LbpUMsssbg3Tg{)%Za?cu@@`?mkAU(yX+oz>=@}cBfLup`?=@L<@gQ zyI74!gyp;Cc))OPJ}@kkCyzP>rDta0?NMhy{ZcRLxjTm4=V)?-n+1ITFaclKuK~N? 
zTd`$MCT%odM4q#!i+ROiIKAX6T#}Eciw=dfx;aNe|A-4z^4U;JgN>c1acF-_jvlv~ zo5R*|RntTUwN2EtcNoh&Tg?u$QaPj!neWX&i{t~sh&MrQCSC)Dtfo{5{Gm#^ae=J6 zI)D`x1o7awUE=vizMzqrFDccPqHJ0h;Eqp8BriV!rf~*7`+5j}gpI~|be(&4Ue4+r zH%K>aAf4WEU#Rr2fJKerth91A7Yw-}QPIy6zqstjo{~T~vb&kGZqB8@1{S>jg^9S< zJ%q(K6|jHEMGCyE&d-MXlcAqEPo1$BtUk^M!-F%)LW(ff5M`wMWOponqa_9N7mGAK=By5A82=acZ}*|#?(8<}=iClqt9yH-=r$w1UXv+gDBlvg`>$uk$)@~1;4R@BYi`yXDn*FZ z(96*WIsV-~I`dMGx712A)!Za1JA4bQlNF&n&VugFwxyGA^_UhRi2Xddm-ac@g7?JQ zN)0jjwuJ9$DMH%aDc}@46QTpFAkAh2Rd)1(@5<|hQ)i{1{Ws+>OKT(e_-%q?X~VI- ziwbTw-b)LTL^94kASSA$fc2DgA@am1-cfm;PwgHL6Cdm2!T3b+>N$6E?(4z*Z<|o> zx;ly&u!tZ2c}yC+JILskH{W&~%O_-yQ}?Q8)SfhgpB#Hh(+w9=O5ckRaA;`FP50AK zI3t^rqq4c+!#yZ_X@lx2rc$)nC5q6SNrz+mvP?=VTsb4p#wHQGTsTXM=?XP}ZYJCL z)wI|DBAwkQ%ioG~=yIAWt-I^b2li<2e480u*Q1Q07oHK8Ra@ZqeLeYvPBhQbap!Xn zOQ7!YZ;A4v!CY~_E4k!$;nLR<>8TKXxX`keC$6yOm0y8_*3IFvHO_R-I*1KSR#8LU z9dY;9DmwCJt*G#FJSQmIvGXfmN?mAy)0f_)0#V>`k;SyJ@T_p>Y7MBcAOM<(MzGF*8`Hz+(A`s zt&*U)FXE_pe^isb25q6sKzMzdnsf`G@5^+zfmc1zHDEq$?;FDVFSODyUn3lA+>K7R zO`?%WGssQS7uTvd;N!mQNiEBSqrJ|-%L9kVu*eX+D!eHrg?9BQpyl!(L`#)3 zn2GNpza~M@$s5Qc&DM%bG;YC5|2D~?5wWyqYi>|5xz>4lmp?vaABUb`T(zb|cf)Z?vU8q3A_@#*k+Y^LFf zN>wUoe#D4Vw@u~NfQ_7dXFat=hR_iIwGe4&i>~dxFs{;-4u)8>;M9{PTMyB#jhf<; zQz_(gtDWNH^H_4G3l~dr=&0{TYTDuf;dN_a-ANSotUm8{(mn&%Pq>aXZ%VLx+&E}| zSxxCHs%gRNA_{5VPTOx7v5m_b`k5_7wi&GzCeO(bP9)u+hS`QJb3l!6>rNFlY+J;; z-@e27za#klgd1Y2;#J|#r3JL3V}|6a?b5gv)#a?jxj=?ryVk?SSr+J+c$)qe zr?LGLf3At0Dwud1!-v*dvQh5I7yIQ>llwio-=;#!rrqHCgA+7t?S5MDP=+to8?s^K zQ^~>bV7Pm;RM5ib@FYNr4C`Y6*L>3;^W;Op*kPKW*yjnI{xg^Ro|eFq^nLK_?hL*s zbCtH3w~*Y7^HiGnOWd<(5Sl%7f>W#WMZa`4cJ#34w2%d8H z$kuA0^w_)A0JOUHJY}XAU~J2sUlA5OQbF zfFJVVc%ZO1R1Z(CF?urGO=)QkYR7yQHjPrJ!}9~U#$ggKZ@S8ha0HLqIEm%emDuw0 zboPx*;PAi>B@uT+tUWS%BbUMlVFh6h52lgBdhlujV@A@ z^wFm#d~9?AudQ^3oBr0Aw6PF%lHG(ogW{lQp%MGaJraJ~0R_f25I9zW-m#zL^k4>! z^u9=AmOK|XHP%z$y4N)-&Ms2)P7?X1JrskEWwQHH7tWQQ7WDnHOvsU+%GI^&#l_*> zd2)gcPpFmWkJ*{5plL;+VP^Po`ZJjGxd;X%4(CH1$GH5II*y(kiid)~35VCai9sJ` zahxPU95%QwwoUDh+0Wv^a6k{~{Qg3CfhHVZa!njkeO45=x{CWhTf*eB4vF3MJMgU4 zhIJks50c$%!q!C{a6CT*n}(>06|O15 z^v}v@W|{$8rKtXkT3(>{zzkc)*y6YzA4oY?i_;cnQ)A+9$XR-cRQ#1`k#R1C3?EH> zLnrdc6+L($#e2=2q%@XDdAX-XESs$5FcVBu+Q%&Sg#`VPMl#VZfEnd}*!>4x3T~wD7B3uNA-fqX`K;klK|#XW6V@WI`ge~k_ow*T%Xp1j+{;l|bM zD0e{!*xOF~*AC~%?TdMn>?R(Ra)zS+Z028fA$+x8GOf#~C8s(?Uf*pynHsMYO{S@E z(3eY)x?A8RH>qA+G|=8!iNltZ3hy$92>Z{Z3s;x_rm<-bY5WePH;?D4y@Z5hA3#QWC(a)RY^-^HMq6Trr zrVu{$U72HDgYiwtW?Wu&k>}6d>E^iH0)IDj!^PVDIBrcipFQ-G)<~gwX1~fQ=S3_p z=o`dcp^duw6p~f)FJ8VhgW`bCA8_HCygF^MoJ z+Z$hcZxGhZeolYppP8?b;O%%U0tZzXLVz{#H`vJzG{faUIpP_JhNVBQ-fM`tkm+PW);>A`g%D zpfNC`6@F$xa$xPyO<{UHYVb%=9@T0&WY6@^x|j@ z18Jkv7u-+x1?XDL#lkM}_@zTI>$;0@v(AAUcH9s?KNG>`ikj$>S0f%$dPI$DhQRKI zL$If31(kfhMsKf1bByH@A^zr8eEe1!%<62p$EW+SAW$2-^fbeDI#=kD$xJK^*#-gr zU7+hU9WlLrDVnGD2ZJZd7!>)AzK-_*Dw5$&8x{HSmSo&=S`YU>3=`h=b>%5;zi9T; zN$9m!5qCUa!S&a+lkvgZu&()#^mL#R9;tNU8*bZRQ-dYF&fAHRBNp%ksgIWQEfp!fc>1#bD4*E#|7q;X zynoDR{+R1rb7s!WnK{?>{a#;|wVr}K-y)d$ zd(6Ec8*==dDYJ6TTUcf#OlJ!8Ft2@!Nb3$$%x;gu9r8EGEZsNYYqkm%yrNNRxHA

o@PU7;m zYK*CT#LgA51o}gq)=Z8g<&NFt`+kPz2D#A|_c7?XR0}bA+b{2bU+mCz0~mVeIC3x zV1YThdF*E?BoQHpnS$4zXf&PAOm`Ir6%Hf)b!#$&aT%<6%EPJfyGGJHYZ8A-vNRgZ zy98e^s8Dh~mrM$X0;8+LK!~_v!H;m_x?KZ1=gZLQWO*X4l8Q<9^T?8=BBZoEld(-7 z1R1@9FzJ{+Z7(#VYZcU}n}<1;7-f)(P&vACn-;5c+8ak_I)YeN@mWU}vN9Z|9~z&TxS`207S zc-J=s{d{b3ox%=?*bz$>2jya}$wXWdvJ5BgZh$4L&w<0>JZimA%=+_Pawnk{;| zgzoTaqLMo!Ny%4h5;S`wS{16$__!Dn7DK24H?Le1+{5bO12{iRiLU>eM%N5HA=(9x zSysQC9SDwxh^`=NIjBYjU)rF1QZifpu8s8BC*TUxqp;evjaVI;3a27tQRAi+-aq$* zut_?Yveb+m<Ov&M)*r{xQ_`AS1W#WQ@t+;rX1`eEDh@Kgv8I>w~QhrCB z)%i9O^Ky*w@kKYluU*V-nNjS=A10V(e-_rBx(0CY2rH-`4aFCI`6p%V(NX9#?~>4b zlE1-*W)?P3$;l$LR%;CnNll`GveRhE=R@%GXE7Vq*-V~oZs%ulRTi#GrqKi$Wyld) zPA4{6(YI%{Fi|y*ytlgt$}tpYC#FM*w;>$KD{ED|QN%v`b*OHU+*Q_1FcU78=TViWW_I(aDRj5iB)Z$!pX_vA zM-yV^5;al;_mlGZN!MCIgj?_PVuW#=$yG*90m-SHGq5Ak53khpfR1Guc`;lOr2{ij zbbT-gsJl~3X#qBH?r54MC(bO?Na3E*cF;ZK0E~YuD>LOCasO&VMpT~yzS0hSBb<++ zonGj5H=Z4Sp%oVkmf|p}9^5zV8Lm9EAA9sF@hoBS%Ce(4-M1NZMtmmUOkMCmU@gRt zTg7&z9D|M}a_~v#Jh%iehhh3BiH?{oI%&4R9sg03s_4>1=64~_q!K!XUGT2!RaW>* z9K14fC83gcNwfJFdX{I8W18>6V7@Xv^!+ON>ekAPv)O<U3)QZhkUeYssm~J`x=6`}r|;DUtjloP6Jg58zuQQSbV@OJ$ysnasf44A z{IP6L22;T0J)$%C_@THJ9o;`did`A&H|q=eP;Ekw*^MRV)GcuYdy%);f{T48^^nsc zcj=h*Pk^>J6Uk4S$gYy33RwrK<*BpGQ5ki7-IK@9v=^qWPhQiveQBg*r!bO8ar9oj zgjZtkPS<`@VxB78CSvzHsB$QF z|IBJJ4k$Nn2wX4M;-_0(Q0Q|9RK*eYPE@7CtkhA2T_XFDvYDp7Gl&P3;$-CcJH){1=&r(do=AeC{EDq0767TT%}* zXyy&eC6n>!T1ObrsRs4j`_TR*9U_lGb8d zQLs`?i(1*}Gwo_AWSrp6{iJ1%F&Vi!hMyE1 z!9P77#hg$qOxv^-)wbj%Sy;g~aORk;kGV z!98yU8>QRp_B&d9%7#C-vhqQr@`|Ox8^Q)fhUka zTsefwckd$p%a@T%f}1HgE^Q(EI@jRNFPGqq#z8oICWPczT2S4&MeL#Q`DEGf91DKh#7~uS#TS!?<52rKfSU2VE>#WY+HfZ#G&+{rgbyRP@)F>N z{%Q!=YDdi%x-q$C;@I}Z6?dmUgH8`ElvZ?w{Ky^n&}2Mz&=7LTVj47A<+387lh8J- zmUr?)7A(;*z@-t@FeIvv+g?6{pb4X3tepWJ|7!uw8nc!xSSAhjMa?ko{zJ0t2bVV& zodyTa+T&s0Zjkw6j{_#c_|9Gjcdbmqk()Z9=8Q7G>9ROnjy7NgeRXilYjN;B?8JCP z2omwkH{_u2LD-`s3*{G|vp0uJ(s%h${HhaQq3g~pp2kQQ;>$>a(bj129=ry}NCE8I zaUG^+*0JCBh>#Yy9PrADB!U$Hb=K>!LCzHChV;XInI2;5f73$7WD?%YlSe1J({R_b z9<&2$LAN0j+729n#I6f$claB2^WP|)|&^QBxYlori=p2ywu^TE(2B5iUGptyb zNkqKMz(_(HcQF^qT-$rhpxy^|DDgE(G_7D2w!DkDO>`(ZF@+m@%IGcypW_TM>z0gR+>z>g)MknI-Hz6*b6Z&yZq` zZ?K_15pE`DL$~llUW4~9a?4Es=BFoP%jarxV^}@W-an7jHo4J*&-YQkR6Vk-XcBr| zs9`=eO2N~Gxg_228{_z;8@8$oUmTwtQdYT*0kmH6y|F@AJ81sSbH5P9PUqjlW{C&}jG1HU-jX)zzC z9uUBt-y7Kd`^I1cUl|{sy#tlux5-*VEzkQbb<{r>Po^DZo+Jhnd}HtCo=xkXgt1j39;C4g(!{{#YnAtOoVYUvvn|o zY%?At&p+zI{E36)&e0S`%dr7I$QY4VE7i!{84scD%z9S5sEhfLak$QS-w5Qw{@Cat zL#2cE!D*{$bne0h)Hvr0fBl3i;{8yDI4dSGUI$k)mb3N9y5bNJ&GKby1g=BxL+%64 z^MTUqo}_}S!W5M@0KJ-mxTQ}3XFPpOHq_LTwVqNtW=5ZCnnzvtqOG9($ z?Dw|Rz+NBr-n1qcmM!9tfmx6zUI;^L9z#lwFo&Bt#CmU=k1r(z@Vc!#I1q6TPY{h4 z2W{bd{AjW#UkCSoutmk?q39shK<053aMr~d(UNy+^k!{5;Ta|~I_tx6#NIvR=7SQV zd|ru56syo>wT2kwcMbw?x$+J_vLO>TsE{?=PD7Ql2yxjy1vC18f#I94gsfIz3r+#g zy?7oTFHRvpy)*d!+sgs9rjar0y%@`GJ+ewJ7h*4%5%1?Yq$Mtr|6r*(V-c{1^897V zv=Q<+jEKuO?9ugYJln6u~4!!g*o6npEq098jEhk zP>+CbytGB39L|M1Z-Emz;ND5J#Z>XQKpHl12)gd%`>e#=<>+oFjUoE;uzHa+CN3=m z6?;3Zc_NPYMkrzasul3KKpFcMO(VC$gGhNBl3ccSegcKUI{HuxnNJKCbQh>7dyA5931Rc!F;U&M)sL9`YsG(y#}U`{2y0g zSjlcOF;o(39P&VG<5n`fyNB^kKT4eLE~2G#e(}b9HAQieQEc{9A67yp2o8t&V`;V- zB##;a6~Sh>+a{hD{4o;R4bPKtk{duVjzgq(34wd!UUvIve`fMJJMzNE9HcVDF-OJ_ z?3+d5Y_J|I_UvX}rEa3p%VuFva16h@Rf8@$JQYh7Z1{2$H?n(|&!CnW(qyM*98A^m zAa^?>IQHBaDl+yZSh;aH6E418T9yF2zXae%uY*u8_nk5CnT^h$mf)E~@8NUp69}Ga zPE(a+PR1@g4+h_DaKYgZaBlf1Y`nmIlLF2_dfI+Q`_X0gXVp!ZR^fpWa`hz0 zZVY&@a7P<`M{XTziYEHqAo4+nJ}ooHew`)gxjc}3#C`Chy@jVF~hA!3kEwpzu~Eq*$*rpgiWpYTCz{0d+;|AYx^0`Z@0oE2r<0iXmW0|=0Ls=2@cP21A4MS8?5yBCk zn!Hs)La6)q!mqZO&^4$)&jd~*oiD23o|+DJ%g8}y-BaGUN;mW~tbp*v 
z=h(3k^|12J9uU@(!4TPdpy&LNv}je3MvqWtgQ6IoiO~b`Gvf5J)B~nOXCf23e}IHI z4iKT^2{1Ts26?%pi?rX&Aj%a^#KWo-EO&N+NADcAu5SQvel^&?m_YmV1khou2ECqo z8*HDZ;>&I)v^uOv{q}tVoB89oyiPFkX2|1>)?#8e=^9VIbBNi;v4LLHIbhZjCD^!E z6Af&g@m}c>v=<6R1*z{$|C3yF)-Oe+#iud$X*xM^a}F&WZ-5)P|3U8yzPO}&KW!az zosRzA4+)AUr1xYIRWqrjD(aC~(*BUB=|5%5gPriX`Ye=ijKXR9m&n5tYpB72G%~zy zh%C(iLQL#6>E;oAOwEyMc=&KKy&gV}ro{;$`^X*jmb$`ErxqeTMHcVRR-!N1e5lYx zSaf%Yo%)u;uV#7Cc9~6Nb5{XToH+wpbKkSQa-$(V_9~kr_#PhHX%T!N2*>+E@x&1+ z=vBT1x=-VYqjn;@Ytl`!yXPM8bhtS(Mh*w~d$7MSkIwL#2+7C2F|bY#F6nfywD0J59OgvSeSu&EA{sJLnw zG1@bgT7B~&u|oaKa%lys?sAPddF-O2^2Ldng9NkGBbg3Y^rL<F8s z(&m|qDe1oqKdmC6mp_h;l(fXN>5j;1Gw8YgDcF6;W~9E!(NFgJbVafPIPbp$PA;K9 z!3>%OftRpp7_h3+h%-F!(~Lw^)Ud)EB26&SM4$L>PPTLvuZ7x+r=js!1$ilWnmtG_ z!jM%r&!NeXnC_Fsy9wrazta|7xr$L~q5VWDmSc;)IRuX#_Ttn9O^|qLD}8RnqtkC@ zvR}tVpu>$I*1f))JZsOu2*EU#NwkG9RXv=xEgl@=(#g&^3jL-}V4r#v?^Drn@*>=T zNNCzn!}^)HSWX2k+^(?uFMHxW?jA1;^e37%FJRs~Q`#S&1y0YVlj_Gl*fsGgT)RW4 zfDpkeW+iZ^>>+#Y)+2cN<0c%SUPSkjCTf3(g2CP^#OX^f^I+U1E9Z@|R6%z+xi%^Y z>w*vSbbc9Q?l&iD9GFP5R>fM4aNkM3I1hnQz5;4js^I-V235Gqa;?j((QmIYiZ0&8 zdwWJ2lT7?zm@l7cyyy>mcbhU>dA~rfUI!ADz3C*UD`Zi38Mz@7!L4!b!-F9!E5%p) zsn@D{a_biZax=;qzlK7NC0g|PTqkgj*eOBuVMO5!~n zJXb`XwW$%^)~%TObqrRw$brnKUFfLV%1$&CBhj~%P_A+e3F;`o>9w+S>kU7eyrZ6d zveg;YM1?VMlM!9fIfXtce#nlj*#O1CYILo)8NKYX0u=Nda82TC(0Fs3jP;Br7<`6N z@o<3N+}-fRS{p3{Gbs&iAoq87@RiGI3ZIYpX!zm5R8#C0fd%;jP<;$UV)31n)_ zV@9w1!4!?RK*wP_nWm|(Wb1_vXgXtsWsEaO9qA?f=V55YRRb-$6OEQ4f|w)sl-%cX zBJIO@==kb8=*fSF_LufpmUjq?gZQv%{uKc~t+{)# zC#npNWn|E@;bgubMud7EL*003bQV@Vqd5vj2ZKdy%MN{<|9(@gwRO*wm*#2 z#lPo01TQp}T>x(^cI|B!*ni@zQ=CZKn#i5BzYPu z&5{6nYb>3mOsjY4)5Rb{)oV4GjiowR8&F3(qvud7#q%VbS<085qKpD{-*~!Qo;Gs% z2GYDNiLCydgPU{1vFBkB(>fqZUw?9~hM!w*l zS9Oi==c9FFBW&9@A6NFwC5P2zxxXcUM#Z)h?EY98UB6a@qlO6lgB+44qTDIRb{q&n zEjz*bK8qL_Yc59{5=Yi&FZhCvn$8@BaGG@|SDs__k5^EsCK zYM}&Q+S(;;r1gTg8R$KrUf*A-L9&BaP|F+sczbRydV8m1xsMj)xB1iYUcYG6yEZEM za0c!sO=`1I4c|;FLcJT~aMsN;bmUc}cJt)w^R_qPw2mp5 z8?}lm`jyhwuZ>tW7>Fl|cY$3>5c}&%Doqk?WM7NiBWv3u=@_e8`tG@r_2jTJIvCMP z8L=k1zPOlnwr!!}`rSA`B^!KxD&VrZDY*XRb}SUwT3=;sUO)S?c)imrjr#T3uTZzq z1#>G(aYEfGT#=xJ3D!}dV{rhN8gmrkV+RVqH80~SrTVS3o46fTvHXDqV|#TryevU z=|kJ=A*pWOB-quu>hPnYqH-=@h-O)&X6Ut+%ql^c@$#g3eOh za@ig`#`=)Kep#Ag$D^{dCF@51cd);}-Q(d4k<6c;^(*vd_0uS_=GQXWDpO}pu*V@s*S(aU71m5jCWejhAx@4}aF z7vTG}Iy!yP0QKPC##IgND6oA8CIz?91=*QYAh-)}XJw<;qBCI1(a1WyyU55;Bh-

#@kY=I4PB+Efc4VJ^|ei)vTj|DE2q;iHH7LTG>-U&k0VT!Ho^{V~qp- z?we1?8Od3DmVbt@6OJ^M6{SP<_TmEEt9-M8V9}l;N^5@e4IY4KA0~LE&+C^ z_WB{lUrj~TcXnv^2ZadzsegNQ1UOdJ*fMTXs~PhcHy`Jd5~&0>-5?fRUW;IjT@iCc ztRMc+ZmPZ~$Hx*7``yk)KBO29etQfj*2tpe6*u%)XO7RZ7ozcSaTH$Th`I}Q<5JTY zJl&TK^+&y!#Ya;4oeNq}RLL9*=a2r=%WR=PNlZ{y)JjMoLeTF2Q6^ts_wVKrf)aNB zO;*&HnC4^N(g0fAI9Lo_QPnzk~)PB*er-<%e^Q1-*6u zRKwro-%kG9=JzJyJ>kELnEcV@=s&qMm6w)~ Date: Mon, 29 Mar 2021 10:14:29 -0600 Subject: [PATCH 80/84] Make Autopad static when available (#7755) --- python/tvm/relay/frontend/onnx.py | 3 +-- tests/python/frontend/onnx/test_forward.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index d9fc2ff99a765..01c22d0f9fab0 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -397,7 +397,7 @@ def autopad(data, strides, kernel_shape, dilations, ndim, pad_type="constant", d # pad N and C with zeros pad = _op.concatenate([_op.const(np.zeros([2, 2], dtype="int64"), dtype="int64"), pad], axis=0) - return _op.nn.pad(data, pad, _op.const(0.0), pad_type) + return _op.nn.pad(data, fold_constant(pad), _op.const(0.0), pad_type) class Conv(OnnxOpConverter): @@ -809,7 +809,6 @@ def _impl_v11(cls, inputs, attr, params): pad_width_expr = fold_constant(_op.transpose(_op.reshape(pads, (2, -1)))) pad_mode = attr.get("mode", b"constant").decode("utf-8") - if not pad_mode in ["constant", "edge", "reflect"]: raise tvm.error.OpAttributeInvalid( "Value " + pad_mode + ' in attribute "mode" is invalid for operator Pad.' diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index 9116b09764091..772d834bbe273 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -2703,7 +2703,7 @@ def verify_pooling(x_shape, kernel_shape, strides, pads, out_shape, mode, auto_p ) model = helper.make_model(graph, producer_name="pooling_test") - verify_with_ort(model, [x_shape], [out_shape], use_vm=True, convert_to_static=True) + verify_with_ort(model, [x_shape], [out_shape], use_vm=False, convert_to_static=True) @tvm.testing.uses_gpu From e8752c95ffe24f79a68d764c8815de3af8feeed9 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Mon, 29 Mar 2021 12:39:22 -0400 Subject: [PATCH 81/84] [VTA] Make more explicit error message during sim lib loading failures. 
(#7761) --- vta/python/vta/testing/simulator.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/vta/python/vta/testing/simulator.py b/vta/python/vta/testing/simulator.py index 7f2471c615321..05f37c182815b 100644 --- a/vta/python/vta/testing/simulator.py +++ b/vta/python/vta/testing/simulator.py @@ -17,6 +17,7 @@ """Utilities to start simulator.""" import ctypes import json +import warnings import tvm from ..environment import get_env from ..libinfo import find_libvta @@ -27,25 +28,30 @@ def _load_sw(): env = get_env() lib_driver_name = "libvta_tsim" if env.TARGET == "tsim" else "libvta_fsim" + require_sim = env.TARGET in ("sim", "tsim") + libs = [] # Load driver library - lib_driver = find_libvta(lib_driver_name, optional=True) - assert lib_driver + lib_driver = find_libvta(lib_driver_name, optional=(not require_sim)) + + if not lib_driver: + return [] + try: libs = [ctypes.CDLL(lib_driver[0], ctypes.RTLD_GLOBAL)] - except OSError: + except OSError as err: + if require_sim: + raise err + warnings.warn("Error when loading VTA driver {}: {}".format(lib_driver[0], err)) return [] if env.TARGET == "tsim": lib_hw = find_libvta("libvta_hw", optional=True) assert lib_hw # make sure to make in ${VTA_HW_PATH}/hardware/chisel - try: - f = tvm.get_global_func("vta.tsim.init") - m = tvm.runtime.load_module(lib_hw[0], "vta-tsim") - f(m) - return lib_hw - except OSError: - return [] + f = tvm.get_global_func("vta.tsim.init") + m = tvm.runtime.load_module(lib_hw[0], "vta-tsim") + f(m) + return lib_hw return libs From af6f1674495016a934e9854f283729eebd95ecdc Mon Sep 17 00:00:00 2001 From: Bohan Hou <32121147+spectrometerHBH@users.noreply.github.com> Date: Tue, 30 Mar 2021 02:36:28 +0800 Subject: [PATCH 82/84] [ARITH] normalize iter affine map expr to PrimExpr (#7759) --- python/tvm/arith/__init__.py | 2 +- python/tvm/arith/iter_affine_map.py | 16 +++++ src/arith/iter_affine_map.cc | 58 +++++++++++++++++++ .../analysis}/expr_complexity.cc | 0 .../unittest/test_arith_iter_affine_map.py | 21 +++++++ 5 files changed, 96 insertions(+), 1 deletion(-) rename src/{arith => tir/analysis}/expr_complexity.cc (100%) diff --git a/python/tvm/arith/__init__.py b/python/tvm/arith/__init__.py index 77ec869a171eb..05843ede9284b 100644 --- a/python/tvm/arith/__init__.py +++ b/python/tvm/arith/__init__.py @@ -22,4 +22,4 @@ from .pattern import detect_linear_equation, detect_clip_bound from .int_solver import solve_linear_equations, solve_linear_inequalities from .iter_affine_map import IterMapExpr, IterMark, IterSplitExpr, IterSumExpr -from .iter_affine_map import detect_iter_map +from .iter_affine_map import detect_iter_map, normalize_iter_map_to_expr diff --git a/python/tvm/arith/iter_affine_map.py b/python/tvm/arith/iter_affine_map.py index 4033d797dff89..5aa817bd7a24e 100644 --- a/python/tvm/arith/iter_affine_map.py +++ b/python/tvm/arith/iter_affine_map.py @@ -112,3 +112,19 @@ def detect_iter_map(indices, input_iters, predicate=True, require_bijective=Fals Empty array if no match can be found. 
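+        For example, with x in [0, 8) and y in [0, 4), the index x * 4 + y is
+        detected as a single fused iterator spanning [0, 32).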
""" return _ffi_api.DetectIterMap(indices, input_iters, predicate, require_bijective) + + +def normalize_iter_map_to_expr(expr): + """Given an IterMapExpr, transform it to normal PrimExpr + + Parameters + ---------- + expr : IterMapExpr + the input IterMapExpr + + Returns + ------- + result : PrimExpr + the corresponding normal PrimExpr + """ + return _ffi_api.NormalizeIterMapToExpr(expr) diff --git a/src/arith/iter_affine_map.cc b/src/arith/iter_affine_map.cc index 3757b5eb0d51e..a49478a436355 100644 --- a/src/arith/iter_affine_map.cc +++ b/src/arith/iter_affine_map.cc @@ -1028,5 +1028,63 @@ PrimExpr IterMapRewriter::VisitExpr_(const FloorModNode* op) { } } +/*! * \brief Given an IterVarMapExpr, transform it to normal PrimExpr. */ +class IterMapToExprNormalizer { + public: + explicit IterMapToExprNormalizer(Analyzer* analyzer) : analyzer_(analyzer) {} + + PrimExpr Convert(const IterMapExpr& expr) { + if (const auto* op = expr.as()) { + return ConvertIterSplitExpr(GetRef(op)); + } else if (const auto* op = expr.as()) { + return ConvertIterSumExpr(GetRef(op)); + } else { + ICHECK(expr.defined()); + LOG(FATAL) << "Unknown IterMapExpr type " << expr->GetTypeKey(); + return 0; + } + } + + PrimExpr ConvertIterSumExpr(const IterSumExpr& expr) { + PrimExpr res = 0; + for (const IterSplitExpr& arg : expr->args) { + res += ConvertIterSplitExpr(arg); + } + res += expr->base; + return res; + } + + PrimExpr ConvertIterSplitExpr(const IterSplitExpr& expr) { + PrimExpr source; + if (const auto* op = expr->source->source.as()) { + source = GetRef(op); + } else if (const auto* op = expr->source->source.as()) { + source = ConvertIterSumExpr(GetRef(op)); + } else { + LOG(FATAL) << "Unexpected source of IterSplitExpr"; + } + if (analyzer_->CanProve(expr->extent == expr->source->extent) && is_one(expr->lower_factor)) { + return source * expr->scale; + } else if (analyzer_->CanProve(expr->source->extent == expr->lower_factor * expr->extent)) { + return floordiv(source, expr->lower_factor) * expr->scale; + } else { + return floormod(floordiv(source, expr->lower_factor), expr->extent) * expr->scale; + } + } + + private: + Analyzer* analyzer_; +}; + +PrimExpr NormalizeIterMapToExpr(const IterMapExpr& expr) { + arith::Analyzer analyzer; + IterMapToExprNormalizer normalizer(&analyzer); + return normalizer.Convert(expr); +} + +TVM_REGISTER_GLOBAL("arith.NormalizeIterMapToExpr").set_body_typed([](const IterMapExpr& expr) { + return NormalizeIterMapToExpr(expr); +}); + } // namespace arith } // namespace tvm diff --git a/src/arith/expr_complexity.cc b/src/tir/analysis/expr_complexity.cc similarity index 100% rename from src/arith/expr_complexity.cc rename to src/tir/analysis/expr_complexity.cc diff --git a/tests/python/unittest/test_arith_iter_affine_map.py b/tests/python/unittest/test_arith_iter_affine_map.py index ac05809449bd0..5ce68aaaf51b6 100644 --- a/tests/python/unittest/test_arith_iter_affine_map.py +++ b/tests/python/unittest/test_arith_iter_affine_map.py @@ -285,9 +285,30 @@ def test_predicate(): assert len(res) == 0 +def test_normalize_iter_map_to_expr(): + fld = tvm.tir.floordiv + flm = tvm.tir.floormod + + x = tvm.tir.Var("x", "int32"), 10 + y = tvm.tir.Var("y", "int32"), 9 + + xo, xi = isplit(x, 5) + yo, yi = isplit(y, 3) + z = ifuse([yo, xo, yi]) + + res = tvm.arith.detect_iter_map([z[0], xi[0]], var_dom([x, y])) + + tvm.ir.assert_structural_equal( + tvm.arith.normalize_iter_map_to_expr(res[0]), + fld(y[0], 3) * 6 + fld(x[0], 5) * 3 + flm(y[0], 3), + ) + 
tvm.ir.assert_structural_equal(tvm.arith.normalize_iter_map_to_expr(res[1]), flm(x[0], 5)) + + if __name__ == "__main__": test_split() test_trivial() test_fuse() test_compound() test_predicate() + test_normalize_iter_map_to_expr() From 53d44efaabf83a92025748ec75d71db1875451bb Mon Sep 17 00:00:00 2001 From: Tristan Konolige Date: Mon, 29 Mar 2021 15:46:13 -0700 Subject: [PATCH 83/84] [FIX] Fix android projects (#7764) src/runtime/logging.cc was missing from the runtime files list --- apps/android_camera/app/src/main/jni/tvm_runtime.h | 1 + apps/android_deploy/app/src/main/jni/tvm_runtime.h | 1 + apps/android_rpc/app/src/main/jni/tvm_runtime.h | 1 + 3 files changed, 3 insertions(+) diff --git a/apps/android_camera/app/src/main/jni/tvm_runtime.h b/apps/android_camera/app/src/main/jni/tvm_runtime.h index 1fd19b29e20bf..406effa34aaa7 100644 --- a/apps/android_camera/app/src/main/jni/tvm_runtime.h +++ b/apps/android_camera/app/src/main/jni/tvm_runtime.h @@ -39,6 +39,7 @@ #include "../src/runtime/file_utils.cc" #include "../src/runtime/graph/graph_runtime.cc" #include "../src/runtime/library_module.cc" +#include "../src/runtime/logging.cc" #include "../src/runtime/module.cc" #include "../src/runtime/ndarray.cc" #include "../src/runtime/object.cc" diff --git a/apps/android_deploy/app/src/main/jni/tvm_runtime.h b/apps/android_deploy/app/src/main/jni/tvm_runtime.h index 6fe8c98756bb8..a89475440714e 100644 --- a/apps/android_deploy/app/src/main/jni/tvm_runtime.h +++ b/apps/android_deploy/app/src/main/jni/tvm_runtime.h @@ -34,6 +34,7 @@ #include "../src/runtime/file_utils.cc" #include "../src/runtime/graph/graph_runtime.cc" #include "../src/runtime/library_module.cc" +#include "../src/runtime/logging.cc" #include "../src/runtime/module.cc" #include "../src/runtime/ndarray.cc" #include "../src/runtime/object.cc" diff --git a/apps/android_rpc/app/src/main/jni/tvm_runtime.h b/apps/android_rpc/app/src/main/jni/tvm_runtime.h index 0745c30ffb1d4..9503f02f39ef0 100644 --- a/apps/android_rpc/app/src/main/jni/tvm_runtime.h +++ b/apps/android_rpc/app/src/main/jni/tvm_runtime.h @@ -40,6 +40,7 @@ #include "../src/runtime/graph/graph_runtime.cc" #include "../src/runtime/graph/graph_runtime_factory.cc" #include "../src/runtime/library_module.cc" +#include "../src/runtime/logging.cc" #include "../src/runtime/module.cc" #include "../src/runtime/ndarray.cc" #include "../src/runtime/object.cc" From 9b43a646fc70f3d4a2d033c73380e0bae75f4586 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Mon, 29 Mar 2021 17:07:21 -0700 Subject: [PATCH 84/84] Rename GraphRuntime to GraphExecutor (#7653) --- CMakeLists.txt | 44 ++- .../Camera2BasicFragment.java | 28 +- .../app/src/main/jni/tvm_runtime.h | 2 +- apps/android_camera/models/prepare_model.py | 2 +- .../apache/tvm/android/demo/MainActivity.java | 28 +- .../app/src/main/jni/tvm_runtime.h | 2 +- .../app/src/main/jni/tvm_runtime.h | 4 +- apps/benchmark/arm_cpu_imagenet_bench.py | 2 +- apps/benchmark/gpu_imagenet_bench.py | 2 +- apps/benchmark/mobile_gpu_imagenet_bench.py | 2 +- apps/bundle_deploy/Makefile | 12 +- apps/bundle_deploy/README.md | 8 +- apps/bundle_deploy/bundle.c | 32 +- apps/bundle_deploy/bundle.cc | 2 +- apps/bundle_deploy/bundle_static.c | 34 +- apps/bundle_deploy/runtime.cc | 2 +- apps/howto_deploy/cpp_deploy.cc | 8 +- apps/howto_deploy/tvm_runtime_pack.cc | 6 +- apps/ios_rpc/tests/ios_rpc_mobilenet.py | 4 +- apps/ios_rpc/tvmrpc/TVMRuntime.mm | 4 +- .../reference-vm/zephyr/rebuild-tvm.sh | 2 +- cmake/config.cmake | 14 +- cmake/modules/CUDA.cmake | 10 +- 
cmake/modules/LibInfo.cmake | 6 +- cmake/modules/StandaloneCrt.cmake | 6 +- cmake/modules/contrib/ArmComputeLib.cmake | 22 +- cmake/modules/contrib/TensorRT.cmake | 2 +- conda/recipe/bld.bat | 2 +- conda/recipe/build.sh | 2 +- docker/Dockerfile.demo_android | 2 +- docker/install/install_tvm_cpu.sh | 2 +- .../{graph_runtime.rst => graph_executor.rst} | 4 +- docs/api/python/index.rst | 2 +- docs/api/python/relay/backend.rst | 2 +- docs/deploy/arm_compute_lib.rst | 12 +- docs/deploy/bnns.rst | 4 +- docs/deploy/tensorrt.rst | 2 +- docs/deploy/vitis_ai.rst | 14 +- docs/dev/debugger.rst | 14 +- docs/dev/index.rst | 2 +- docs/dev/microtvm_design.rst | 22 +- docs/dev/virtual_machine.rst | 10 +- docs/install/from_source.rst | 2 +- golang/sample/complex.go | 10 +- golang/src/function_test.go | 2 +- golang/src/tvm_runtime_pack.cc | 4 +- include/tvm/runtime/crt/error_codes.h | 10 +- .../crt/{graph_runtime.h => graph_executor.h} | 54 +-- ...ntime_module.h => graph_executor_module.h} | 14 +- .../{GraphRuntime.java => GraphExecutor.java} | 6 +- .../org/apache/tvm/contrib/GraphModule.java | 4 +- ...untimeTest.java => GraphExecutorTest.java} | 8 +- ...raph_runtime.py => test_graph_executor.py} | 2 +- .../tvm/auto_scheduler/relay_integration.py | 6 +- python/tvm/autotvm/task/relay_integration.py | 8 +- ...raph_runtime.py => cuda_graph_executor.py} | 30 +- python/tvm/contrib/debugger/debug_executor.py | 239 +++++++++++++ python/tvm/contrib/debugger/debug_runtime.py | 228 +------------ python/tvm/contrib/graph_executor.py | 306 +++++++++++++++++ python/tvm/contrib/graph_runtime.py | 295 +--------------- python/tvm/driver/tvmc/compiler.py | 2 +- python/tvm/driver/tvmc/runner.py | 10 +- python/tvm/micro/__init__.py | 4 +- python/tvm/micro/model_library_format.py | 6 +- python/tvm/micro/session.py | 28 +- python/tvm/relay/analysis/analysis.py | 2 +- ...e_codegen.py => graph_executor_codegen.py} | 10 +- ...e_factory.py => graph_executor_factory.py} | 14 +- python/tvm/relay/build_module.py | 26 +- python/tvm/relay/frontend/common.py | 4 +- .../tvm/relay/op/contrib/arm_compute_lib.py | 2 +- python/tvm/relay/op/contrib/tensorrt.py | 2 +- python/tvm/relay/quantize/_calibrate.py | 4 +- rust/tvm-graph-rt/Cargo.toml | 2 +- rust/tvm-graph-rt/src/graph.rs | 2 +- rust/tvm/README.md | 2 +- rust/tvm/examples/resnet/src/build_resnet.py | 6 +- rust/tvm/src/lib.rs | 2 +- rust/tvm/src/runtime/graph_rt.rs | 20 +- src/relay/analysis/get_calibration_data.cc | 4 +- src/relay/backend/build_module.cc | 6 +- src/relay/backend/compile_engine.cc | 2 +- .../contrib/arm_compute_lib/codegen.cc | 6 +- src/relay/backend/contrib/tensorrt/codegen.cc | 12 +- ...e_codegen.cc => graph_executor_codegen.cc} | 26 +- src/relay/backend/graph_plan_memory.cc | 2 +- src/relay/transforms/partition_graph.cc | 2 +- .../contrib/arm_compute_lib/acl_runtime.cc | 8 +- .../contrib/tensorrt/tensorrt_runtime.cc | 4 +- src/runtime/crt/Makefile | 4 +- src/runtime/crt/common/crt_runtime_api.c | 2 +- .../graph_executor.c} | 321 +++++++++--------- .../load_json.c | 2 +- .../graph_executor_module.c} | 122 +++---- src/runtime/crt/host/main.cc | 10 +- .../graph_executor.h} | 66 ++-- .../load_json.h | 14 +- .../cuda_graph/graph_runtime_cuda_graph.cc | 37 +- .../debug/graph_executor_debug.cc} | 40 +-- .../graph_executor.cc} | 73 ++-- .../graph_executor.h} | 22 +- .../graph_executor_factory.cc} | 91 ++--- .../graph_executor_factory.h} | 50 +-- src/runtime/metadata_module.cc | 2 +- ...raph_runtime.cc => utvm_graph_executor.cc} | 16 +- ..._graph_runtime.h => 
utvm_graph_executor.h} | 16 +- src/runtime/micro/standalone/utvm_runtime.cc | 14 +- src/support/libinfo.cc | 18 +- src/target/metadata_module.cc | 2 +- tests/azure-pipelines/main.yml | 6 +- tests/cpp/build_module_test.cc | 8 +- tests/cpp/relay_build_module_test.cc | 2 +- tests/micro/test_runtime_micro_on_arm.py | 16 +- tests/micro/zephyr/test_zephyr.py | 6 +- .../test_arm_compute_lib/infrastructure.py | 4 +- .../contrib/test_bnns/infrastructure.py | 4 +- .../contrib/test_bnns/test_onnx_topologies.py | 4 +- tests/python/contrib/test_coreml_codegen.py | 2 +- .../contrib/test_ethosn/infrastructure.py | 4 +- tests/python/contrib/test_tensorrt.py | 8 +- .../contrib/test_vitis_ai/infrastructure.py | 4 +- tests/python/frontend/caffe/test_forward.py | 4 +- tests/python/frontend/caffe2/test_forward.py | 4 +- tests/python/frontend/coreml/test_forward.py | 8 +- tests/python/frontend/darknet/test_forward.py | 4 +- tests/python/frontend/keras/test_forward.py | 4 +- tests/python/frontend/mxnet/test_forward.py | 8 +- .../frontend/mxnet/test_qnn_ops_utils.py | 6 +- tests/python/frontend/onnx/test_forward.py | 4 +- tests/python/frontend/pytorch/qnn_test.py | 2 +- tests/python/frontend/pytorch/test_forward.py | 6 +- .../frontend/tensorflow/test_bn_dynamic.py | 4 +- .../frontend/tensorflow/test_forward.py | 16 +- tests/python/frontend/tflite/test_forward.py | 8 +- .../test_quantization_accuracy.py | 2 +- .../python/relay/benchmarking/benchmark_vm.py | 12 +- ..._auto_scheduler_layout_rewrite_networks.py | 4 +- .../relay/test_auto_scheduler_tuning.py | 4 +- ...time.py => test_backend_graph_executor.py} | 8 +- tests/python/relay/test_cpp_build_module.py | 6 +- tests/python/relay/test_external_codegen.py | 10 +- tests/python/relay/test_json_runtime.py | 8 +- tests/python/relay/test_op_fast_math.py | 4 +- tests/python/relay/test_op_level2.py | 4 +- tests/python/relay/test_op_qnn_concatenate.py | 2 +- .../relay/test_op_qnn_conv2_transpose.py | 4 +- tests/python/relay/test_op_qnn_conv2d.py | 10 +- tests/python/relay/test_op_qnn_dense.py | 4 +- tests/python/relay/test_op_qnn_dequantize.py | 6 +- tests/python/relay/test_op_qnn_mul.py | 2 +- tests/python/relay/test_op_qnn_quantize.py | 6 +- tests/python/relay/test_op_qnn_requantize.py | 4 +- .../relay/test_op_qnn_simulated_dequantize.py | 4 +- .../relay/test_op_qnn_simulated_quantize.py | 4 +- tests/python/relay/test_param_dict.py | 8 +- .../python/relay/test_pass_annotate_target.py | 6 +- tests/python/relay/test_pass_annotation.py | 16 +- tests/python/relay/test_pass_legalize.py | 2 +- .../relay/test_pass_legalize_tensorcore.py | 2 +- .../python/relay/test_pass_partition_graph.py | 6 +- tests/python/relay/test_pass_qnn_legalize.py | 2 +- .../relay/test_simplify_fc_transpose.py | 4 +- .../python/relay/test_sparse_dense_convert.py | 4 +- tests/python/topi/python/test_topi_qnn.py | 6 +- tests/python/unittest/test_crt.py | 6 +- tests/python/unittest/test_link_params.py | 20 +- .../test_micro_model_library_format.py | 4 +- tests/python/unittest/test_runtime_graph.py | 14 +- .../unittest/test_runtime_graph_cuda_graph.py | 6 +- .../unittest/test_runtime_graph_debug.py | 6 +- .../unittest/test_runtime_heterogeneous.py | 8 +- .../test_runtime_module_based_interface.py | 66 ++-- .../unittest/test_target_codegen_blob.py | 6 +- .../unittest/test_tir_transform_hoist_if.py | 4 +- tests/scripts/task_config_build_gpu.sh | 2 +- tests/scripts/task_java_unittest.sh | 2 +- tests/scripts/task_rust.sh | 4 +- tutorials/auto_scheduler/tune_network_arm.py | 6 +- 
tutorials/auto_scheduler/tune_network_cuda.py | 6 +- tutorials/auto_scheduler/tune_network_mali.py | 6 +- tutorials/auto_scheduler/tune_network_x86.py | 6 +- tutorials/autotvm/tune_relay_arm.py | 2 +- tutorials/autotvm/tune_relay_cuda.py | 2 +- tutorials/autotvm/tune_relay_mobile_gpu.py | 2 +- tutorials/autotvm/tune_relay_x86.py | 2 +- tutorials/frontend/build_gcn.py | 6 +- tutorials/frontend/deploy_model_on_android.py | 4 +- tutorials/frontend/deploy_model_on_rasp.py | 2 +- tutorials/frontend/deploy_prequantized.py | 2 +- .../frontend/deploy_prequantized_tflite.py | 4 +- tutorials/frontend/deploy_sparse.py | 4 +- tutorials/frontend/deploy_ssd_gluoncv.py | 4 +- tutorials/frontend/from_caffe2.py | 4 +- tutorials/frontend/from_coreml.py | 4 +- tutorials/frontend/from_darknet.py | 4 +- tutorials/frontend/from_mxnet.py | 4 +- tutorials/frontend/from_pytorch.py | 4 +- tutorials/frontend/from_tensorflow.py | 4 +- tutorials/frontend/from_tflite.py | 2 +- tutorials/frontend/using_external_lib.py | 2 +- tutorials/get_started/relay_quick_start.py | 8 +- tutorials/micro/micro_tflite.py | 4 +- vta/scripts/tune_resnet.py | 8 +- vta/tutorials/autotvm/tune_relay_vta.py | 6 +- .../frontend/deploy_classification.py | 12 +- .../frontend/legacy/deploy_detection.py | 10 +- web/emcc/wasm_runtime.cc | 2 +- web/src/runtime.ts | 18 +- 208 files changed, 1727 insertions(+), 1641 deletions(-) rename docs/api/python/{graph_runtime.rst => graph_executor.rst} (92%) rename include/tvm/runtime/crt/{graph_runtime.h => graph_executor.h} (61%) rename include/tvm/runtime/crt/{graph_runtime_module.h => graph_executor_module.h} (71%) rename jvm/core/src/main/java/org/apache/tvm/contrib/{GraphRuntime.java => GraphExecutor.java} (95%) rename jvm/core/src/test/java/org/apache/tvm/contrib/{GraphRuntimeTest.java => GraphExecutorTest.java} (93%) rename jvm/core/src/test/scripts/{test_graph_runtime.py => test_graph_executor.py} (98%) rename python/tvm/contrib/cuda_graph/{cuda_graph_runtime.py => cuda_graph_executor.py} (78%) create mode 100644 python/tvm/contrib/debugger/debug_executor.py create mode 100644 python/tvm/contrib/graph_executor.py rename python/tvm/relay/backend/{graph_runtime_codegen.py => graph_executor_codegen.py} (91%) rename python/tvm/relay/backend/{graph_runtime_factory.py => graph_executor_factory.py} (88%) rename src/relay/backend/{graph_runtime_codegen.cc => graph_executor_codegen.cc} (96%) rename src/runtime/crt/{graph_runtime/graph_runtime.c => graph_executor/graph_executor.c} (76%) rename src/runtime/crt/{graph_runtime => graph_executor}/load_json.c (99%) rename src/runtime/crt/{graph_runtime_module/graph_runtime_module.c => graph_executor_module/graph_executor_module.c} (51%) rename src/runtime/crt/include/tvm/runtime/crt/internal/{graph_runtime/graph_runtime.h => graph_executor/graph_executor.h} (58%) rename src/runtime/crt/include/tvm/runtime/crt/internal/{graph_runtime => graph_executor}/load_json.h (90%) rename src/runtime/{graph => graph_executor}/cuda_graph/graph_runtime_cuda_graph.cc (76%) rename src/runtime/{graph/debug/graph_runtime_debug.cc => graph_executor/debug/graph_executor_debug.cc} (89%) rename src/runtime/{graph/graph_runtime.cc => graph_executor/graph_executor.cc} (88%) rename src/runtime/{graph/graph_runtime.h => graph_executor/graph_executor.h} (95%) rename src/runtime/{graph/graph_runtime_factory.cc => graph_executor/graph_executor_factory.cc} (65%) rename src/runtime/{graph/graph_runtime_factory.h => graph_executor/graph_executor_factory.h} (68%) rename 
src/runtime/micro/standalone/{utvm_graph_runtime.cc => utvm_graph_executor.cc} (96%) rename src/runtime/micro/standalone/{utvm_graph_runtime.h => utvm_graph_executor.h} (90%) rename tests/python/relay/{test_backend_graph_runtime.py => test_backend_graph_executor.py} (97%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6d37bd4e6e446..277fe4a9bfbc7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -34,9 +34,9 @@ tvm_option(USE_RPC "Build with RPC" ON) tvm_option(USE_THREADS "Build with thread support" ON) tvm_option(USE_LLVM "Build with LLVM, can be set to specific llvm-config path" OFF) tvm_option(USE_STACKVM_RUNTIME "Include stackvm into the runtime" OFF) -tvm_option(USE_GRAPH_RUNTIME "Build with tiny graph runtime" ON) -tvm_option(USE_GRAPH_RUNTIME_CUDA_GRAPH "Build with tiny graph runtime with CUDA Graph for GPUs" OFF) -tvm_option(USE_PROFILER "Build profiler for the VM and graph runtime" ON) +tvm_option(USE_GRAPH_EXECUTOR "Build with tiny graph executor" ON) +tvm_option(USE_GRAPH_EXECUTOR_CUDA_GRAPH "Build with tiny graph executor with CUDA Graph for GPUs" OFF) +tvm_option(USE_PROFILER "Build profiler for the VM and graph executor" ON) tvm_option(USE_OPENMP "Build with OpenMP thread pool implementation" OFF) tvm_option(USE_RELAY_DEBUG "Building Relay in debug mode..." OFF) tvm_option(USE_RTTI "Build with RTTI" ON) @@ -79,7 +79,7 @@ tvm_option(USE_COREML "Build with coreml support" OFF) tvm_option(USE_BNNS "Build with BNNS support" OFF) tvm_option(USE_TARGET_ONNX "Build with ONNX Codegen support" OFF) tvm_option(USE_ARM_COMPUTE_LIB "Build with Arm Compute Library" OFF) -tvm_option(USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME "Build with Arm Compute Library graph runtime" OFF) +tvm_option(USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR "Build with Arm Compute Library graph executor" OFF) tvm_option(USE_TENSORRT_CODEGEN "Build with TensorRT Codegen support" OFF) tvm_option(USE_TENSORRT_RUNTIME "Build with TensorRT runtime" OFF) tvm_option(USE_RUST_EXT "Build with Rust based compiler extensions, STATIC, DYNAMIC, or OFF" OFF) @@ -307,16 +307,30 @@ else() list(APPEND COMPILER_SRCS ${STACKVM_RUNTIME_SRCS}) endif(USE_STACKVM_RUNTIME) -if(USE_GRAPH_RUNTIME) - message(STATUS "Build with Graph runtime support...") - file(GLOB RUNTIME_GRAPH_SRCS src/runtime/graph/*.cc) - list(APPEND RUNTIME_SRCS ${RUNTIME_GRAPH_SRCS}) +# NOTE(areusch): USE_GRAPH_RUNTIME will be deleted in a future release +if(USE_GRAPH_RUNTIME AND NOT DEFINED USE_GRAPH_EXECUTOR) + message(WARNING "USE_GRAPH_RUNTIME renamed to USE_GRAPH_EXECUTOR. Please update your config.cmake") + set(USE_GRAPH_EXECUTOR ${USE_GRAPH_RUNTIME}) + unset(USE_GRAPH_RUNTIME CACHE) +endif(USE_GRAPH_RUNTIME AND NOT DEFINED USE_GRAPH_EXECUTOR) + +# NOTE(areusch): USE_GRAPH_RUNTIME_DEBUG will be deleted in a future release +if(USE_GRAPH_RUNTIME_DEBUG AND NOT DEFINED USE_GRAPH_EXECUTOR_DEBUG) + message(WARNING "USE_GRAPH_RUNTIME_DEBUG renamed to USE_GRAPH_EXECUTOR_DEBUG. 
Please update your config.cmake") + set(USE_GRAPH_EXECUTOR_DEBUG ${USE_GRAPH_RUNTIME_DEBUG}) + unset(USE_GRAPH_RUNTIME_DEBUG CACHE) +endif(USE_GRAPH_RUNTIME_DEBUG AND NOT DEFINED USE_GRAPH_EXECUTOR_DEBUG) + +if(USE_GRAPH_EXECUTOR) + message(STATUS "Build with Graph Executor support...") + file(GLOB RUNTIME_GRAPH_EXECUTOR_SRCS src/runtime/graph_executor/*.cc) + list(APPEND RUNTIME_SRCS ${RUNTIME_GRAPH_EXECUTOR_SRCS}) -endif(USE_GRAPH_RUNTIME) +endif(USE_GRAPH_EXECUTOR) # convert old options for profiler -if(USE_GRAPH_RUNTIME_DEBUG) - unset(USE_GRAPH_RUNTIME_DEBUG CACHE) +if(USE_GRAPH_EXECUTOR_DEBUG) + unset(USE_GRAPH_EXECUTOR_DEBUG CACHE) set(USE_PROFILER ON) endif() if(USE_VM_PROFILER) @@ -327,10 +341,10 @@ endif() if(USE_PROFILER) message(STATUS "Build with profiler...") - file(GLOB RUNTIME_GRAPH_DEBUG_SRCS src/runtime/graph/debug/*.cc) - list(APPEND RUNTIME_SRCS ${RUNTIME_GRAPH_DEBUG_SRCS}) - set_source_files_properties(${RUNTIME_GRAPH_SRCS} - PROPERTIES COMPILE_DEFINITIONS "TVM_GRAPH_RUNTIME_DEBUG") + file(GLOB RUNTIME_GRAPH_EXECUTOR_DEBUG_SRCS src/runtime/graph_executor/debug/*.cc) + list(APPEND RUNTIME_SRCS ${RUNTIME_GRAPH_EXECUTOR_DEBUG_SRCS}) + set_source_files_properties(${RUNTIME_GRAPH_EXECUTOR_SRCS} + PROPERTIES COMPILE_DEFINITIONS "TVM_GRAPH_EXECUTOR_DEBUG") file(GLOB RUNTIME_VM_PROFILER_SRCS src/runtime/vm/profiler/*.cc) list(APPEND RUNTIME_SRCS ${RUNTIME_VM_PROFILER_SRCS}) diff --git a/apps/android_camera/app/src/main/java/org/apache/tvm/android/androidcamerademo/Camera2BasicFragment.java b/apps/android_camera/app/src/main/java/org/apache/tvm/android/androidcamerademo/Camera2BasicFragment.java index 53913ef306dc0..8a5f54a3e3998 100644 --- a/apps/android_camera/app/src/main/java/org/apache/tvm/android/androidcamerademo/Camera2BasicFragment.java +++ b/apps/android_camera/app/src/main/java/org/apache/tvm/android/androidcamerademo/Camera2BasicFragment.java @@ -111,7 +111,7 @@ public class Camera2BasicFragment extends Fragment { private AppCompatTextView mInfoView; private ListView mModelView; private AssetManager assetManager; - private Module graphRuntimeModule; + private Module graphExecutorModule; private JSONObject labels; private ListenableFuture cameraProviderFuture; private PreviewView previewView; @@ -187,21 +187,21 @@ private String[] getModels() { private String[] inference(float[] chw) { NDArray inputNdArray = NDArray.empty(new long[]{1, IMG_CHANNEL, MODEL_INPUT_SIZE, MODEL_INPUT_SIZE}, new TVMType("float32")); inputNdArray.copyFrom(chw); - Function setInputFunc = graphRuntimeModule.getFunction("set_input"); + Function setInputFunc = graphExecutorModule.getFunction("set_input"); setInputFunc.pushArg(INPUT_NAME).pushArg(inputNdArray).invoke(); // release tvm local variables inputNdArray.release(); setInputFunc.release(); // get the function from the module(run it) - Function runFunc = graphRuntimeModule.getFunction("run"); + Function runFunc = graphExecutorModule.getFunction("run"); runFunc.invoke(); // release tvm local variables runFunc.release(); // get the function from the module(get output data) NDArray outputNdArray = NDArray.empty(new long[]{1, 1000}, new TVMType("float32")); - Function getOutputFunc = graphRuntimeModule.getFunction("get_output"); + Function getOutputFunc = graphExecutorModule.getFunction("get_output"); getOutputFunc.pushArg(OUTPUT_INDEX).pushArg(outputNdArray).invoke(); float[] output = outputNdArray.asFloatArray(); // release tvm local variables @@ -272,8 +272,8 @@ public void onActivityCreated(Bundle savedInstanceState) { @Override public void 
onDestroy() { // release tvm local variables - if (null != graphRuntimeModule) - graphRuntimeModule.release(); + if (null != graphExecutorModule) + graphExecutorModule.release(); super.onDestroy(); } @@ -516,7 +516,7 @@ private void setInputName(String modelName) { } /* - Load precompiled model on TVM graph runtime and init the system. + Load precompiled model on TVM graph executor and init the system. */ private class LoadModelAsyncTask extends AsyncTask { @@ -581,11 +581,11 @@ protected Integer doInBackground(Void... args) { Module modelLib = Module.load(libCacheFilePath); - // get global function module for graph runtime - Log.i(TAG, "getting graph runtime create handle..."); + // get global function module for graph executor + Log.i(TAG, "getting graph executor create handle..."); - Function runtimeCreFun = Function.getFunction("tvm.graph_runtime.create"); - Log.i(TAG, "creating graph runtime..."); + Function runtimeCreFun = Function.getFunction("tvm.graph_executor.create"); + Log.i(TAG, "creating graph executor..."); Log.i(TAG, "device type: " + tvmDev.deviceType); Log.i(TAG, "device id: " + tvmDev.deviceId); @@ -597,10 +597,10 @@ protected Integer doInBackground(Void... args) { .invoke(); Log.i(TAG, "as module..."); - graphRuntimeModule = runtimeCreFunRes.asModule(); - Log.i(TAG, "getting graph runtime load params handle..."); + graphExecutorModule = runtimeCreFunRes.asModule(); + Log.i(TAG, "getting graph executor load params handle..."); // get the function from the module(load parameters) - Function loadParamFunc = graphRuntimeModule.getFunction("load_params"); + Function loadParamFunc = graphExecutorModule.getFunction("load_params"); Log.i(TAG, "loading params..."); loadParamFunc.pushArg(modelParams).invoke(); // release tvm local variables diff --git a/apps/android_camera/app/src/main/jni/tvm_runtime.h b/apps/android_camera/app/src/main/jni/tvm_runtime.h index 406effa34aaa7..f3c7efd08b5c0 100644 --- a/apps/android_camera/app/src/main/jni/tvm_runtime.h +++ b/apps/android_camera/app/src/main/jni/tvm_runtime.h @@ -37,7 +37,7 @@ #include "../src/runtime/cpu_device_api.cc" #include "../src/runtime/dso_library.cc" #include "../src/runtime/file_utils.cc" -#include "../src/runtime/graph/graph_runtime.cc" +#include "../src/runtime/graph_executor/graph_executor.cc" #include "../src/runtime/library_module.cc" #include "../src/runtime/logging.cc" #include "../src/runtime/module.cc" diff --git a/apps/android_camera/models/prepare_model.py b/apps/android_camera/models/prepare_model.py index f155d46c31a45..d767b2ef88fcb 100644 --- a/apps/android_camera/models/prepare_model.py +++ b/apps/android_camera/models/prepare_model.py @@ -25,7 +25,7 @@ import tvm import tvm.relay as relay -from tvm.contrib import utils, ndk, graph_runtime as runtime +from tvm.contrib import utils, ndk, graph_executor as runtime from tvm.contrib.download import download_testdata, download target = "llvm -mtriple=arm64-linux-android" diff --git a/apps/android_deploy/app/src/main/java/org/apache/tvm/android/demo/MainActivity.java b/apps/android_deploy/app/src/main/java/org/apache/tvm/android/demo/MainActivity.java index 38c135a1edc49..85cc7a277b4d5 100644 --- a/apps/android_deploy/app/src/main/java/org/apache/tvm/android/demo/MainActivity.java +++ b/apps/android_deploy/app/src/main/java/org/apache/tvm/android/demo/MainActivity.java @@ -90,7 +90,7 @@ public class MainActivity extends AppCompatActivity { private ImageView mImageView; private TextView mResultView; private AssetManager assetManager; - private Module 
graphRuntimeModule; + private Module graphExecutorModule; private Vector labels = new Vector(); @Override @@ -119,7 +119,7 @@ public void onClick(View v) { } /* - Load precompiled model on TVM graph runtime and init the system. + Load precompiled model on TVM graph executor and init the system. */ private class LoadModleAsyncTask extends AsyncTask { ProgressDialog dialog = new ProgressDialog(MainActivity.this); @@ -183,17 +183,17 @@ protected Integer doInBackground(Void... args) { // tvm module for compiled functions Module modelLib = Module.load(libCacheFilePath); - // get global function module for graph runtime - Function runtimeCreFun = Function.getFunction("tvm.graph_runtime.create"); + // get global function module for graph executor + Function runtimeCreFun = Function.getFunction("tvm.graph_executor.create"); TVMValue runtimeCreFunRes = runtimeCreFun.pushArg(modelGraph) .pushArg(modelLib) .pushArg(tvmDev.deviceType) .pushArg(tvmDev.deviceId) .invoke(); - graphRuntimeModule = runtimeCreFunRes.asModule(); + graphExecutorModule = runtimeCreFunRes.asModule(); // get the function from the module(load parameters) - Function loadParamFunc = graphRuntimeModule.getFunction("load_params"); + Function loadParamFunc = graphExecutorModule.getFunction("load_params"); loadParamFunc.pushArg(modelParams).invoke(); // release tvm local variables @@ -224,14 +224,14 @@ protected void onPostExecute(Integer status) { } /* - Execute prediction for processed decode input bitmap image content on TVM graph runtime. + Execute prediction for processed decode input bitmap image content on TVM graph executor. */ private class ModelRunAsyncTask extends AsyncTask { ProgressDialog dialog = new ProgressDialog(MainActivity.this); @Override protected Integer doInBackground(Bitmap... bitmaps) { - if (null != graphRuntimeModule) { + if (null != graphExecutorModule) { int count = bitmaps.length; for (int i = 0 ; i < count ; i++) { long processingTimeMs = SystemClock.uptimeMillis(); @@ -283,7 +283,7 @@ protected Integer doInBackground(Bitmap... bitmaps) { Log.i(TAG, "set input data"); NDArray inputNdArray = NDArray.empty(new long[]{1, IMG_CHANNEL, MODEL_INPUT_SIZE, MODEL_INPUT_SIZE}, new TVMType("float32"));; inputNdArray.copyFrom(imgRgbTranValues); - Function setInputFunc = graphRuntimeModule.getFunction("set_input"); + Function setInputFunc = graphExecutorModule.getFunction("set_input"); setInputFunc.pushArg(INPUT_NAME).pushArg(inputNdArray).invoke(); // release tvm local variables inputNdArray.release(); @@ -291,7 +291,7 @@ protected Integer doInBackground(Bitmap... bitmaps) { // get the function from the module(run it) Log.i(TAG, "run function on target"); - Function runFunc = graphRuntimeModule.getFunction("run"); + Function runFunc = graphExecutorModule.getFunction("run"); runFunc.invoke(); // release tvm local variables runFunc.release(); @@ -299,7 +299,7 @@ protected Integer doInBackground(Bitmap... 
bitmaps) { // get the function from the module(get output data) Log.i(TAG, "get output data"); NDArray outputNdArray = NDArray.empty(new long[]{1, 1000}, new TVMType("float32")); - Function getOutputFunc = graphRuntimeModule.getFunction("get_output"); + Function getOutputFunc = graphExecutorModule.getFunction("get_output"); getOutputFunc.pushArg(OUTPUT_INDEX).pushArg(outputNdArray).invoke(); float[] output = outputNdArray.asFloatArray(); // release tvm local variables @@ -343,7 +343,7 @@ protected void onPostExecute(Integer status) { dialog.dismiss(); } if (status != 0) { - showDialog("Error", "Fail to predict image, GraphRuntime exception"); + showDialog("Error", "Fail to predict image, GraphExecutor exception"); } } } @@ -351,8 +351,8 @@ protected void onPostExecute(Integer status) { @Override protected void onDestroy() { // release tvm local variables - if (null != graphRuntimeModule) - graphRuntimeModule.release(); + if (null != graphExecutorModule) + graphExecutorModule.release(); super.onDestroy(); } diff --git a/apps/android_deploy/app/src/main/jni/tvm_runtime.h b/apps/android_deploy/app/src/main/jni/tvm_runtime.h index a89475440714e..725b5e1d3b7ab 100644 --- a/apps/android_deploy/app/src/main/jni/tvm_runtime.h +++ b/apps/android_deploy/app/src/main/jni/tvm_runtime.h @@ -32,7 +32,7 @@ #include "../src/runtime/cpu_device_api.cc" #include "../src/runtime/dso_library.cc" #include "../src/runtime/file_utils.cc" -#include "../src/runtime/graph/graph_runtime.cc" +#include "../src/runtime/graph_executor/graph_executor.cc" #include "../src/runtime/library_module.cc" #include "../src/runtime/logging.cc" #include "../src/runtime/module.cc" diff --git a/apps/android_rpc/app/src/main/jni/tvm_runtime.h b/apps/android_rpc/app/src/main/jni/tvm_runtime.h index 9503f02f39ef0..5dcd823929ca9 100644 --- a/apps/android_rpc/app/src/main/jni/tvm_runtime.h +++ b/apps/android_rpc/app/src/main/jni/tvm_runtime.h @@ -37,8 +37,8 @@ #include "../src/runtime/cpu_device_api.cc" #include "../src/runtime/dso_library.cc" #include "../src/runtime/file_utils.cc" -#include "../src/runtime/graph/graph_runtime.cc" -#include "../src/runtime/graph/graph_runtime_factory.cc" +#include "../src/runtime/graph_executor/graph_executor.cc" +#include "../src/runtime/graph_executor/graph_executor_factory.cc" #include "../src/runtime/library_module.cc" #include "../src/runtime/logging.cc" #include "../src/runtime/module.cc" diff --git a/apps/benchmark/arm_cpu_imagenet_bench.py b/apps/benchmark/arm_cpu_imagenet_bench.py index 915f2303b9ed2..656735ec6c051 100644 --- a/apps/benchmark/arm_cpu_imagenet_bench.py +++ b/apps/benchmark/arm_cpu_imagenet_bench.py @@ -24,7 +24,7 @@ import tvm from tvm import te from tvm.contrib.utils import tempdir -import tvm.contrib.graph_runtime as runtime +import tvm.contrib.graph_executor as runtime from tvm import relay from util import get_network, print_progress diff --git a/apps/benchmark/gpu_imagenet_bench.py b/apps/benchmark/gpu_imagenet_bench.py index 6d91aff74fde3..6407f766cb768 100644 --- a/apps/benchmark/gpu_imagenet_bench.py +++ b/apps/benchmark/gpu_imagenet_bench.py @@ -24,7 +24,7 @@ import tvm from tvm import te -import tvm.contrib.graph_runtime as runtime +import tvm.contrib.graph_executor as runtime from tvm import relay from util import get_network diff --git a/apps/benchmark/mobile_gpu_imagenet_bench.py b/apps/benchmark/mobile_gpu_imagenet_bench.py index 3144aee080dc9..4eff259875ca7 100644 --- a/apps/benchmark/mobile_gpu_imagenet_bench.py +++ b/apps/benchmark/mobile_gpu_imagenet_bench.py @@ 
-24,7 +24,7 @@ import tvm from tvm import te from tvm.contrib.utils import tempdir -import tvm.contrib.graph_runtime as runtime +import tvm.contrib.graph_executor as runtime from tvm import relay from util import get_network, print_progress diff --git a/apps/bundle_deploy/Makefile b/apps/bundle_deploy/Makefile index 8e23a92afa935..b533030c4b822 100644 --- a/apps/bundle_deploy/Makefile +++ b/apps/bundle_deploy/Makefile @@ -84,8 +84,8 @@ test_static: $(build_dir)/test_static $(build_dir)/test_data_c.bin $(build_dir)/ $(build_dir)/crt/libcommon.a: $(CRT_SRCS) $(QUIET)cd $(CRT_ROOT) && make QUIET= BUILD_DIR=$(abspath $(build_dir))/crt CRT_CONFIG=$(abspath crt_config/crt_config.h) "EXTRA_CFLAGS=$(PKG_COMPILE_OPTS)" common -$(build_dir)/crt/libgraph_runtime.a: $(CRT_SRCS) - $(QUIET)cd $(CRT_ROOT) && make QUIET= BUILD_DIR=$(abspath $(build_dir))/crt CRT_CONFIG=$(abspath crt_config/crt_config.h) "EXTRA_CFLAGS=$(PKG_COMPILE_OPTS)" graph_runtime +$(build_dir)/crt/libgraph_executor.a: $(CRT_SRCS) + $(QUIET)cd $(CRT_ROOT) && make QUIET= BUILD_DIR=$(abspath $(build_dir))/crt CRT_CONFIG=$(abspath crt_config/crt_config.h) "EXTRA_CFLAGS=$(PKG_COMPILE_OPTS)" graph_executor $(build_dir)/crt/libmemory.a: $(CRT_SRCS) $(QUIET)cd $(CRT_ROOT) && make QUIET= BUILD_DIR=$(abspath $(build_dir))/crt CRT_CONFIG=$(abspath crt_config/crt_config.h) "EXTRA_CFLAGS=$(PKG_COMPILE_OPTS)" memory @@ -98,11 +98,11 @@ $(build_dir)/test_dynamic: test.cc ${build_dir}/test_graph_c.json ${build_dir}/t $(QUIET)mkdir -p $(@D) $(QUIET)g++ $(PKG_CXXFLAGS) -o $@ test.cc $(BACKTRACE_OBJS) $(BACKTRACE_LDFLAGS) -$(build_dir)/demo_static: demo_static.c ${build_dir}/bundle_static.o $(MODEL_OBJ) ${build_dir}/crt/libmemory.a ${build_dir}/crt/libgraph_runtime.a ${build_dir}/crt/libcommon.a ${build_dir}/graph_c.json.c ${build_dir}/params_c.bin.c $(BACKTRACE_OBJS) +$(build_dir)/demo_static: demo_static.c ${build_dir}/bundle_static.o $(MODEL_OBJ) ${build_dir}/crt/libmemory.a ${build_dir}/crt/libgraph_executor.a ${build_dir}/crt/libcommon.a ${build_dir}/graph_c.json.c ${build_dir}/params_c.bin.c $(BACKTRACE_OBJS) $(QUIET)mkdir -p $(@D) $(QUIET)gcc $(PKG_CFLAGS) -o $@ $^ $(PKG_LDFLAGS) $(BACKTRACE_LDFLAGS) $(BACKTRACE_CFLAGS) -$(build_dir)/test_static: test_static.c ${build_dir}/bundle_static.o $(TEST_MODEL_OBJ) ${build_dir}/crt/libmemory.a ${build_dir}/crt/libgraph_runtime.a ${build_dir}/crt/libcommon.a $(BACKTRACE_OBJS) +$(build_dir)/test_static: test_static.c ${build_dir}/bundle_static.o $(TEST_MODEL_OBJ) ${build_dir}/crt/libmemory.a ${build_dir}/crt/libgraph_executor.a ${build_dir}/crt/libcommon.a $(BACKTRACE_OBJS) $(QUIET)mkdir -p $(@D) $(QUIET)gcc $(PKG_CFLAGS) -o $@ $^ $(BACKTRACE_LDFLAGS) @@ -140,7 +140,7 @@ $(build_dir)/bundle.so: bundle.cc runtime.cc $(build_dir)/model_cpp.o $(QUIET)mkdir -p $(@D) $(QUIET)g++ -shared $(PKG_CXXFLAGS) -fvisibility=hidden -o $@ $^ $(PKG_LDFLAGS) -$(build_dir)/bundle_c.so: bundle.c $(MODEL_OBJ) ${build_dir}/crt/libmemory.a ${build_dir}/crt/libgraph_runtime.a ${build_dir}/crt/libcommon.a $(BACKTRACE_OBJS) +$(build_dir)/bundle_c.so: bundle.c $(MODEL_OBJ) ${build_dir}/crt/libmemory.a ${build_dir}/crt/libgraph_executor.a ${build_dir}/crt/libcommon.a $(BACKTRACE_OBJS) $(QUIET)mkdir -p $(@D) $(QUIET)gcc -shared $(PKG_CFLAGS) -fvisibility=hidden -o $@ $^ $(PKG_LDFLAGS) $(BACKTRACE_LDFLAGS) $(BACKTRACE_CFLAGS) @@ -148,7 +148,7 @@ $(build_dir)/test_bundle.so: bundle.cc runtime.cc $(build_dir)/test_model_cpp.o $(QUIET)mkdir -p $(@D) $(QUIET)g++ -shared $(PKG_CXXFLAGS) -fvisibility=hidden -o $@ $^ $(PKG_LDFLAGS) 
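
Note: the change repeated across the benchmark scripts above is a one-line import rename. A minimal migration sketch, assuming the old module name is kept as a deprecation shim (the diffstat shows python/tvm/contrib/graph_runtime.py being reduced rather than removed):

    import tvm
    # New canonical import after this patch:
    import tvm.contrib.graph_executor as runtime
    # Old spelling, assumed to still resolve through the compatibility shim:
    # import tvm.contrib.graph_runtime as runtime

    # Either way the same entry points are exposed, e.g.
    # runtime.create(graph_json, lib, dev) and runtime.GraphModule(...).
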
-$(build_dir)/test_bundle_c.so: bundle.c $(TEST_MODEL_OBJ) ${build_dir}/crt/libmemory.a ${build_dir}/crt/libgraph_runtime.a ${build_dir}/crt/libcommon.a $(BACKTRACE_OBJS) +$(build_dir)/test_bundle_c.so: bundle.c $(TEST_MODEL_OBJ) ${build_dir}/crt/libmemory.a ${build_dir}/crt/libgraph_executor.a ${build_dir}/crt/libcommon.a $(BACKTRACE_OBJS) $(QUIET)mkdir -p $(@D) $(QUIET)gcc -shared $(PKG_CFLAGS) -fvisibility=hidden -o $@ $^ $(PKG_LDFLAGS) $(BACKTRACE_LDFLAGS) $(BACKTRACE_CFLAGS) diff --git a/apps/bundle_deploy/README.md b/apps/bundle_deploy/README.md index a52d3a78f9c9b..619a2d7d05ccc 100644 --- a/apps/bundle_deploy/README.md +++ b/apps/bundle_deploy/README.md @@ -20,9 +20,9 @@ How to Bundle TVM Modules ========================= This folder contains an example on how to bundle a TVM module (with the required -interpreter runtime modules such as `runtime::GraphRuntime`, the graph JSON, and +interpreter runtime modules such as `runtime::GraphExecutor`, the graph JSON, and the params) into a single, self-contained shared object (`bundle.so`) which -exposes a C API wrapping the appropriate `runtime::GraphRuntime` instance. +exposes a C API wrapping the appropriate `runtime::GraphExecutor` instance. This is useful for cases where we'd like to avoid deploying the TVM runtime components to the target host in advance - instead, we simply deploy the bundled @@ -49,8 +49,8 @@ This will: - Build a `bundle.so` shared object containing the model specification and parameters - Build a `demo_dynamic` executable that `dlopen`'s `bundle.so` (or `bundle_c.so` in - terms of the MISRA-C runtime), instantiates the contained graph runtime, - and invokes the `GraphRuntime::Run` function on a cat image, then prints + terms of the MISRA-C runtime), instantiates the contained graph executor, + and invokes the `GraphExecutor::Run` function on a cat image, then prints the output results. Type the following command to run the sample code with static linking. 
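
For reference, a Python sketch of the flow that demo_dynamic drives through the bundle's C API (tvm_runtime_create / set_input / run / get_output); the library path, input name, and shape below are illustrative, not part of the app:

    import numpy as np
    import tvm
    from tvm.contrib import graph_executor

    dev = tvm.cpu(0)
    # Hypothetical compiled artifact; the demo dlopen's bundle.so instead.
    lib = tvm.runtime.load_module("deploy_lib.so")
    gmod = graph_executor.GraphModule(lib["default"](dev))
    gmod.set_input("data", tvm.nd.array(np.zeros((1, 3, 224, 224), "float32")))
    gmod.run()
    print(gmod.get_output(0).asnumpy().flatten()[:10])
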
diff --git a/apps/bundle_deploy/bundle.c b/apps/bundle_deploy/bundle.c
index 84740aa251306..9083f7b5f48b9 100644
--- a/apps/bundle_deploy/bundle.c
+++ b/apps/bundle_deploy/bundle.c
@@ -22,7 +22,7 @@
 #include
 #include
 #include
-#include <tvm/runtime/crt/graph_runtime.h>
+#include <tvm/runtime/crt/graph_executor.h>
 #include
 #include
@@ -75,30 +75,30 @@ TVM_DLL void* tvm_runtime_create(const char* json_data, const char* params_data,
   TVMModuleHandle mod_syslib = TVMArgs_AsModuleHandle(&pf.ret_value, 0);

   // run modules
-  TVMGraphRuntime* graph_runtime = NULL;
-  TVM_CCALL(TVMGraphRuntime_Create(json_data, mod_syslib, &dev, &graph_runtime));
-  TVM_CCALL(TVMGraphRuntime_LoadParams(graph_runtime, params.data, params.size));
+  TVMGraphExecutor* graph_executor = NULL;
+  TVM_CCALL(TVMGraphExecutor_Create(json_data, mod_syslib, &dev, &graph_executor));
+  TVM_CCALL(TVMGraphExecutor_LoadParams(graph_executor, params.data, params.size));

-  return graph_runtime;
+  return graph_executor;
 }

-TVM_DLL void tvm_runtime_destroy(void* runtime) {
-  TVMGraphRuntime_Release((TVMGraphRuntime**)&runtime);
+TVM_DLL void tvm_runtime_destroy(void* executor) {
+  TVMGraphExecutor_Release((TVMGraphExecutor**)&executor);
 }

-TVM_DLL void tvm_runtime_set_input(void* runtime, const char* name, DLTensor* tensor) {
-  TVMGraphRuntime* graph_runtime = (TVMGraphRuntime*)runtime;
-  TVMGraphRuntime_SetInput(graph_runtime, name, tensor);
+TVM_DLL void tvm_runtime_set_input(void* executor, const char* name, DLTensor* tensor) {
+  TVMGraphExecutor* graph_executor = (TVMGraphExecutor*)executor;
+  TVMGraphExecutor_SetInput(graph_executor, name, tensor);
 }

-TVM_DLL void tvm_runtime_run(void* runtime) {
-  TVMGraphRuntime* graph_runtime = (TVMGraphRuntime*)runtime;
-  TVMGraphRuntime_Run(graph_runtime);
+TVM_DLL void tvm_runtime_run(void* executor) {
+  TVMGraphExecutor* graph_executor = (TVMGraphExecutor*)executor;
+  TVMGraphExecutor_Run(graph_executor);
 }

-TVM_DLL void tvm_runtime_get_output(void* runtime, int32_t index, DLTensor* tensor) {
-  TVMGraphRuntime* graph_runtime = (TVMGraphRuntime*)runtime;
-  TVMGraphRuntime_GetOutput(graph_runtime, index, tensor);
+TVM_DLL void tvm_runtime_get_output(void* executor, int32_t index, DLTensor* tensor) {
+  TVMGraphExecutor* graph_executor = (TVMGraphExecutor*)executor;
+  TVMGraphExecutor_GetOutput(graph_executor, index, tensor);
 }

 void TVMLogf(const char* msg, ...) {
diff --git a/apps/bundle_deploy/bundle.cc b/apps/bundle_deploy/bundle.cc
index e3cc7d1730ce0..435d0e41f3db3 100644
--- a/apps/bundle_deploy/bundle.cc
+++ b/apps/bundle_deploy/bundle.cc
@@ -35,7 +35,7 @@ TVM_BUNDLE_FUNCTION void* tvm_runtime_create(const char* build_graph_json,
   int device_type = kDLCPU;
   int device_id = 0;
-  tvm::runtime::Module mod = (*tvm::runtime::Registry::Get("tvm.graph_runtime.create"))(
+  tvm::runtime::Module mod = (*tvm::runtime::Registry::Get("tvm.graph_executor.create"))(
       json_data, mod_syslib, device_type, device_id);
   TVMByteArray params;
   params.data = reinterpret_cast(&build_params_bin[0]);
diff --git a/apps/bundle_deploy/bundle_static.c b/apps/bundle_deploy/bundle_static.c
index ca75b9e0b2e3e..62e63d6b4fe2f 100644
--- a/apps/bundle_deploy/bundle_static.c
+++ b/apps/bundle_deploy/bundle_static.c
@@ -21,7 +21,7 @@
 #include
 #include
 #include
-#include <tvm/runtime/crt/graph_runtime.h>
+#include <tvm/runtime/crt/graph_executor.h>
 #include
 #include
 #include
@@ -75,31 +75,31 @@ TVM_DLL void* tvm_runtime_create(const char* json_data, const char* params_data,
   TVMModuleHandle mod_syslib = TVMArgs_AsModuleHandle(&pf.ret_value, 0);

   // run modules
-  TVMGraphRuntime* graph_runtime = NULL;
-  TVM_CCALL(TVMGraphRuntime_Create(json_data, mod_syslib, &dev, &graph_runtime));
-  TVM_CCALL(TVMGraphRuntime_LoadParams(graph_runtime, params.data, params.size));
+  TVMGraphExecutor* graph_executor = NULL;
+  TVM_CCALL(TVMGraphExecutor_Create(json_data, mod_syslib, &dev, &graph_executor));
+  TVM_CCALL(TVMGraphExecutor_LoadParams(graph_executor, params.data, params.size));

-  return graph_runtime;
+  return graph_executor;
 }

-TVM_DLL void tvm_runtime_destroy(void* runtime) {
-  TVMGraphRuntime* graph_runtime = (TVMGraphRuntime*)runtime;
-  TVMGraphRuntime_Release(&graph_runtime);
+TVM_DLL void tvm_runtime_destroy(void* executor) {
+  TVMGraphExecutor* graph_executor = (TVMGraphExecutor*)executor;
+  TVMGraphExecutor_Release(&graph_executor);
 }

-TVM_DLL void tvm_runtime_set_input(void* runtime, const char* name, DLTensor* tensor) {
-  TVMGraphRuntime* graph_runtime = (TVMGraphRuntime*)runtime;
-  TVMGraphRuntime_SetInput(graph_runtime, name, tensor);
+TVM_DLL void tvm_runtime_set_input(void* executor, const char* name, DLTensor* tensor) {
+  TVMGraphExecutor* graph_executor = (TVMGraphExecutor*)executor;
+  TVMGraphExecutor_SetInput(graph_executor, name, tensor);
 }

-TVM_DLL void tvm_runtime_run(void* runtime) {
-  TVMGraphRuntime* graph_runtime = (TVMGraphRuntime*)runtime;
-  TVMGraphRuntime_Run(graph_runtime);
+TVM_DLL void tvm_runtime_run(void* executor) {
+  TVMGraphExecutor* graph_executor = (TVMGraphExecutor*)executor;
+  TVMGraphExecutor_Run(graph_executor);
 }

-TVM_DLL void tvm_runtime_get_output(void* runtime, int32_t index, DLTensor* tensor) {
-  TVMGraphRuntime* graph_runtime = (TVMGraphRuntime*)runtime;
-  TVMGraphRuntime_GetOutput(graph_runtime, index, tensor);
+TVM_DLL void tvm_runtime_get_output(void* executor, int32_t index, DLTensor* tensor) {
+  TVMGraphExecutor* graph_executor = (TVMGraphExecutor*)executor;
+  TVMGraphExecutor_GetOutput(graph_executor, index, tensor);
 }

 void TVMLogf(const char* msg, ...)
{ diff --git a/apps/bundle_deploy/runtime.cc b/apps/bundle_deploy/runtime.cc index 2f7e3848b4bfe..7a2573b643f5f 100644 --- a/apps/bundle_deploy/runtime.cc +++ b/apps/bundle_deploy/runtime.cc @@ -26,7 +26,7 @@ #include "../../src/runtime/container.cc" #include "../../src/runtime/cpu_device_api.cc" #include "../../src/runtime/file_utils.cc" -#include "../../src/runtime/graph/graph_runtime.cc" +#include "../../src/runtime/graph_executor/graph_executor.cc" #include "../../src/runtime/library_module.cc" #include "../../src/runtime/module.cc" #include "../../src/runtime/ndarray.cc" diff --git a/apps/howto_deploy/cpp_deploy.cc b/apps/howto_deploy/cpp_deploy.cc index f58648c2fb7a1..8500ddb5fabe9 100644 --- a/apps/howto_deploy/cpp_deploy.cc +++ b/apps/howto_deploy/cpp_deploy.cc @@ -83,12 +83,12 @@ void DeploySingleOp() { Verify(mod_syslib, "addonesys"); } -void DeployGraphRuntime() { - LOG(INFO) << "Running graph runtime..."; +void DeployGraphExecutor() { + LOG(INFO) << "Running graph executor..."; // load in the library DLDevice dev{kDLCPU, 0}; tvm::runtime::Module mod_factory = tvm::runtime::Module::LoadFromFile("lib/test_relay_add.so"); - // create the graph runtime module + // create the graph executor module tvm::runtime::Module gmod = mod_factory.GetFunction("default")(dev); tvm::runtime::PackedFunc set_input = gmod.GetFunction("set_input"); tvm::runtime::PackedFunc get_output = gmod.GetFunction("get_output"); @@ -119,6 +119,6 @@ void DeployGraphRuntime() { int main(void) { DeploySingleOp(); - DeployGraphRuntime(); + DeployGraphExecutor(); return 0; } diff --git a/apps/howto_deploy/tvm_runtime_pack.cc b/apps/howto_deploy/tvm_runtime_pack.cc index d6dd5876a994a..c8778a3802335 100644 --- a/apps/howto_deploy/tvm_runtime_pack.cc +++ b/apps/howto_deploy/tvm_runtime_pack.cc @@ -58,9 +58,9 @@ #include "../../src/runtime/dso_library.cc" #include "../../src/runtime/system_library.cc" -// Graph runtime -#include "../../src/runtime/graph/graph_runtime.cc" -#include "../../src/runtime/graph/graph_runtime_factory.cc" +// Graph executor +#include "../../src/runtime/graph_executor/graph_executor.cc" +#include "../../src/runtime/graph_executor/graph_executor_factory.cc" // Uncomment the following lines to enable RPC // #include "../../src/runtime/rpc/rpc_session.cc" diff --git a/apps/ios_rpc/tests/ios_rpc_mobilenet.py b/apps/ios_rpc/tests/ios_rpc_mobilenet.py index 50468d3011343..ee6ab5fd83635 100644 --- a/apps/ios_rpc/tests/ios_rpc_mobilenet.py +++ b/apps/ios_rpc/tests/ios_rpc_mobilenet.py @@ -22,7 +22,7 @@ from tvm.relay import transform from tvm.relay.op.annotation import compiler_begin, compiler_end from tvm.relay.quantize.quantize import prerequisite_optimize -from tvm.contrib import utils, xcode, graph_runtime, coreml_runtime +from tvm.contrib import utils, xcode, graph_executor, coreml_runtime from tvm.contrib.target import coreml as _coreml import os @@ -120,7 +120,7 @@ def run(mod, target): else: dev = remote.cpu(0) lib = remote.load_module("deploy.dylib") - m = graph_runtime.GraphModule(lib["default"](dev)) + m = graph_executor.GraphModule(lib["default"](dev)) m.set_input("data", tvm.nd.array(image, dev)) m.run() diff --git a/apps/ios_rpc/tvmrpc/TVMRuntime.mm b/apps/ios_rpc/tvmrpc/TVMRuntime.mm index 87cb6f9b4c691..7ab9a4d2d219c 100644 --- a/apps/ios_rpc/tvmrpc/TVMRuntime.mm +++ b/apps/ios_rpc/tvmrpc/TVMRuntime.mm @@ -45,8 +45,8 @@ #include "../../../src/runtime/rpc/rpc_server_env.cc" #include "../../../src/runtime/rpc/rpc_session.cc" #include "../../../src/runtime/rpc/rpc_socket_impl.cc" -// Graph 
runtime -#include "../../../src/runtime/graph/graph_runtime.cc" +// Graph executor +#include "../../../src/runtime/graph_executor/graph_executor.cc" // Metal #include "../../../src/runtime/metal/metal_device_api.mm" #include "../../../src/runtime/metal/metal_module.mm" diff --git a/apps/microtvm/reference-vm/zephyr/rebuild-tvm.sh b/apps/microtvm/reference-vm/zephyr/rebuild-tvm.sh index 4672012e73f22..2eb55e385520b 100755 --- a/apps/microtvm/reference-vm/zephyr/rebuild-tvm.sh +++ b/apps/microtvm/reference-vm/zephyr/rebuild-tvm.sh @@ -28,7 +28,7 @@ fi cp cmake/config.cmake "${BUILD_DIR}" cd "${BUILD_DIR}" sed -i 's/USE_MICRO OFF/USE_MICRO ON/' config.cmake -sed -i 's/USE_GRAPH_RUNTIME_DEBUG OFF/USE_GRAPH_RUNTIME_DEBUG ON/' config.cmake +sed -i 's/USE_GRAPH_EXECUTOR_DEBUG OFF/USE_GRAPH_EXECUTOR_DEBUG ON/' config.cmake sed -i 's/USE_LLVM OFF/USE_LLVM ON/' config.cmake cmake .. rm -rf standalone_crt host_standalone_crt # remove stale generated files diff --git a/cmake/config.cmake b/cmake/config.cmake index 98d1d97801037..7b29df648ac7c 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -96,13 +96,13 @@ set(USE_CPP_RPC OFF) # Whether embed stackvm into the runtime set(USE_STACKVM_RUNTIME OFF) -# Whether enable tiny embedded graph runtime. -set(USE_GRAPH_RUNTIME ON) +# Whether enable tiny embedded graph executor. +set(USE_GRAPH_EXECUTOR ON) -# Whether enable tiny graph runtime with CUDA Graph -set(USE_GRAPH_RUNTIME_CUDA_GRAPH OFF) +# Whether enable tiny graph executor with CUDA Graph +set(USE_GRAPH_EXECUTOR_CUDA_GRAPH OFF) -# Whether to enable the profiler for the graph runtime and vm +# Whether to enable the profiler for the graph executor and vm set(USE_PROFILER ON) # Whether enable uTVM standalone runtime @@ -207,10 +207,10 @@ set(USE_DNNL_CODEGEN OFF) # # USE_ARM_COMPUTE_LIB - Support for compiling a relay graph offloading supported # operators to Arm Compute Library. OFF/ON -# USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME - Run Arm Compute Library annotated functions via the ACL +# USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR - Run Arm Compute Library annotated functions via the ACL # runtime. 
OFF/ON/"path/to/ACL" set(USE_ARM_COMPUTE_LIB OFF) -set(USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME OFF) +set(USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR OFF) # Whether to build with Arm Ethos-N support # Possible values: diff --git a/cmake/modules/CUDA.cmake b/cmake/modules/CUDA.cmake index 262a4e6e71239..1bdc5036f8575 100644 --- a/cmake/modules/CUDA.cmake +++ b/cmake/modules/CUDA.cmake @@ -65,15 +65,15 @@ if(USE_CUDA) list(APPEND RUNTIME_SRCS ${CONTRIB_THRUST_SRC}) endif(USE_THRUST) - if(USE_GRAPH_RUNTIME_CUDA_GRAPH) - if(NOT USE_GRAPH_RUNTIME) - message(FATAL_ERROR "CUDA Graph is only supported by graph runtime, please set USE_GRAPH_RUNTIME=ON") + if(USE_GRAPH_EXECUTOR_CUDA_GRAPH) + if(NOT USE_GRAPH_EXECUTOR) + message(FATAL_ERROR "CUDA Graph is only supported by graph executor, please set USE_GRAPH_EXECUTOR=ON") endif() if(CUDAToolkit_VERSION_MAJOR LESS "10") message(FATAL_ERROR "CUDA Graph requires CUDA 10 or above, got=" ${CUDAToolkit_VERSION}) endif() - message(STATUS "Build with Graph runtime with CUDA Graph support...") - file(GLOB RUNTIME_CUDA_GRAPH_SRCS src/runtime/graph/cuda_graph/*.cc) + message(STATUS "Build with Graph executor with CUDA Graph support...") + file(GLOB RUNTIME_CUDA_GRAPH_SRCS src/runtime/graph_executor/cuda_graph/*.cc) list(APPEND RUNTIME_SRCS ${RUNTIME_CUDA_GRAPH_SRCS}) endif() else(USE_CUDA) diff --git a/cmake/modules/LibInfo.cmake b/cmake/modules/LibInfo.cmake index 131dceeb345d6..2a69d06970a8f 100644 --- a/cmake/modules/LibInfo.cmake +++ b/cmake/modules/LibInfo.cmake @@ -42,8 +42,8 @@ function(add_lib_info src_file) TVM_INFO_USE_LLVM="${USE_LLVM}" TVM_INFO_LLVM_VERSION="${TVM_INFO_LLVM_VERSION}" TVM_INFO_USE_STACKVM_RUNTIME="${USE_STACKVM_RUNTIME}" - TVM_INFO_USE_GRAPH_RUNTIME="${USE_GRAPH_RUNTIME}" - TVM_INFO_USE_GRAPH_RUNTIME_DEBUG="${USE_GRAPH_RUNTIME_DEBUG}" + TVM_INFO_USE_GRAPH_EXECUTOR="${USE_GRAPH_EXECUTOR}" + TVM_INFO_USE_GRAPH_EXECUTOR_DEBUG="${USE_GRAPH_EXECUTOR_DEBUG}" TVM_INFO_USE_OPENMP="${USE_OPENMP}" TVM_INFO_USE_RELAY_DEBUG="${USE_RELAY_DEBUG}" TVM_INFO_USE_RTTI="${USE_RTTI}" @@ -73,7 +73,7 @@ function(add_lib_info src_file) TVM_INFO_USE_COREML="${USE_COREML}" TVM_INFO_USE_TARGET_ONNX="${USE_TARGET_ONNX}" TVM_INFO_USE_ARM_COMPUTE_LIB="${USE_ARM_COMPUTE_LIB}" - TVM_INFO_USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME="${USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME}" + TVM_INFO_USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR="${USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR}" TVM_INFO_INDEX_DEFAULT_I64="${INDEX_DEFAULT_I64}" TVM_CXX_COMPILER_PATH="${CMAKE_CXX_COMPILER}" ) diff --git a/cmake/modules/StandaloneCrt.cmake b/cmake/modules/StandaloneCrt.cmake index dc1b3b2665f26..fe6baf81c3e5e 100644 --- a/cmake/modules/StandaloneCrt.cmake +++ b/cmake/modules/StandaloneCrt.cmake @@ -43,8 +43,8 @@ if(USE_MICRO) "src/runtime/crt Makefile -> ." "src/runtime/crt/include *.h -> include" "src/runtime/crt/common *.c -> src/runtime/crt/common" - "src/runtime/crt/graph_runtime *.c -> src/runtime/crt/graph_runtime" - "src/runtime/crt/graph_runtime_module *.c -> src/runtime/crt/graph_runtime_module" + "src/runtime/crt/graph_executor *.c -> src/runtime/crt/graph_executor" + "src/runtime/crt/graph_executor_module *.c -> src/runtime/crt/graph_executor_module" "src/runtime/crt/host crt_config.h -> template/host" "src/runtime/crt/host *.cc -> template/host" "src/runtime/crt/memory *.c -> src/runtime/crt/memory" @@ -97,7 +97,7 @@ if(USE_MICRO) set(make_quiet ) endif(${VERBOSE}) - list(APPEND crt_libraries memory graph_runtime utvm_rpc_server utvm_rpc_common common) # NOTE: listed in link order. 
+ list(APPEND crt_libraries memory graph_executor utvm_rpc_server utvm_rpc_common common) # NOTE: listed in link order. foreach(crt_lib_name IN LISTS crt_libraries) list(APPEND crt_library_paths "host_standalone_crt/lib${crt_lib_name}.a") endforeach() diff --git a/cmake/modules/contrib/ArmComputeLib.cmake b/cmake/modules/contrib/ArmComputeLib.cmake index ba082505125b4..54ce917dfb506 100644 --- a/cmake/modules/contrib/ArmComputeLib.cmake +++ b/cmake/modules/contrib/ArmComputeLib.cmake @@ -23,17 +23,25 @@ if(USE_ARM_COMPUTE_LIB) file(GLOB ACL_RELAY_CONTRIB_SRC src/relay/backend/contrib/arm_compute_lib/*.cc) file(GLOB ACL_RUNTIME_MODULE src/runtime/contrib/arm_compute_lib/acl_runtime.cc) list(APPEND COMPILER_SRCS ${ACL_RELAY_CONTRIB_SRC}) - if(NOT USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME) + + if(NOT USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR) list(APPEND COMPILER_SRCS ${ACL_RUNTIME_MODULE}) endif() message(STATUS "Build with Arm Compute Library support...") endif() -if(USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME) +if(USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME AND NOT DEFINED USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR) + message(WARNING "USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME renamed to USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR. " + "Please update your config.cmake") + set(USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR ${USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME}) + unset(USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME CACHE) +endif(USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME AND NOT DEFINED USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR) + +if(USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR) set(ACL_PATH ${CMAKE_CURRENT_SOURCE_DIR}/acl) # Detect custom ACL path. - if (NOT USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME STREQUAL "ON") - set(ACL_PATH ${USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME}) + if (NOT USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR STREQUAL "ON") + set(ACL_PATH ${USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR}) endif() file(GLOB ACL_CONTRIB_SRC src/runtime/contrib/arm_compute_lib/*) @@ -60,11 +68,11 @@ if(USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME) list(APPEND TVM_RUNTIME_LINKER_LIBS ${EXTERN_ACL_COMPUTE_CORE_LIB}) list(APPEND TVM_RUNTIME_LINKER_LIBS ${EXTERN_ACL_COMPUTE_GRAPH_LIB}) list(APPEND RUNTIME_SRCS ${ACL_CONTRIB_SRC}) - message(STATUS "Build with Arm Compute Library graph runtime support: " + message(STATUS "Build with Arm Compute Library graph executor support: " ${EXTERN_ACL_COMPUTE_LIB} ", \n" ${EXTERN_ACL_COMPUTE_CORE_LIB} ", \n" ${EXTERN_ACL_COMPUTE_GRAPH_LIB}) - # Set flag to detect ACL graph runtime support. - add_definitions(-DTVM_GRAPH_RUNTIME_ARM_COMPUTE_LIB) + # Set flag to detect ACL graph executor support. 
+ add_definitions(-DTVM_GRAPH_EXECUTOR_ARM_COMPUTE_LIB) endif() diff --git a/cmake/modules/contrib/TensorRT.cmake b/cmake/modules/contrib/TensorRT.cmake index 0c7e43c0fcf87..218f0b2e20fe5 100644 --- a/cmake/modules/contrib/TensorRT.cmake +++ b/cmake/modules/contrib/TensorRT.cmake @@ -55,5 +55,5 @@ if(USE_TENSORRT_RUNTIME) list(APPEND RUNTIME_SRCS ${RUNTIME_TENSORRT_SRCS}) # Set defines - add_definitions(-DTVM_GRAPH_RUNTIME_TENSORRT) + add_definitions(-DTVM_GRAPH_EXECUTOR_TENSORRT) endif() diff --git a/conda/recipe/bld.bat b/conda/recipe/bld.bat index 9fc0469febc6d..e877b8fda1e17 100644 --- a/conda/recipe/bld.bat +++ b/conda/recipe/bld.bat @@ -28,7 +28,7 @@ cmake ^ -DUSE_CPP_RPC=ON ^ -DUSE_SORT=ON ^ -DUSE_RANDOM=ON ^ - -DUSE_GRAPH_RUNTIME_DEBUG=ON ^ + -DUSE_GRAPH_EXECUTOR_DEBUG=ON ^ -DINSTALL_DEV=ON ^ %SRC_DIR% diff --git a/conda/recipe/build.sh b/conda/recipe/build.sh index 828e3c39488a0..a94b9df72440b 100755 --- a/conda/recipe/build.sh +++ b/conda/recipe/build.sh @@ -49,7 +49,7 @@ cmake -DCMAKE_INSTALL_PREFIX="${PREFIX}" \ -DUSE_CPP_RPC=OFF \ -DUSE_SORT=ON \ -DUSE_RANDOM=ON \ - -DUSE_GRAPH_RUNTIME_DEBUG=ON \ + -DUSE_GRAPH_EXECUTOR_DEBUG=ON \ -DUSE_LLVM=ON \ -DINSTALL_DEV=ON \ -DUSE_LIBBACKTRACE=AUTO \ diff --git a/docker/Dockerfile.demo_android b/docker/Dockerfile.demo_android index 039439a937e91..f56f56728e70c 100644 --- a/docker/Dockerfile.demo_android +++ b/docker/Dockerfile.demo_android @@ -61,7 +61,7 @@ RUN cd /usr && \ -DUSE_LLVM=llvm-config-8 \ -DUSE_RPC=ON \ -DUSE_SORT=ON \ - -DUSE_GRAPH_RUNTIME=ON \ + -DUSE_GRAPH_EXECUTOR=ON \ -DUSE_VULKAN=ON \ .. && \ make -j10 diff --git a/docker/install/install_tvm_cpu.sh b/docker/install/install_tvm_cpu.sh index c3a15fa26b6d2..48e6df3597db4 100755 --- a/docker/install/install_tvm_cpu.sh +++ b/docker/install/install_tvm_cpu.sh @@ -27,7 +27,7 @@ cd /usr/tvm git checkout 4b13bf668edc7099b38d463e5db94ebc96c80470 echo set\(USE_LLVM llvm-config-8\) >> config.cmake -echo set\(USE_GRAPH_RUNTIME ON\) >> config.cmake +echo set\(USE_GRAPH_EXECUTOR ON\) >> config.cmake echo set\(USE_BLAS openblas\) >> config.cmake mkdir -p build cd build diff --git a/docs/api/python/graph_runtime.rst b/docs/api/python/graph_executor.rst similarity index 92% rename from docs/api/python/graph_runtime.rst rename to docs/api/python/graph_executor.rst index d82c7ce00e2ee..3f8811553ba43 100644 --- a/docs/api/python/graph_runtime.rst +++ b/docs/api/python/graph_executor.rst @@ -15,7 +15,7 @@ specific language governing permissions and limitations under the License. -tvm.contrib.graph_runtime +tvm.contrib.graph_executor ------------------------- -.. automodule:: tvm.contrib.graph_runtime +.. automodule:: tvm.contrib.graph_executor :members: diff --git a/docs/api/python/index.rst b/docs/api/python/index.rst index a6179684413dd..76322a1acfe20 100644 --- a/docs/api/python/index.rst +++ b/docs/api/python/index.rst @@ -44,6 +44,6 @@ Python API rpc micro contrib - graph_runtime + graph_executor topi vta/index diff --git a/docs/api/python/relay/backend.rst b/docs/api/python/relay/backend.rst index c30f226e8437c..ffe8a9a8ce794 100644 --- a/docs/api/python/relay/backend.rst +++ b/docs/api/python/relay/backend.rst @@ -26,7 +26,7 @@ tvm.relay.backend .. automodule:: tvm.relay.backend.compile_engine :members: -.. automodule:: tvm.relay.backend.graph_runtime_codegen +.. automodule:: tvm.relay.backend.graph_executor_codegen :members: .. 
automodule:: tvm.relay.backend.vm diff --git a/docs/deploy/arm_compute_lib.rst b/docs/deploy/arm_compute_lib.rst index 10a0d51e4f915..4e43682a240a7 100644 --- a/docs/deploy/arm_compute_lib.rst +++ b/docs/deploy/arm_compute_lib.rst @@ -52,7 +52,7 @@ We recommend two different ways to build and install ACL: mv ./linux--neon/* . -In both cases you will need to set USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME to the path where the ACL package +In both cases you will need to set USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR to the path where the ACL package is located. Cmake will look in /path-to-acl/ along with /path-to-acl/lib and /path-to-acl/build for the required binaries. See the section below for more information on how to use these configuration options. @@ -64,15 +64,15 @@ because ACL cannot be used on an x86 machine. However, we still want to be able runtime module on an x86 machine. * USE_ARM_COMPUTE_LIB=ON/OFF - Enabling this flag will add support for compiling an ACL runtime module. -* USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME=ON/OFF/path-to-acl - Enabling this flag will allow the graph runtime to +* USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR=ON/OFF/path-to-acl - Enabling this flag will allow the graph executor to compute the ACL offloaded functions. These flags can be used in different scenarios depending on your setup. For example, if you want to compile an ACL module on an x86 machine and then run the module on a remote Arm device via RPC, you will -need to use USE_ARM_COMPUTE_LIB=ON on the x86 machine and USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME=ON on the remote +need to use USE_ARM_COMPUTE_LIB=ON on the x86 machine and USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR=ON on the remote AArch64 device. -By default both options are set to OFF. Using USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME=ON will mean that ACL +By default both options are set to OFF. Using USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR=ON will mean that ACL binaries are searched for by cmake in the default locations (see https://cmake.org/cmake/help/v3.4/command/find_library.html). In addition to this, /path-to-tvm-project/acl/ will also be searched. It is likely that you will need to set your own path to @@ -83,7 +83,7 @@ These flags should be set in your config.cmake file. For example: .. 
code:: cmake set(USE_ARM_COMPUTE_LIB ON) - set(USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME /path/to/acl) + set(USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR /path/to/acl) Usage @@ -150,7 +150,7 @@ https://tvm.apache.org/docs/tutorials/get_started/cross_compilation_and_rpc.html dev = tvm.cpu(0) loaded_lib = tvm.runtime.load_module('lib_acl.so') - gen_module = tvm.contrib.graph_runtime.GraphModule(loaded_lib['default'](dev)) + gen_module = tvm.contrib.graph_executor.GraphModule(loaded_lib['default'](dev)) d_data = np.random.uniform(0, 1, data_shape).astype(data_type) map_inputs = {'data': d_data} gen_module.set_input(**map_inputs) diff --git a/docs/deploy/bnns.rst b/docs/deploy/bnns.rst index 6e20f3c2cdbb8..7b62fb15a617f 100644 --- a/docs/deploy/bnns.rst +++ b/docs/deploy/bnns.rst @@ -145,11 +145,11 @@ Load module and run inference on the target machine with TVM built with ``USE_B import tvm import numpy as np - from tvm.contrib import graph_runtime + from tvm.contrib import graph_executor dev = tvm.cpu(0) loaded_lib = tvm.runtime.load_module('compiled.dylib') - gen_module = tvm.contrib.graph_runtime.GraphModule(loaded_lib['default'](dev)) + gen_module = tvm.contrib.graph_executor.GraphModule(loaded_lib['default'](dev)) dtype = "float32" input_shape = (1, 3, 224, 224) diff --git a/docs/deploy/tensorrt.rst b/docs/deploy/tensorrt.rst index 0732a32c01bf8..308db4933ae80 100644 --- a/docs/deploy/tensorrt.rst +++ b/docs/deploy/tensorrt.rst @@ -126,7 +126,7 @@ have to be built. dev = tvm.gpu(0) loaded_lib = tvm.runtime.load_module('compiled.so') - gen_module = tvm.contrib.graph_runtime.GraphModule(loaded_lib['default'](dev)) + gen_module = tvm.contrib.graph_executor.GraphModule(loaded_lib['default'](dev)) input_data = np.random.uniform(0, 1, input_shape).astype(dtype) gen_module.run(data=input_data) diff --git a/docs/deploy/vitis_ai.rst b/docs/deploy/vitis_ai.rst index fc887dae968c1..1ce89ebed9c2f 100755 --- a/docs/deploy/vitis_ai.rst +++ b/docs/deploy/vitis_ai.rst @@ -449,7 +449,7 @@ TVM. import tvm import tvm.relay as relay from tvm.contrib.target import vitis_ai - from tvm.contrib import utils, graph_runtime + from tvm.contrib import utils, graph_executor from tvm.relay.build_module import bind_params_by_name from tvm.relay.op.contrib.vitis_ai import annotation @@ -490,7 +490,7 @@ will take a substantial amount of time. .. code:: python - module = graph_runtime.GraphModule(lib["default"](tvm.cpu())) + module = graph_executor.GraphModule(lib["default"](tvm.cpu())) # First N (default = 128) inputs are used for quantization calibration and will # be executed on the CPU @@ -520,7 +520,7 @@ Load the module from compiled files and run inference # load the module into memory loaded_lib = tvm.runtime.load_module(lib_path) - module = graph_runtime.GraphModule(lib["default"](tvm.cpu())) + module = graph_executor.GraphModule(lib["default"](tvm.cpu())) module.set_input(name, data) module.run() @@ -551,7 +551,7 @@ TVM. import tvm import tvm.relay as relay from tvm.contrib.target import vitis_ai - from tvm.contrib import utils, graph_runtime + from tvm.contrib import utils, graph_executor from tvm.relay.build_module import bind_params_by_name from tvm.relay.op.contrib.vitis_ai import annotation @@ -631,7 +631,7 @@ quantization on the host machine. This makes use of TVM inference calls .. 
code:: python - module = graph_runtime.GraphModule(lib["default"](tvm.cpu())) + module = graph_executor.GraphModule(lib["default"](tvm.cpu())) # First N (default = 128) inputs are used for quantization calibration and will # be executed on the CPU @@ -694,7 +694,7 @@ as root (execute ``su`` in terminal to log into root). import pyxir import tvm - from tvm.contrib import graph_runtime + from tvm.contrib import graph_executor dev = tvm.cpu() @@ -704,6 +704,6 @@ as root (execute ``su`` in terminal to log into root). # load the module into memory lib = tvm.runtime.load_module("tvm_dpu_arm.so") - module = graph_runtime.GraphModule(lib["default"](dev)) + module = graph_executor.GraphModule(lib["default"](dev)) module.set_input(input_name, input_data) module.run() diff --git a/docs/dev/debugger.rst b/docs/dev/debugger.rst index 509cfd306a4af..f1bd004717b48 100644 --- a/docs/dev/debugger.rst +++ b/docs/dev/debugger.rst @@ -123,24 +123,24 @@ Example of loading the parameters How to use Debugger? *************************************** -1. In ``config.cmake`` set the ``USE_GRAPH_RUNTIME_DEBUG`` flag to ``ON`` +1. In ``config.cmake`` set the ``USE_GRAPH_EXECUTOR_DEBUG`` flag to ``ON`` :: # Whether enable additional graph debug functions - set(USE_GRAPH_RUNTIME_DEBUG ON) + set(USE_GRAPH_EXECUTOR_DEBUG ON) 2. Do 'make' tvm, so that it will make the ``libtvm_runtime.so`` 3. In frontend script file instead of - ``from tvm.contrib import graph_runtime`` import the - ``debug_runtime`` - ``from tvm.contrib.debugger import debug_runtime as graph_runtime`` + ``from tvm.contrib import graph_executor`` import the + ``debug_executor`` + ``from tvm.contrib.debugger import debug_executor as graph_executor`` :: - from tvm.contrib.debugger import debug_runtime as graph_runtime - m = graph_runtime.create(graph, lib, dev, dump_root="/tmp/tvmdbg") + from tvm.contrib.debugger import debug_executor as graph_executor + m = graph_executor.create(graph, lib, dev, dump_root="/tmp/tvmdbg") # set inputs m.set_input('data', tvm.nd.array(data.astype(dtype))) m.set_input(**params) diff --git a/docs/dev/index.rst b/docs/dev/index.rst index 7ceed646087f4..c297d32923fed 100644 --- a/docs/dev/index.rst +++ b/docs/dev/index.rst @@ -94,7 +94,7 @@ This process helps us to divide the original problem into two sub-problems: We use the low-level tir phase to compile and optimize each sub-functions. For specific targets, we may also directly go to the target translation phase and use external code generators. -There are a few different ways(in relay/backend) to handle the calls into the overall execution problem. For simple models with known shapes and no control flow, we can lower to a graph runtime that stores the execution structure in a graph. We also support a virtual machine backend for dynamic executions. Finally, we plan to support ahead of time compilation that compiles the high-level execution structure into the executable and generated primitive functions. All of these execution modes are encapsulated by a unified **runtime.Module** interface, which we will discuss in the latter part of the guide. +There are a few different ways(in relay/backend) to handle the calls into the overall execution problem. For simple models with known shapes and no control flow, we can lower to a graph executor that stores the execution structure in a graph. We also support a virtual machine backend for dynamic executions. 
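
To make the graph-executor path just described concrete, a minimal end-to-end sketch for a static-shape model (toy function; every name here is local to the example):

    import numpy as np
    import tvm
    from tvm import relay
    from tvm.contrib import graph_executor

    # A trivial static-shape Relay function: y = x + 1.
    x = relay.var("x", shape=(2, 2), dtype="float32")
    mod = tvm.IRModule.from_expr(relay.Function([x], x + relay.const(1.0)))

    lib = relay.build(mod, target="llvm")  # graph executor factory module
    gmod = graph_executor.GraphModule(lib["default"](tvm.cpu()))
    gmod.set_input("x", np.ones((2, 2), dtype="float32"))
    gmod.run()
    print(gmod.get_output(0))
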
Finally, we plan to support ahead of time compilation that compiles the high-level execution structure into the executable and generated primitive functions. All of these execution modes are encapsulated by a unified **runtime.Module** interface, which we will discuss in the latter part of the guide. **tir/transform** contains transformation passes for TIR level functions. Many tir passes serve the purpose of lowering. For example, there are passes to flatten multi-dimensional access to one-dimensional pointer access, to expand the intrinsics into target-specific ones, and to decorate the function entry to meet the runtime calling convention. Of course, there are also optimizations passes, such as access index simplification and dead code elimination. diff --git a/docs/dev/microtvm_design.rst b/docs/dev/microtvm_design.rst index 2c3eeb2faea3c..885ef2c8fc0da 100644 --- a/docs/dev/microtvm_design.rst +++ b/docs/dev/microtvm_design.rst @@ -68,7 +68,7 @@ The parts of this process are described below: #. **Deployment**. The project is built and the residual firmware binary is flashed onto the device. Model inference is driven either by TVM using an on-device RPC server, or on the device using the - on-device Graph Runtime. + on-device Graph Executor. Design Goals ============ @@ -189,14 +189,14 @@ The TVM compiler traditionally outputs three pieces: 2. A model execution graph, encoded as JSON; and 3. Simplified parameters. -To correctly execute the model, a Graph Runtime needs to reconstruct the graph in memory, load the +To correctly execute the model, a Graph Executor needs to reconstruct the graph in memory, load the parameters, and then invoke the operator implementations in the correct order. microTVM supports two ways to do this: -1. **Host-Driven**. The Graph Runtime can run on the host and carry out execution by issuing +1. **Host-Driven**. The Graph Executor can run on the host and carry out execution by issuing commands to the device using an RPC link with a UART-like transport. -2. **Standalone**. A C Graph Runtime is available to be compiled on-device, but it is not +2. **Standalone**. A C Graph Executor is available to be compiled on-device, but it is not particularly memory efficient. This way enables standalone execution without any attached host. Host-Driven is designed for experimenting with models on-device and, like AutoTVM, uses the RPC server to @@ -213,8 +213,8 @@ In Host-Driven execution, the firmware binary is the following: 4. The TVM RPC server. 5. (optional) Simplified Parameters. -This firmware image is flashed onto the device and a GraphRuntime instance is created on the host. -The GraphRuntime drives execution by sending RPC commands over a UART: +This firmware image is flashed onto the device and a GraphExecutor instance is created on the host. +The GraphExecutor drives execution by sending RPC commands over a UART: .. figure:: https://raw.githubusercontent.com/tvmai/web-data/main/images/dev/microtvm_host_driven.svg :align: center @@ -223,7 +223,7 @@ The GraphRuntime drives execution by sending RPC commands over a UART: Standalone Execution ^^^^^^^^^^^^^^^^^^^^ -In Standalone execution, the GraphRuntime is instantiated on device: +In Standalone execution, the GraphExecutor is instantiated on device: .. 
figure:: https://raw.githubusercontent.com/tvmai/web-data/main/images/dev/microtvm_standalone.svg :align: center @@ -248,7 +248,7 @@ When configuring for host-driven inference or AutoTVM, the remaining tasks are w When configuring for standalone deployment, the firmware needs to: 1. Instantiate the system library by calling the ``runtime.SystemLib`` PackedFunc. -2. Instantiate a GraphRuntime passing the system library module. +2. Instantiate a GraphExecutor passing the system library module. 3. Configure parameters and inputs as needed. 4. Run the model. @@ -267,7 +267,7 @@ For Host-driven model execution, firmware also needs: For Standalone model execution, firmware also needs: -4. The TVM C GraphRuntime library, supplied by TVM as a static library. +4. The TVM C GraphExecutor library, supplied by TVM as a static library. 5. The remaining compiler outputs (Simplified Parameters and Graph JSON). The Automated Build Flow @@ -323,11 +323,11 @@ Future Work Ahead-of-Time Runtime ---------------------- -A limitation of the Graph Runtime is the amount of memory overhead required in parsing the JSON. +A limitation of the Graph Executor is the amount of memory overhead required in parsing the JSON. The current implementation contributes significantly to the dynamic memory usage of microTVM, limiting its utility. An ahead-of-time runtime can avoid the need for any Graph JSON parsing and improve inference speed by generating C code to call the generated operator implementations directly -rather than relying on a data-driven approach with the Graph Runtime. +rather than relying on a data-driven approach with the Graph Executor. Memory Planning ---------------- diff --git a/docs/dev/virtual_machine.rst b/docs/dev/virtual_machine.rst index 9081d50b92efe..7826f68b71dd1 100644 --- a/docs/dev/virtual_machine.rst +++ b/docs/dev/virtual_machine.rst @@ -32,9 +32,9 @@ There are further challenges in compiling dynamic code, such as dynamic scheduli fully dynamic tensor shapes, and control flow. The interpreter offers simple solutions for these, but none is sufficiently compelling or optimized. -The second execution mechanism is the existing graph runtime. In order to target Relay +The second execution mechanism is the existing graph executor. In order to target Relay programs to this, we compile a small subset of them to the old graph format and execute -them on the runtime. Graph runtime provides a fast execution experience but only for a very limited +them on the runtime. Graph executor provides a fast execution experience but only for a very limited subset of Relay programs. An alternative but not-standard approach is Relay's ahead-of-time compiler, @@ -64,7 +64,7 @@ micro-optimizations present in scalar VMs are dramatically less important. TVM has provided strong support for vision models, but we want to grow to support a wider variety of models. -The graph runtime is able to utilize the fully static nature of the input graphs to perform +The graph executor is able to utilize the fully static nature of the input graphs to perform aggressive optimization such as fully static allocation, and optimal memory reuse. When we introduce models which make use of control flow, recursion, dynamic shapes, and dynamic allocation, we must change how execution works. A virtual machine for Relay is a natural choice. @@ -354,7 +354,7 @@ Serialization Serializing and deserializing the executable generated by the Relay VM compiler is a must as we may want to save the model to the disk and perform inference later. 
Previously, Relay has produced -a serialized form in a json file for the graph runtime. However, the same format is not directly +a serialized form in a json file for the graph executor. However, the same format is not directly applicable to the VM as it emits bytecode instead of graph-style programs. Serialization of an executable essentially needs to handle both model specific (i.e. weights and kernels) and VM related (i.e. bytecode and global function names) data. @@ -376,7 +376,7 @@ components in a binary format that is organized with the following sections in o - Code section. The VM functions, including bytecode, are sitting in this section. The dispatching loop iterates through this section to fetch instructions for execution. -Hence, unlike the graph runtime artifact that contains weight (.params), graph json (.json), +Hence, unlike the graph executor artifact that contains weight (.params), graph json (.json), and compiled kernel library (.so), the serialized executable artifact is composed of the Relay object file (.ro) and the compiled kernel library (.so). diff --git a/docs/install/from_source.rst b/docs/install/from_source.rst index f6be4e31af90f..f0ebad1d9edc7 100644 --- a/docs/install/from_source.rst +++ b/docs/install/from_source.rst @@ -88,7 +88,7 @@ The configuration of TVM can be modified by `config.cmake`. - On macOS, for some versions of Xcode, you need to add ``-lc++abi`` in the LDFLAGS or you'll get link errors. - Change ``set(USE_CUDA OFF)`` to ``set(USE_CUDA ON)`` to enable CUDA backend. Do the same for other backends and libraries you want to build for (OpenCL, RCOM, METAL, VULKAN, ...). - - To help with debugging, ensure the embedded graph runtime and debugging functions are enabled with ``set(USE_GRAPH_RUNTIME ON)`` and ``set(USE_GRAPH_RUNTIME_DEBUG ON)`` + - To help with debugging, ensure the embedded graph executor and debugging functions are enabled with ``set(USE_GRAPH_EXECUTOR ON)`` and ``set(USE_PROFILER ON)`` - TVM requires LLVM for for CPU codegen. We highly recommend you to build with the LLVM support on. 
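Taken together, the renamed debugger pieces above give the following minimal flow. This is a sketch, assuming TVM was built with ``set(USE_GRAPH_EXECUTOR ON)`` and ``set(USE_GRAPH_EXECUTOR_DEBUG ON)`` in ``config.cmake``, and that ``graph``, ``lib``, ``params``, and ``data`` come from an earlier ``relay.build`` step as in the debugger docs; the dump directory is illustrative.

.. code-block:: python

    import tvm
    from tvm.contrib.debugger import debug_executor as graph_executor

    # Create the debug executor; per-node output tensors and a Chrome
    # trace are dumped under dump_root after each run().
    m = graph_executor.create(graph, lib, tvm.cpu(), dump_root="/tmp/tvmdbg")

    # Set inputs and parameters, then run with per-node tracing.
    m.set_input("data", tvm.nd.array(data))
    m.set_input(**params)
    m.run()

    # Fetch a single node's output by index, or time each op individually.
    out = m.debug_get_output(0)
    per_op_times = m.run_individual(number=10, repeat=1, min_repeat_ms=0)
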
diff --git a/golang/sample/complex.go b/golang/sample/complex.go index bbe74dc85e098..911d0a7a28c1d 100644 --- a/golang/sample/complex.go +++ b/golang/sample/complex.go @@ -70,13 +70,13 @@ func main() { } jsonStr := string(bytes) - // Load module on tvm runtime - call tvm.graph_runtime.create - funp, err := gotvm.GetGlobalFunction("tvm.graph_runtime.create") + // Load module on tvm runtime - call tvm.graph_executor.create + funp, err := gotvm.GetGlobalFunction("tvm.graph_executor.create") if err != nil { fmt.Print(err) return } - fmt.Printf("Calling tvm.graph_runtime.create\n") + fmt.Printf("Calling tvm.graph_executor.create\n") // Call function graphrt, err := funp.Invoke(jsonStr, modp, (int64)(gotvm.KDLCPU), (int64)(0)) if err != nil { @@ -84,7 +84,7 @@ func main() { return } graphmod := graphrt.AsModule() - fmt.Printf("Graph runtime Created\n") + fmt.Printf("Graph executor Created\n") // Array allocation attributes tshapeIn := []int64{1, 224, 224, 3} @@ -105,7 +105,7 @@ func main() { } fmt.Printf("Input and Output Arrays allocated\n") - // Get module function from graph runtime : load_params + // Get module function from graph executor : load_params // Read params bytes, err = ioutil.ReadFile(modParams) if err != nil { diff --git a/golang/src/function_test.go b/golang/src/function_test.go index 17b1c9a6e1c04..0830d16419a2b 100644 --- a/golang/src/function_test.go +++ b/golang/src/function_test.go @@ -46,7 +46,7 @@ func TestFunctionGlobals(t *testing.T) { // Check GetFunction API func TestFunctionGlobalGet(t *testing.T) { - funp, err := GetGlobalFunction("tvm.graph_runtime.create") + funp, err := GetGlobalFunction("tvm.graph_executor.create") if err != nil { t.Error(err.Error()) return diff --git a/golang/src/tvm_runtime_pack.cc b/golang/src/tvm_runtime_pack.cc index 7dd6dd5e94c52..430e046e39a88 100644 --- a/golang/src/tvm_runtime_pack.cc +++ b/golang/src/tvm_runtime_pack.cc @@ -42,8 +42,8 @@ #include "src/runtime/dso_library.cc" #include "src/runtime/system_library.cc" -// Graph runtime -#include "src/runtime/graph/graph_runtime.cc" +// Graph executor +#include "src/runtime/graph_executor/graph_executor.cc" // Uncomment the following lines to enable RPC // #include "../../src/runtime/rpc/rpc_session.cc" diff --git a/include/tvm/runtime/crt/error_codes.h b/include/tvm/runtime/crt/error_codes.h index 75e49e63e0946..4cbfb0aab8e2b 100644 --- a/include/tvm/runtime/crt/error_codes.h +++ b/include/tvm/runtime/crt/error_codes.h @@ -42,7 +42,7 @@ typedef enum { kTvmErrorCategorySession = 4, kTvmErrorCategoryPlatform = 5, kTvmErrorCategoryGenerated = 6, - kTvmErrorCategoryGraphRuntime = 7, + kTvmErrorCategoryGraphExecutor = 7, kTvmErrorCategoryFunctionCall = 8, kTvmErrorCategoryTimeEvaluator = 9, } tvm_crt_error_category_t; @@ -83,10 +83,10 @@ typedef enum { // Common error codes returned from generated functions. 
kTvmErrorGeneratedInvalidStorageId = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryGenerated, 0), - // Graph runtime - kTvmErrorGraphModuleAlreadyCreated = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryGraphRuntime, 0), - kTvmErrorGraphModuleBadContext = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryGraphRuntime, 1), - kTvmErrorGraphModuleNoSuchInput = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryGraphRuntime, 2), + // Graph executor + kTvmErrorGraphModuleAlreadyCreated = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryGraphExecutor, 0), + kTvmErrorGraphModuleBadContext = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryGraphExecutor, 1), + kTvmErrorGraphModuleNoSuchInput = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryGraphExecutor, 2), // Function Calls - common problems encountered calling functions. kTvmErrorFunctionCallNumArguments = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryFunctionCall, 0), diff --git a/include/tvm/runtime/crt/graph_runtime.h b/include/tvm/runtime/crt/graph_executor.h similarity index 61% rename from include/tvm/runtime/crt/graph_runtime.h rename to include/tvm/runtime/crt/graph_executor.h index 02c7421d00e8b..eb68ff56d2301 100644 --- a/include/tvm/runtime/crt/graph_runtime.h +++ b/include/tvm/runtime/crt/graph_executor.h @@ -18,11 +18,11 @@ */ /*! - * \file graph_runtime.h - * \brief Tiny graph runtime that can run graph containing only tvm PackedFunc. + * \file graph_executor.h + * \brief Tiny graph executor that can run graph containing only tvm PackedFunc. */ -#ifndef TVM_RUNTIME_CRT_GRAPH_RUNTIME_H_ -#define TVM_RUNTIME_CRT_GRAPH_RUNTIME_H_ +#ifndef TVM_RUNTIME_CRT_GRAPH_EXECUTOR_H_ +#define TVM_RUNTIME_CRT_GRAPH_EXECUTOR_H_ #ifdef __cplusplus extern "C" { @@ -43,7 +43,7 @@ typedef struct TVMOpParam { } TVMOpParam; // Graph attribute -typedef struct TVMGraphRuntimeGraphAttr { +typedef struct TVMGraphExecutorGraphAttr { uint32_t storage_num_not_alloctaed; uint32_t* storage_id; uint32_t* device_index; @@ -52,79 +52,79 @@ typedef struct TVMGraphRuntimeGraphAttr { int64_t* shape; uint32_t* ndim; uint32_t shape_count; -} TVMGraphRuntimeGraphAttr; +} TVMGraphExecutorGraphAttr; -typedef struct TVMGraphRuntime TVMGraphRuntime; +typedef struct TVMGraphExecutor TVMGraphExecutor; // public functions /*! - * \brief Allocate a new GraphRuntime with TVMPlatformMemoryAllocate and initialize it. + * \brief Allocate a new GraphExecutor with TVMPlatformMemoryAllocate and initialize it. * * \param sym_json JSON-encoded graph. * \param module_handle TVM Module that exposes the functions to call. * \param devices runtime execution device. - * \param runtime Pointer which receives a pointer to the newly-created instance. + * \param executor Pointer which receives a pointer to the newly-created instance. * \return 0 if successful. */ -int TVMGraphRuntime_Create(const char* sym_json, TVMModuleHandle module_handle, - const DLDevice* devices, TVMGraphRuntime** runtime); +int TVMGraphExecutor_Create(const char* sym_json, TVMModuleHandle module_handle, + const DLDevice* devices, TVMGraphExecutor** executor); -int TVMGraphRuntime_GetInputIndex(TVMGraphRuntime* runtime, const char* name); +int TVMGraphExecutor_GetInputIndex(TVMGraphExecutor* executor, const char* name); /*! * \brief get number of input tensors allocated. * \return integer number of tensors available to use. */ -int TVMGraphRuntime_GetNumInputs(); +int TVMGraphExecutor_GetNumInputs(); /*! * \brief set input to the graph based on name. - * \param runtime The graph runtime. + * \param executor The graph executor. * \param name The name of the input. * \param data_in The input data. 
*/ -void TVMGraphRuntime_SetInput(TVMGraphRuntime* runtime, const char* name, DLTensor* data_in); +void TVMGraphExecutor_SetInput(TVMGraphExecutor* executor, const char* name, DLTensor* data_in); /*! * \brief get number of output tensors allocated. * \return integer number of output tensors allocated. */ -int TVMGraphRuntime_GetNumOutputs(); +int TVMGraphExecutor_GetNumOutputs(); /*! * \brief Return NDArray for given output index. - * \param runtime The graph runtime. + * \param executor The graph executor. * \param index The output index. * \param out The DLTensor corresponding to given output node index. * \return The result of this function execution. */ -int TVMGraphRuntime_GetOutput(TVMGraphRuntime* runtime, const int32_t index, DLTensor* out); +int TVMGraphExecutor_GetOutput(TVMGraphExecutor* executor, const int32_t index, DLTensor* out); /*! * \brief Load parameters from parameter blob. - * \param runtime The graph runtime. + * \param executor The graph executor. * \param param_blob A binary blob of parameter. * \param param_size The parameter size. * \return The result of this function execution. */ -int TVMGraphRuntime_LoadParams(TVMGraphRuntime* runtime, const char* param_blob, - const uint32_t param_size); +int TVMGraphExecutor_LoadParams(TVMGraphExecutor* executor, const char* param_blob, + const uint32_t param_size); /*! * \brief Execute the graph. - * \param runtime The graph runtime. + * \param executor The graph executor. */ -void TVMGraphRuntime_Run(TVMGraphRuntime* runtime); +void TVMGraphExecutor_Run(TVMGraphExecutor* executor); /*! - * \brief Release memory associated with the graph runtime. - * \param runtime Pointer to graph runtime. + * \brief Release memory associated with the graph executor. + * \param executor Pointer to graph executor. * \return 0 if successful */ -int TVMGraphRuntime_Release(TVMGraphRuntime** runtime); +int TVMGraphExecutor_Release(TVMGraphExecutor** executor); #ifdef __cplusplus } // extern "C" #endif -#endif // TVM_RUNTIME_CRT_GRAPH_RUNTIME_H_ +#endif // TVM_RUNTIME_CRT_GRAPH_EXECUTOR_H_ diff --git a/include/tvm/runtime/crt/graph_runtime_module.h b/include/tvm/runtime/crt/graph_executor_module.h similarity index 71% rename from include/tvm/runtime/crt/graph_runtime_module.h rename to include/tvm/runtime/crt/graph_executor_module.h index 04e9184c8b8d1..10a879e9ba307 100644 --- a/include/tvm/runtime/crt/graph_runtime_module.h +++ b/include/tvm/runtime/crt/graph_executor_module.h @@ -18,11 +18,11 @@ */ /*! - * \file graph_runtime.h - * \brief Tiny graph runtime that can run graph containing only tvm PackedFunc. + * \file graph_executor.h + * \brief Tiny graph executor that can run graph containing only tvm PackedFunc. */ -#ifndef TVM_RUNTIME_CRT_GRAPH_RUNTIME_MODULE_H_ -#define TVM_RUNTIME_CRT_GRAPH_RUNTIME_MODULE_H_ +#ifndef TVM_RUNTIME_CRT_GRAPH_EXECUTOR_MODULE_H_ +#define TVM_RUNTIME_CRT_GRAPH_EXECUTOR_MODULE_H_ #ifdef __cplusplus extern "C" { @@ -31,12 +31,12 @@ extern "C" { #include /*! - * \brief Register the "tvm.graph_runtime.create" constructor PackedFunc. + * \brief Register the "tvm.graph_executor.create" constructor PackedFunc. 
*/ -tvm_crt_error_t TVMGraphRuntimeModule_Register(); +tvm_crt_error_t TVMGraphExecutorModule_Register(); #ifdef __cplusplus } // extern "C" #endif -#endif // TVM_RUNTIME_CRT_GRAPH_RUNTIME_MODULE_H_ +#endif // TVM_RUNTIME_CRT_GRAPH_EXECUTOR_MODULE_H_ diff --git a/jvm/core/src/main/java/org/apache/tvm/contrib/GraphRuntime.java b/jvm/core/src/main/java/org/apache/tvm/contrib/GraphExecutor.java similarity index 95% rename from jvm/core/src/main/java/org/apache/tvm/contrib/GraphRuntime.java rename to jvm/core/src/main/java/org/apache/tvm/contrib/GraphExecutor.java index 5bbd2beb4644d..30b2fb1acafb5 100644 --- a/jvm/core/src/main/java/org/apache/tvm/contrib/GraphRuntime.java +++ b/jvm/core/src/main/java/org/apache/tvm/contrib/GraphExecutor.java @@ -29,7 +29,7 @@ import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; -public class GraphRuntime { +public class GraphExecutor { /** * Create a runtime executor module given a graph and module. * @param graphJson The graph deployed in json format output by compiler. @@ -38,9 +38,9 @@ public class GraphRuntime { * @return Runtime graph module that can be used to execute the graph. */ public static GraphModule create(String graphJson, Module libmod, Device dev) { - Function fcreate = Function.getFunction("tvm.graph_runtime.create"); + Function fcreate = Function.getFunction("tvm.graph_executor.create"); if (fcreate == null) { - throw new RuntimeException("Cannot find global function tvm.graph_runtime.create." + throw new RuntimeException("Cannot find global function tvm.graph_executor.create." + "Did you compile tvm_runtime with correct version?"); } Module graphModule = fcreate.pushArg(graphJson) diff --git a/jvm/core/src/main/java/org/apache/tvm/contrib/GraphModule.java b/jvm/core/src/main/java/org/apache/tvm/contrib/GraphModule.java index 0e2583553ed3f..a7a03d52740e5 100644 --- a/jvm/core/src/main/java/org/apache/tvm/contrib/GraphModule.java +++ b/jvm/core/src/main/java/org/apache/tvm/contrib/GraphModule.java @@ -147,7 +147,7 @@ public NDArray debugGetOutput(String node, NDArray out) { if (fdebugGetOutput != null) { fdebugGetOutput.pushArg(node).pushArg(out).invoke(); } else { - throw new RuntimeException("Please compile runtime with USE_GRAPH_RUNTIME_DEBUG = 0"); + throw new RuntimeException("Please compile runtime with USE_GRAPH_EXECUTOR_DEBUG = 0"); } return out; } @@ -162,7 +162,7 @@ public NDArray debugGetOutput(int node, NDArray out) { if (fdebugGetOutput != null) { fdebugGetOutput.pushArg(node).pushArg(out).invoke(); } else { - throw new RuntimeException("Please compile runtime with USE_GRAPH_RUNTIME_DEBUG = 0"); + throw new RuntimeException("Please compile runtime with USE_GRAPH_EXECUTOR_DEBUG = 0"); } return out; } diff --git a/jvm/core/src/test/java/org/apache/tvm/contrib/GraphRuntimeTest.java b/jvm/core/src/test/java/org/apache/tvm/contrib/GraphExecutorTest.java similarity index 93% rename from jvm/core/src/test/java/org/apache/tvm/contrib/GraphRuntimeTest.java rename to jvm/core/src/test/java/org/apache/tvm/contrib/GraphExecutorTest.java index d1760454a468b..0a5fa9a67e3a8 100644 --- a/jvm/core/src/test/java/org/apache/tvm/contrib/GraphRuntimeTest.java +++ b/jvm/core/src/test/java/org/apache/tvm/contrib/GraphExecutorTest.java @@ -35,8 +35,8 @@ import static org.junit.Assert.assertArrayEquals; -public class GraphRuntimeTest { - private final Logger logger = LoggerFactory.getLogger(GraphRuntime.class); +public class GraphExecutorTest { + private final Logger logger = LoggerFactory.getLogger(GraphExecutor.class); 
private static String loadingDir; @BeforeClass @@ -52,7 +52,7 @@ public void test_add_one_local() throws IOException { .useDelimiter("\\Z").next(); Device dev = Device.cpu(); - GraphModule graph = GraphRuntime.create(graphJson, libmod, dev); + GraphModule graph = GraphExecutor.create(graphJson, libmod, dev); long[] shape = new long[]{4}; NDArray arr = NDArray.empty(shape, dev); @@ -92,7 +92,7 @@ public void test_add_one_remote() throws IOException { remote.upload(new File(libPath)); Module mlib = remote.loadModule("graph_addone_lib.so"); - GraphModule graph = GraphRuntime.create(graphJson, mlib, dev); + GraphModule graph = GraphExecutor.create(graphJson, mlib, dev); long[] shape = new long[]{4}; NDArray arr = NDArray.empty(shape, dev); diff --git a/jvm/core/src/test/scripts/test_graph_runtime.py b/jvm/core/src/test/scripts/test_graph_executor.py similarity index 98% rename from jvm/core/src/test/scripts/test_graph_runtime.py rename to jvm/core/src/test/scripts/test_graph_executor.py index 07a19fe50c1bd..676b008205caf 100644 --- a/jvm/core/src/test/scripts/test_graph_runtime.py +++ b/jvm/core/src/test/scripts/test_graph_executor.py @@ -19,7 +19,7 @@ import tvm from tvm import te import json -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor def dump_graph_lib(target_dir): diff --git a/python/tvm/auto_scheduler/relay_integration.py b/python/tvm/auto_scheduler/relay_integration.py index 366d3d021d9e5..72e3e06f2047b 100644 --- a/python/tvm/auto_scheduler/relay_integration.py +++ b/python/tvm/auto_scheduler/relay_integration.py @@ -47,7 +47,7 @@ def call_all_topi_funcs(mod, params, target): """Call all TOPI compute to extract auto_scheduler tasks in a Relay program""" # pylint: disable=import-outside-toplevel from tvm import relay - from tvm.relay.backend import graph_runtime_codegen + from tvm.relay.backend import graph_executor_codegen # Turn off AutoTVM config not found warnings old_autotvm_silent = autotvm.GLOBAL_SCOPE.silent @@ -63,11 +63,11 @@ def call_all_topi_funcs(mod, params, target): ): try: opt_mod, _ = relay.optimize(mod, target, params) - grc = graph_runtime_codegen.GraphRuntimeCodegen(None, target) + grc = graph_executor_codegen.GraphExecutorCodegen(None, target) grc.codegen(opt_mod["main"]) except tvm.TVMError: print( - "Get errors with GraphRuntimeCodegen for task extraction. " + "Get errors with GraphExecutorCodegen for task extraction. " "Fallback to VMCompiler." ) compiler = relay.vm.VMCompiler() diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py index fe88d1741d603..12e057e01da62 100644 --- a/python/tvm/autotvm/task/relay_integration.py +++ b/python/tvm/autotvm/task/relay_integration.py @@ -36,14 +36,14 @@ def _lower(mod, target, params): """Helper to lower VTA properly.""" # pylint: disable=import-outside-toplevel from tvm import relay - from tvm.relay.backend import graph_runtime_codegen + from tvm.relay.backend import graph_executor_codegen if hasattr(target, "device_name") and target.device_name == "vta": import vta with vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): mod, _ = relay.optimize(mod, target, params) - grc = graph_runtime_codegen.GraphRuntimeCodegen(None, target) + grc = graph_executor_codegen.GraphExecutorCodegen(None, target) grc.codegen(mod["main"]) return @@ -53,11 +53,11 @@ def _lower(mod, target, params): # TODO: Currently VM compiler is likely to stack overflow for large models. 
try: opt_mod, _ = relay.optimize(mod, target, params) - grc = graph_runtime_codegen.GraphRuntimeCodegen(None, target) + grc = graph_executor_codegen.GraphExecutorCodegen(None, target) grc.codegen(opt_mod["main"]) except tvm.TVMError as e: print( - "Get errors with GraphRuntimeCodegen for task extraction. " + "Get errors with GraphExecutorCodegen for task extraction. " "Fallback to VMCompiler. Error details:\n%s" % str(e) ) compiler = relay.vm.VMCompiler() diff --git a/python/tvm/contrib/cuda_graph/cuda_graph_runtime.py b/python/tvm/contrib/cuda_graph/cuda_graph_executor.py similarity index 78% rename from python/tvm/contrib/cuda_graph/cuda_graph_runtime.py rename to python/tvm/contrib/cuda_graph/cuda_graph_executor.py index c2f4455d881b9..d047316eb5645 100644 --- a/python/tvm/contrib/cuda_graph/cuda_graph_runtime.py +++ b/python/tvm/contrib/cuda_graph/cuda_graph_executor.py @@ -14,11 +14,11 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""Graph runtime with CUDA Graph""" +"""Graph executor with CUDA Graph""" import tvm._ffi from tvm._ffi.base import string_types -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor def create(graph_json_str, libmod, device): @@ -40,34 +40,34 @@ def create(graph_json_str, libmod, device): Returns ------- graph_module : GraphModuleCudaGraph - CUDA graph runtime module that can be used to execute the graph. + CUDA graph executor module that can be used to execute the graph. Note ---- - See also :py:class:`tvm.contrib.cuda_graph.cuda_graph_runtime.GraphModuleCudaGraph` + See also :py:class:`tvm.contrib.cuda_graph.cuda_graph_executor.GraphModuleCudaGraph` for examples to directly construct a GraphModuleCudaGraph from an exported relay compiled library. """ assert isinstance(graph_json_str, string_types) try: - dev, num_rpc_dev, device_type_id = graph_runtime.get_device(libmod, device) + dev, num_rpc_dev, device_type_id = graph_executor.get_device(libmod, device) if num_rpc_dev == len(dev): - fcreate = dev[0]._rpc_sess.get_function("tvm.graph_runtime_cuda_graph.create") + fcreate = dev[0]._rpc_sess.get_function("tvm.graph_executor_cuda_graph.create") else: - fcreate = tvm._ffi.get_global_func("tvm.graph_runtime_cuda_graph.create") + fcreate = tvm._ffi.get_global_func("tvm.graph_executor_cuda_graph.create") except ValueError: raise ValueError( "To enable CUDA graph support (experimental), please set " - "'(USE_GRAPH_RUNTIME_CUGRAPH ON)' in config.cmake and rebuild TVM" + "'(USE_GRAPH_EXECUTOR_CUGRAPH ON)' in config.cmake and rebuild TVM" ) return GraphModuleCudaGraph(fcreate(graph_json_str, libmod, *device_type_id)) -class GraphModuleCudaGraph(graph_runtime.GraphModule): - """CUDA graph runtime module. +class GraphModuleCudaGraph(graph_executor.GraphModule): + """CUDA graph executor module. - This is a CUDA graph runtime wrapper over the TVM runtime. + This is a CUDA graph executor wrapper over the TVM runtime. Runtime interfaces are wrapped with CUDA graph functionalities. 
Parameters @@ -81,7 +81,7 @@ def __init__(self, module): self._end_capture = module["end_capture"] self._run_cuda_graph = module["run_cuda_graph"] self._cuda_graph_captured = False - graph_runtime.GraphModule.__init__(self, module) + graph_executor.GraphModule.__init__(self, module) def capture_cuda_graph(self): """Capture a CUDA graph for tvm_op graph @@ -99,13 +99,13 @@ def run_cuda_graph(self): """Run the CUDA graph for tvm_op graph Run the captured CUDA graph instance instead of the - for-loop kernel launch of default graph runtime + for-loop kernel launch of default graph executor """ self._run_cuda_graph() def run(self, **input_dict): """A run wrapper for graph capture / launch, user can just - change default graph runtime to cuda graph runtime, and + change default graph executor to cuda graph executor, and the first call will capture a cuda graph for future launch Parameters @@ -131,4 +131,4 @@ def debug_get_output(self, node, out): out : NDArray The output array container """ - raise NotImplementedError("Please use debugger.debug_runtime as graph_runtime instead.") + raise NotImplementedError("Please use debugger.debug_executor as graph_executor instead.") diff --git a/python/tvm/contrib/debugger/debug_executor.py b/python/tvm/contrib/debugger/debug_executor.py new file mode 100644 index 0000000000000..b27ae6533e385 --- /dev/null +++ b/python/tvm/contrib/debugger/debug_executor.py @@ -0,0 +1,239 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Graph debug runtime executes TVM debug packed functions.""" + +import os +import tempfile +import shutil +import tvm._ffi + +from tvm._ffi.base import string_types +from tvm.contrib import graph_executor +from tvm.runtime.ndarray import array +from . import debug_result + +_DUMP_ROOT_PREFIX = "tvmdbg_" +_DUMP_PATH_PREFIX = "_tvmdbg_" + + +def create(graph_json_str, libmod, device, dump_root=None): + """Create a runtime executor module given a graph and module. + + Parameters + ---------- + graph_json_str : str + The graph to be deployed in json format output by graph compiler. + The graph can contain operator(tvm_op) that points to the name + of PackedFunc in the libmod. + + libmod : tvm.Module + The module of the corresponding function. + + device : Device + The device to deploy the module, can be local or remote. + + dump_root : str + To select which folder the outputs should be kept. + None will make a temp folder in /tmp/tvmdbg and does the dumping + Returns + ------- + graph_module : GraphModuleDebug + Debug Runtime graph module that can be used to execute the graph. 
+ """ + assert isinstance(graph_json_str, string_types) + + try: + dev, num_rpc_dev, device_type_id = graph_executor.get_device(libmod, device) + if num_rpc_dev == len(dev): + fcreate = dev[0]._rpc_sess.get_function("tvm.graph_executor_debug.create") + else: + fcreate = tvm._ffi.get_global_func("tvm.graph_executor_debug.create") + except ValueError: + raise ValueError( + "Please set '(USE_GRAPH_EXECUTOR_DEBUG ON)' in " + "config.cmake and rebuild TVM to enable debug mode" + ) + func_obj = fcreate(graph_json_str, libmod, *device_type_id) + return GraphModuleDebug(func_obj, dev, graph_json_str, dump_root) + + +class GraphModuleDebug(graph_executor.GraphModule): + """Graph debug runtime module. + + This is a debug wrapper over the TVM runtime. + Runtime interfaces are wrapped with debug functionalities. + Manage the debug framework to format the debug data and + trigger the user interfaces. + + Parameters + ---------- + module : Module + The internal tvm module that holds the actual graph functions. + + device : Device + The device that this module is under. + + graph_json_str : str or graph class + Content of graph json file in string format + + dump_root : str + To select which folder the outputs should be kept. + None will make a temp folder in /tmp/tvmdbg and does the dumping + """ + + def __init__(self, module, device, graph_json_str, dump_root): + self._dump_root = dump_root + self._dump_path = None + self._get_output_by_layer = module["get_output_by_layer"] + self._run_individual = module["run_individual"] + graph_executor.GraphModule.__init__(self, module) + self._create_debug_env(graph_json_str, device) + + def _format_device(self, device): + return str(device[0]).upper().replace("(", ":").replace(")", "") + + def _ensure_dir(self, directory): + """Create a directory if not exists + + Parameters + ---------- + + directory : str + File path to create + """ + if not os.path.exists(directory): + os.makedirs(directory, 0o700) + + def _get_dump_path(self, device): + """Make the graph and tensor dump folder and return the path. + + Parameters + ---------- + device : Device + The device that this module is under. + + Returns + ------- + path : str + Directory path where the graph and node outputs will be stored. + """ + # save to file + folder_name = _DUMP_PATH_PREFIX + "device_" + folder_name = folder_name + device.replace(":", "_") + path = os.path.join(self._dump_root, folder_name) + self._ensure_dir(path) + return path + + def _remove_dump_root(self): + if os.path.isdir(self._dump_root): + shutil.rmtree(self._dump_root) + + def _create_debug_env(self, graph_json, device): + """Create UI wrapper framework to handle multiple UI frontends for tvmdbg + + Parameters + ---------- + graph_json : json format + json formatted NNVM graph contain list of each node's name, shape and type. + + nodes_list : list + List of all the nodes presented in the graph + + device : Device + The device that this module is under. + """ + # make the dump folder if not given + if not self._dump_root: + self._dump_root = tempfile.mkdtemp(prefix=_DUMP_ROOT_PREFIX) + + # format the device + device = self._format_device(device) + + # updates the dumping directories + self._dump_path = self._get_dump_path(device) + + # init the debug dumping environment + self.debug_datum = debug_result.DebugResult(graph_json, self._dump_path) + + def _run_debug(self): + """Execute the node specified with index will be executed. 
+ Each debug output will be copied to the buffer + Time consumed for each execution will be set as debug output. + + """ + self.debug_datum._time_list = [[float(t)] for t in self.run_individual(10, 1, 1)] + for i, node in enumerate(self.debug_datum.get_graph_nodes()): + num_outputs = self.debug_datum.get_graph_node_output_num(node) + for j in range(num_outputs): + out_tensor = self._get_output_by_layer(i, j) + out_tensor = array(out_tensor) + self.debug_datum._output_tensor_list.append(out_tensor) + + def debug_get_output(self, node, out=None): + """Run graph up to node and get the output to out + + Parameters + ---------- + node : int / str + The node index or name + + out : NDArray + The output array container + """ + if isinstance(node, str): + output_tensors = self.debug_datum.get_output_tensors() + try: + out = output_tensors[node] + except KeyError: + node_list = output_tensors.keys() + raise RuntimeError( + "Node " + node + " not found, available nodes are: " + str(node_list) + "." + ) + elif isinstance(node, int): + output_tensors = self.debug_datum._output_tensor_list + out = output_tensors[node] + else: + raise RuntimeError("Require node index or name only.") + return out + + def run(self, **input_dict): + """Run forward execution of the graph with debug + + Parameters + ---------- + input_dict : dict of str to NDArray + List of input values to be feed to + """ + if input_dict: + self.set_input(**input_dict) + + # Step 1. Execute the graph + self._run_debug() + # Step 2. Dump the output tensors to the dump folder + self.debug_datum.dump_output_tensor() + # Step 3. Dump the Chrome trace to the dump folder + self.debug_datum.dump_chrome_trace() + # Step 4. Display the collected information + self.debug_datum.display_debug_result() + + def run_individual(self, number, repeat=1, min_repeat_ms=0): + ret = self._run_individual(number, repeat, min_repeat_ms) + return ret.strip(",").split(",") if ret else [] + + def exit(self): + """Exits the dump folder and all its contents""" + self._remove_dump_root() diff --git a/python/tvm/contrib/debugger/debug_runtime.py b/python/tvm/contrib/debugger/debug_runtime.py index f0e299728fbc5..ebd903b475709 100644 --- a/python/tvm/contrib/debugger/debug_runtime.py +++ b/python/tvm/contrib/debugger/debug_runtime.py @@ -14,226 +14,16 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""Graph debug runtime executes TVM debug packed functions.""" +"""Deprecated Python API for DebugExecutor.""" -import os -import tempfile -import shutil -import tvm._ffi +import warnings -from tvm._ffi.base import string_types -from tvm.contrib import graph_runtime -from tvm.runtime.ndarray import array -from . import debug_result +from . import debug_executor -_DUMP_ROOT_PREFIX = "tvmdbg_" -_DUMP_PATH_PREFIX = "_tvmdbg_" - -def create(graph_json_str, libmod, device, dump_root=None): - """Create a runtime executor module given a graph and module. - - Parameters - ---------- - graph_json_str : str - The graph to be deployed in json format output by graph compiler. - The graph can contain operator(tvm_op) that points to the name - of PackedFunc in the libmod. - - libmod : tvm.Module - The module of the corresponding function. - - device : Device - The device to deploy the module, can be local or remote. - - dump_root : str - To select which folder the outputs should be kept. 
- None will make a temp folder in /tmp/tvmdbg and does the dumping - Returns - ------- - graph_module : GraphModuleDebug - Debug Runtime graph module that can be used to execute the graph. - """ - assert isinstance(graph_json_str, string_types) - - try: - dev, num_rpc_dev, device_type_id = graph_runtime.get_device(libmod, device) - if num_rpc_dev == len(dev): - fcreate = dev[0]._rpc_sess.get_function("tvm.graph_runtime_debug.create") - else: - fcreate = tvm._ffi.get_global_func("tvm.graph_runtime_debug.create") - except ValueError: - raise ValueError( - "Please set '(USE_GRAPH_RUNTIME_DEBUG ON)' in " - "config.cmake and rebuild TVM to enable debug mode" - ) - func_obj = fcreate(graph_json_str, libmod, *device_type_id) - return GraphModuleDebug(func_obj, dev, graph_json_str, dump_root) - - -class GraphModuleDebug(graph_runtime.GraphModule): - """Graph debug runtime module. - - This is a debug wrapper over the TVM runtime. - Runtime interfaces are wrapped with debug functionalities. - Manage the debug framework to format the debug data and - trigger the user interfaces. - - Parameters - ---------- - module : Module - The internal tvm module that holds the actual graph functions. - - device : Device - The device that this module is under. - - graph_json_str : str or graph class - Content of graph json file in string format - - dump_root : str - To select which folder the outputs should be kept. - None will make a temp folder in /tmp/tvmdbg and does the dumping - """ - - def __init__(self, module, device, graph_json_str, dump_root): - self._dump_root = dump_root - self._dump_path = None - self._get_output_by_layer = module["get_output_by_layer"] - self._run_individual = module["run_individual"] - graph_runtime.GraphModule.__init__(self, module) - self._create_debug_env(graph_json_str, device) - - def _format_device(self, device): - return str(device[0]).upper().replace("(", ":").replace(")", "") - - def _ensure_dir(self, directory): - """Create a directory if not exists - - Parameters - ---------- - - directory : str - File path to create - """ - if not os.path.exists(directory): - os.makedirs(directory, 0o700) - - def _get_dump_path(self, device): - """Make the graph and tensor dump folder and return the path. - - Parameters - ---------- - device : Device - The device that this module is under. - - Returns - ------- - path : str - Directory path where the graph and node outputs will be stored. - """ - # save to file - folder_name = _DUMP_PATH_PREFIX + "device_" - folder_name = folder_name + device.replace(":", "_") - path = os.path.join(self._dump_root, folder_name) - self._ensure_dir(path) - return path - - def _remove_dump_root(self): - if os.path.isdir(self._dump_root): - shutil.rmtree(self._dump_root) - - def _create_debug_env(self, graph_json, device): - """Create UI wrapper framework to handle multiple UI frontends for tvmdbg - - Parameters - ---------- - graph_json : json format - json formatted NNVM graph contain list of each node's name, shape and type. - - nodes_list : list - List of all the nodes presented in the graph - - device : Device - The device that this module is under. 
- """ - # make the dump folder if not given - if not self._dump_root: - self._dump_root = tempfile.mkdtemp(prefix=_DUMP_ROOT_PREFIX) - - # format the device - device = self._format_device(device) - - # updates the dumping directories - self._dump_path = self._get_dump_path(device) - - # init the debug dumping environment - self.debug_datum = debug_result.DebugResult(graph_json, self._dump_path) - - def _run_debug(self): - """Execute the node specified with index will be executed. - Each debug output will be copied to the buffer - Time consumed for each execution will be set as debug output. - - """ - self.debug_datum._time_list = [[float(t)] for t in self.run_individual(10, 1, 1)] - for i, node in enumerate(self.debug_datum.get_graph_nodes()): - num_outputs = self.debug_datum.get_graph_node_output_num(node) - for j in range(num_outputs): - out_tensor = self._get_output_by_layer(i, j) - out_tensor = array(out_tensor) - self.debug_datum._output_tensor_list.append(out_tensor) - - def debug_get_output(self, node, out=None): - """Run graph up to node and get the output to out - - Parameters - ---------- - node : int / str - The node index or name - - out : NDArray - The output array container - """ - if isinstance(node, str): - output_tensors = self.debug_datum.get_output_tensors() - try: - out = output_tensors[node] - except KeyError: - node_list = output_tensors.keys() - raise RuntimeError( - "Node " + node + " not found, available nodes are: " + str(node_list) + "." - ) - elif isinstance(node, int): - output_tensors = self.debug_datum._output_tensor_list - out = output_tensors[node] - else: - raise RuntimeError("Require node index or name only.") - return out - - def run(self, **input_dict): - """Run forward execution of the graph with debug - - Parameters - ---------- - input_dict : dict of str to NDArray - List of input values to be feed to - """ - if input_dict: - self.set_input(**input_dict) - - # Step 1. Execute the graph - self._run_debug() - # Step 2. Dump the output tensors to the dump folder - self.debug_datum.dump_output_tensor() - # Step 3. Dump the Chrome trace to the dump folder - self.debug_datum.dump_chrome_trace() - # Step 4. Display the collected information - self.debug_datum.display_debug_result() - - def run_individual(self, number, repeat=1, min_repeat_ms=0): - ret = self._run_individual(number, repeat, min_repeat_ms) - return ret.strip(",").split(",") if ret else [] - - def exit(self): - """Exits the dump folder and all its contents""" - self._remove_dump_root() +def create(*args, **kwargs): + warnings.warn( + "This function has been moved to tvm.contrib.graph_executor and will be removed " + "in the next TVM release" + ) + return debug_executor.create(*args, **kwargs) diff --git a/python/tvm/contrib/graph_executor.py b/python/tvm/contrib/graph_executor.py new file mode 100644 index 0000000000000..a4bc85905f5e7 --- /dev/null +++ b/python/tvm/contrib/graph_executor.py @@ -0,0 +1,306 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Minimum graph executor that executes graph containing TVM PackedFunc.""" +import numpy as np +import tvm._ffi + +from tvm.rpc import _ffi_api as _rpc_ffi_api +from tvm.rpc import base as rpc_base +from tvm._ffi.base import string_types +from tvm._ffi.runtime_ctypes import Device + + +def create(graph_json_str, libmod, device): + """Create a runtime executor module given a graph and module. + + Parameters + ---------- + graph_json_str : str + The graph to be deployed in json format output by json graph. + The graph can contain operator(tvm_op) that points to the name + of PackedFunc in the libmod. + + libmod : tvm.runtime.Module + The module of the corresponding function + + device : Device or list of Device + The device to deploy the module. It can be local or remote when there + is only one Device. Otherwise, the first device in the list will + be used as this purpose. All device should be given for heterogeneous + execution. + + Returns + ------- + graph_module : GraphModule + Runtime graph module that can be used to execute the graph. + + Note + ---- + See also :py:class:`tvm.contrib.graph_executor.GraphModule` + for examples to directly construct a GraphModule from an exported + relay compiled library. + """ + assert isinstance(graph_json_str, string_types) + + dev, num_rpc_dev, device_type_id = get_device(libmod, device) + + if num_rpc_dev == len(dev): + fcreate = dev[0]._rpc_sess.get_function("tvm.graph_executor.create") + else: + fcreate = tvm._ffi.get_global_func("tvm.graph_executor.create") + + return GraphModule(fcreate(graph_json_str, libmod, *device_type_id)) + + +def get_device(libmod, device): + """Parse and validate all the device(s). + + Parameters + ---------- + libmod : tvm.runtime.Module + The module of the corresponding function + + device : Device or list of Device + + Returns + ------- + device : list of Device + num_rpc_dev : Number of rpc devices + device_type_id : List of device type and device id + """ + + if isinstance(device, Device): + device = [device] + elif not isinstance(device, (list, tuple)): + raise ValueError("dev has to be the type of Device or a list of Device") + for cur_dev in device: + if not isinstance(cur_dev, Device): + raise ValueError("dev has to be the type of Device or a list of Device") + + # device_type_id[0], device_type_id[1] are used as the primary/fallback + # device type and id. All other ones are used as device for + # heterogeneous execution. + num_rpc_dev = 0 + device_type_id = [] + for cur_dev in device: + device_type = cur_dev.device_type + if device_type >= rpc_base.RPC_SESS_MASK: + assert libmod.type_key == "rpc" + assert _rpc_ffi_api.SessTableIndex(libmod) == cur_dev._rpc_sess._tbl_index + num_rpc_dev += 1 + device_type = cur_dev.device_type % rpc_base.RPC_SESS_MASK + device_type_id.append(device_type) + device_type_id.append(cur_dev.device_id) + + if 0 < num_rpc_dev < len(device): + raise ValueError("Either all or none of the devices should be rpc.") + return device, num_rpc_dev, device_type_id + + +class GraphModule(object): + """Wrapper runtime module. + + This is a thin wrapper of the underlying TVM module. 
+ you can also directly call set_input, run, and get_output + of underlying module functions + + Parameters + ---------- + module : tvm.runtime.Module + The internal tvm module that holds the actual graph functions. + + Attributes + ---------- + module : tvm.runtime.Module + The internal tvm module that holds the actual graph functions. + + Examples + -------- + + .. code-block:: python + + import tvm + from tvm import relay + from tvm.contrib import graph_executor + + # build the library using graph executor + lib = relay.build(...) + lib.export_library("compiled_lib.so") + # load it back as a runtime + lib: tvm.runtime.Module = tvm.runtime.load_module("compiled_lib.so") + # Call the library factory function for default and create + # a new runtime.Module, wrap with graph module. + gmod = graph_executor.GraphModule(lib["default"](dev)) + # use the graph module. + gmod.set_input("x", data) + gmod.run() + """ + + def __init__(self, module): + self.module = module + self._set_input = module["set_input"] + self._run = module["run"] + self._get_output = module["get_output"] + self._get_input = module["get_input"] + self._get_num_outputs = module["get_num_outputs"] + self._get_num_inputs = module["get_num_inputs"] + self._load_params = module["load_params"] + self._share_params = module["share_params"] + + def set_input(self, key=None, value=None, **params): + """Set inputs to the module via kwargs + + Parameters + ---------- + key : int or str + The input key + + value : the input value. + The input key + + params : dict of str to NDArray + Additional arguments + """ + if key is not None: + v = self._get_input(key) + if v is None: + raise RuntimeError("Could not find '%s' in graph's inputs" % key) + v.copyfrom(value) + + if params: + # upload big arrays first to avoid memory issue in rpc mode + keys = list(params.keys()) + keys.sort(key=lambda x: -np.prod(params[x].shape)) + for k in keys: + # TODO(zhiics) Skip the weights for submodule in a better way. + # We should use MetadataModule for initialization and remove + # params from set_input + val = self._get_input(k) + if val: + self._get_input(k).copyfrom(params[k]) + + def run(self, **input_dict): + """Run forward execution of the graph + + Parameters + ---------- + input_dict: dict of str to NDArray + List of input values to be feed to + """ + if input_dict: + self.set_input(**input_dict) + self._run() + + def get_num_outputs(self): + """Get the number of outputs from the graph + + Returns + ------- + count : int + The number of outputs. + """ + return self._get_num_outputs() + + def get_num_inputs(self): + """Get the number of inputs to the graph + + Returns + ------- + count : int + The number of inputs. 
+ """ + return self._get_num_inputs() + + def get_input(self, index, out=None): + """Get index-th input to out + + Parameters + ---------- + index : int + The input index + + out : NDArray + The output array container + """ + if out: + self._get_input(index).copyto(out) + return out + + return self._get_input(index) + + def get_output(self, index, out=None): + """Get index-th output to out + + Parameters + ---------- + index : int + The output index + + out : NDArray + The output array container + """ + if out: + self._get_output(index, out) + return out + + return self._get_output(index) + + def debug_get_output(self, node, out): + """Run graph up to node and get the output to out + + Parameters + ---------- + node : int / str + The node index or name + + out : NDArray + The output array container + """ + raise NotImplementedError("Please use debugger.debug_executor as graph_executor instead.") + + def load_params(self, params_bytes): + """Load parameters from serialized byte array of parameter dict. + + Parameters + ---------- + params_bytes : bytearray + The serialized parameter dict. + """ + self._load_params(bytearray(params_bytes)) + + def share_params(self, other, params_bytes): + """Share parameters from pre-existing GraphExecutor instance. + + Parameters + ---------- + other: GraphExecutor + The parent GraphExecutor from which this instance should share + it's parameters. + params_bytes : bytearray + The serialized parameter dict (used only for the parameter names). + """ + self._share_params(other.module, bytearray(params_bytes)) + + def __getitem__(self, key): + """Get internal module function + + Parameters + ---------- + key : str + The key to the module. + """ + return self.module[key] diff --git a/python/tvm/contrib/graph_runtime.py b/python/tvm/contrib/graph_runtime.py index 2eea188be9779..f8ecfdd70a5bb 100644 --- a/python/tvm/contrib/graph_runtime.py +++ b/python/tvm/contrib/graph_runtime.py @@ -14,293 +14,16 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""Minimum graph runtime that executes graph containing TVM PackedFunc.""" -import numpy as np -import tvm._ffi +"""Deprecated Python API for GraphExecutor.""" -from tvm.rpc import _ffi_api as _rpc_ffi_api -from tvm.rpc import base as rpc_base -from tvm._ffi.base import string_types -from tvm._ffi.runtime_ctypes import Device +import warnings +from . import graph_executor -def create(graph_json_str, libmod, device): - """Create a runtime executor module given a graph and module. - Parameters - ---------- - graph_json_str : str - The graph to be deployed in json format output by json graph. - The graph can contain operator(tvm_op) that points to the name - of PackedFunc in the libmod. - - libmod : tvm.runtime.Module - The module of the corresponding function - - device : Device or list of Device - The device to deploy the module. It can be local or remote when there - is only one Device. Otherwise, the first device in the list will - be used as this purpose. All device should be given for heterogeneous - execution. - - Returns - ------- - graph_module : GraphModule - Runtime graph module that can be used to execute the graph. - - Note - ---- - See also :py:class:`tvm.contrib.graph_runtime.GraphModule` - for examples to directly construct a GraphModule from an exported - relay compiled library. 
- """ - assert isinstance(graph_json_str, string_types) - - dev, num_rpc_dev, device_type_id = get_device(libmod, device) - - if num_rpc_dev == len(dev): - fcreate = dev[0]._rpc_sess.get_function("tvm.graph_runtime.create") - else: - fcreate = tvm._ffi.get_global_func("tvm.graph_runtime.create") - - return GraphModule(fcreate(graph_json_str, libmod, *device_type_id)) - - -def get_device(libmod, device): - """Parse and validate all the device(s). - - Parameters - ---------- - libmod : tvm.runtime.Module - The module of the corresponding function - - device : Device or list of Device - - Returns - ------- - device : list of Device - num_rpc_dev : Number of rpc devices - device_type_id : List of device type and device id - """ - - if isinstance(device, Device): - device = [device] - elif not isinstance(device, (list, tuple)): - raise ValueError("dev has to be the type of Device or a list of Device") - for cur_dev in device: - if not isinstance(cur_dev, Device): - raise ValueError("dev has to be the type of Device or a list of Device") - - # device_type_id[0], device_type_id[1] are used as the primary/fallback - # device type and id. All other ones are used as device for - # heterogeneous execution. - num_rpc_dev = 0 - device_type_id = [] - for cur_dev in device: - device_type = cur_dev.device_type - if device_type >= rpc_base.RPC_SESS_MASK: - assert libmod.type_key == "rpc" - assert _rpc_ffi_api.SessTableIndex(libmod) == cur_dev._rpc_sess._tbl_index - num_rpc_dev += 1 - device_type = cur_dev.device_type % rpc_base.RPC_SESS_MASK - device_type_id.append(device_type) - device_type_id.append(cur_dev.device_id) - - if 0 < num_rpc_dev < len(device): - raise ValueError("Either all or none of the devices should be rpc.") - return device, num_rpc_dev, device_type_id - - -class GraphModule(object): - """Wrapper runtime module. - - This is a thin wrapper of the underlying TVM module. - you can also directly call set_input, run, and get_output - of underlying module functions - - Parameters - ---------- - module : tvm.runtime.Module - The internal tvm module that holds the actual graph functions. - - Attributes - ---------- - module : tvm.runtime.Module - The internal tvm module that holds the actual graph functions. - - Examples - -------- - - .. code-block:: python - - import tvm - from tvm import relay - from tvm.contrib import graph_runtime - - # build the library using graph runtime - lib = relay.build(...) - lib.export_library("compiled_lib.so") - # load it back as a runtime - lib: tvm.runtime.Module = tvm.runtime.load_module("compiled_lib.so") - # Call the library factory function for default and create - # a new runtime.Module, wrap with graph module. - gmod = graph_runtime.GraphModule(lib["default"](dev)) - # use the graph module. - gmod.set_input("x", data) - gmod.run() - """ - - def __init__(self, module): - self.module = module - self._set_input = module["set_input"] - self._run = module["run"] - self._get_output = module["get_output"] - self._get_input = module["get_input"] - self._get_num_outputs = module["get_num_outputs"] - self._get_num_inputs = module["get_num_inputs"] - self._load_params = module["load_params"] - self._share_params = module["share_params"] - - def set_input(self, key=None, value=None, **params): - """Set inputs to the module via kwargs - - Parameters - ---------- - key : int or str - The input key - - value : the input value. 
- The input key - - params : dict of str to NDArray - Additional arguments - """ - if key is not None: - v = self._get_input(key) - if v is None: - raise RuntimeError("Could not find '%s' in graph's inputs" % key) - v.copyfrom(value) - - if params: - # upload big arrays first to avoid memory issue in rpc mode - keys = list(params.keys()) - keys.sort(key=lambda x: -np.prod(params[x].shape)) - for k in keys: - # TODO(zhiics) Skip the weights for submodule in a better way. - # We should use MetadataModule for initialization and remove - # params from set_input - val = self._get_input(k) - if val: - self._get_input(k).copyfrom(params[k]) - - def run(self, **input_dict): - """Run forward execution of the graph - - Parameters - ---------- - input_dict: dict of str to NDArray - List of input values to be feed to - """ - if input_dict: - self.set_input(**input_dict) - self._run() - - def get_num_outputs(self): - """Get the number of outputs from the graph - - Returns - ------- - count : int - The number of outputs. - """ - return self._get_num_outputs() - - def get_num_inputs(self): - """Get the number of inputs to the graph - - Returns - ------- - count : int - The number of inputs. - """ - return self._get_num_inputs() - - def get_input(self, index, out=None): - """Get index-th input to out - - Parameters - ---------- - index : int - The input index - - out : NDArray - The output array container - """ - if out: - self._get_input(index).copyto(out) - return out - - return self._get_input(index) - - def get_output(self, index, out=None): - """Get index-th output to out - - Parameters - ---------- - index : int - The output index - - out : NDArray - The output array container - """ - if out: - self._get_output(index, out) - return out - - return self._get_output(index) - - def debug_get_output(self, node, out): - """Run graph up to node and get the output to out - - Parameters - ---------- - node : int / str - The node index or name - - out : NDArray - The output array container - """ - raise NotImplementedError("Please use debugger.debug_runtime as graph_runtime instead.") - - def load_params(self, params_bytes): - """Load parameters from serialized byte array of parameter dict. - - Parameters - ---------- - params_bytes : bytearray - The serialized parameter dict. - """ - self._load_params(bytearray(params_bytes)) - - def share_params(self, other, params_bytes): - """Share parameters from pre-existing GraphRuntime instance. - - Parameters - ---------- - other: GraphRuntime - The parent GraphRuntime from which this instance should share - it's parameters. - params_bytes : bytearray - The serialized parameter dict (used only for the parameter names). - """ - self._share_params(other.module, bytearray(params_bytes)) - - def __getitem__(self, key): - """Get internal module function - - Parameters - ---------- - key : str - The key to the module. - """ - return self.module[key] +def create(*args, **kwargs): + warnings.warn( + "This function has been moved to tvm.contrib.graph_executor and will be removed " + "in the next TVM release" + ) + return graph_executor.create(*args, **kwargs) diff --git a/python/tvm/driver/tvmc/compiler.py b/python/tvm/driver/tvmc/compiler.py index 83791e50f6d5a..5bdb578f2c16c 100644 --- a/python/tvm/driver/tvmc/compiler.py +++ b/python/tvm/driver/tvmc/compiler.py @@ -143,7 +143,7 @@ def compile_model( This function takes a union of the arguments of both frontends.load_model and compiler.compile_relay. The resulting TVM module can be executed using - the graph runtime. 
+ the graph executor. Parameters ---------- diff --git a/python/tvm/driver/tvmc/runner.py b/python/tvm/driver/tvmc/runner.py index 252647557ab5d..b4c4e75aa37ab 100644 --- a/python/tvm/driver/tvmc/runner.py +++ b/python/tvm/driver/tvmc/runner.py @@ -26,8 +26,8 @@ import numpy as np from tvm import rpc from tvm.autotvm.measure import request_remote -from tvm.contrib import graph_runtime as runtime -from tvm.contrib.debugger import debug_runtime +from tvm.contrib import graph_executor as runtime +from tvm.contrib.debugger import debug_executor from tvm.relay import load_param_dict from . import common @@ -77,7 +77,7 @@ def add_run_parser(subparsers): "--profile", action="store_true", help="generate profiling data from the runtime execution. " - "Using --profile requires the Graph Runtime Debug enabled on TVM. " + "Using --profile requires the Graph Executor Debug enabled on TVM. " "Profiling may also have an impact on inference time, " "making it take longer to be generated.", ) @@ -296,7 +296,7 @@ def run_module( repeat=1, profile=False, ): - """Run a compiled graph runtime module locally or remotely with + """Run a compiled graph executor module locally or remotely with optional input values. If input tensors are not specified explicitly, they can be filled @@ -370,7 +370,7 @@ def run_module( if profile: logger.debug("creating runtime with profiling enabled") - module = debug_runtime.create(graph, lib, dev, dump_root="./prof") + module = debug_executor.create(graph, lib, dev, dump_root="./prof") else: logger.debug("creating runtime with profiling disabled") module = runtime.create(graph, lib, dev) diff --git a/python/tvm/micro/__init__.py b/python/tvm/micro/__init__.py index ade63f2da9e48..a70cb96d9b135 100644 --- a/python/tvm/micro/__init__.py +++ b/python/tvm/micro/__init__.py @@ -25,8 +25,8 @@ from .micro_binary import MicroBinary from .model_library_format import export_model_library_format, UnsupportedInModelLibraryFormatError from .session import ( - create_local_graph_runtime, - create_local_debug_runtime, + create_local_graph_executor, + create_local_debug_executor, Session, SessionTerminatedError, ) diff --git a/python/tvm/micro/model_library_format.py b/python/tvm/micro/model_library_format.py index 4ce80be647c1b..6768e03f4473f 100644 --- a/python/tvm/micro/model_library_format.py +++ b/python/tvm/micro/model_library_format.py @@ -24,7 +24,7 @@ import tarfile from ..contrib import utils -from ..relay.backend import graph_runtime_factory +from ..relay.backend import graph_executor_factory from ..relay import param_dict @@ -117,7 +117,7 @@ def _build_memory_map(graph_json): return memory_map -def export_model_library_format(mod: graph_runtime_factory.GraphRuntimeFactoryModule, file_name): +def export_model_library_format(mod: graph_executor_factory.GraphExecutorFactoryModule, file_name): """Export the build artifact in Model Library Format. This function creates a .tar archive containing the build artifacts in a standardized @@ -126,7 +126,7 @@ def export_model_library_format(mod: graph_runtime_factory.GraphRuntimeFactoryMo Parameters ---------- - mod : tvm.relay.backend.graph_runtime_factory.GraphRuntimeFactoryModule + mod : tvm.relay.backend.graph_executor_factory.GraphExecutorFactoryModule The return value of tvm.relay.build, which will be exported into Model Library Format. file_name : str Path to the .tar archive to generate. 
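
Since `export_model_library_format` is now typed against the renamed factory class, a short usage sketch may help; `mod` and `params` stand in for a frontend-imported model, and the target choice is an illustrative assumption, not something this patch prescribes:

.. code-block:: python

    import tvm.micro
    from tvm import relay

    # `mod` and `params` are placeholders for a model imported through a
    # Relay frontend; a C-runtime target is assumed for a microTVM build.
    factory = relay.build(mod, target="c", params=params)
    tvm.micro.export_model_library_format(factory, "./model.tar")
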
diff --git a/python/tvm/micro/session.py b/python/tvm/micro/session.py index 8987883abafbc..78bf033799397 100644 --- a/python/tvm/micro/session.py +++ b/python/tvm/micro/session.py @@ -22,8 +22,8 @@ from ..error import register_error from .._ffi import get_global_func -from ..contrib import graph_runtime -from ..contrib.debugger import debug_runtime +from ..contrib import graph_executor +from ..contrib.debugger import debug_executor from ..rpc import RPCSession from .transport import IoTimeoutError from .transport import TransportLogger @@ -92,7 +92,7 @@ def __init__( self.timeout_override = timeout_override self._rpc = None - self._graph_runtime = None + self._graph_executor = None def get_system_lib(self): return self._rpc.get_function("runtime.SystemLib")() @@ -192,8 +192,8 @@ def lookup_remote_linked_param(mod, storage_id, template_tensor, device): ) -def create_local_graph_runtime(graph_json_str, mod, device): - """Create a local graph runtime driving execution on the remote CPU device given. +def create_local_graph_executor(graph_json_str, mod, device): + """Create a local graph executor driving execution on the remote CPU device given. Parameters ---------- @@ -208,17 +208,17 @@ def create_local_graph_runtime(graph_json_str, mod, device): Returns ------- - tvm.contrib.GraphRuntime : - A local graph runtime instance that executes on the remote device. + tvm.contrib.GraphExecutor : + A local graph executor instance that executes on the remote device. """ device_type_id = [device.device_type, device.device_id] - fcreate = get_global_func("tvm.graph_runtime.create") - return graph_runtime.GraphModule( + fcreate = get_global_func("tvm.graph_executor.create") + return graph_executor.GraphModule( fcreate(graph_json_str, mod, lookup_remote_linked_param, *device_type_id) ) -def create_local_debug_runtime(graph_json_str, mod, device, dump_root=None): +def create_local_debug_executor(graph_json_str, mod, device, dump_root=None): """Create a local debug runtime driving execution on the remote CPU device given. Parameters @@ -237,12 +237,12 @@ def create_local_debug_runtime(graph_json_str, mod, device, dump_root=None): Returns ------- - tvm.contrib.GraphRuntime : - A local graph runtime instance that executes on the remote device. + tvm.contrib.GraphExecutor : + A local graph executor instance that executes on the remote device. """ device_type_id = [device.device_type, device.device_id] - fcreate = get_global_func("tvm.graph_runtime_debug.create") - return debug_runtime.GraphModuleDebug( + fcreate = get_global_func("tvm.graph_executor_debug.create") + return debug_executor.GraphModuleDebug( fcreate(graph_json_str, mod, lookup_remote_linked_param, *device_type_id), [device], graph_json_str, diff --git a/python/tvm/relay/analysis/analysis.py b/python/tvm/relay/analysis/analysis.py index 3928f77b2607f..661d7523ad775 100644 --- a/python/tvm/relay/analysis/analysis.py +++ b/python/tvm/relay/analysis/analysis.py @@ -405,7 +405,7 @@ def search_fc_transpose(expr): def get_calibration_data(mod, data): """Get the calibration data of a given relay graph - This pass uses the graph runtime to get the calibration data of a module, which + This pass uses the graph executor to get the calibration data of a module, which includes the input and output values of each function. The returned data uses the GlobalVar of each function as a key. Users can further access the inputs and outputs by using `inputs` or `outputs` as the key. 
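
To make the documented access pattern concrete, a minimal sketch follows; `mod` (an already-partitioned module) and `data` (a dict mapping input names to arrays) are placeholders, not values defined by this patch:

.. code-block:: python

    from tvm.relay.analysis import get_calibration_data

    # `mod` is a partitioned tvm.IRModule and `data` maps input names to
    # tvm.nd.NDArray values; both are placeholders here.
    calib_data = get_calibration_data(mod, data)
    for gvar, record in calib_data.items():
        function_inputs = record["inputs"]
        function_outputs = record["outputs"]
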
diff --git a/python/tvm/relay/backend/graph_runtime_codegen.py b/python/tvm/relay/backend/graph_executor_codegen.py
similarity index 91%
rename from python/tvm/relay/backend/graph_runtime_codegen.py
rename to python/tvm/relay/backend/graph_executor_codegen.py
index ec679aee894ca..f24bf2c2b55b3 100644
--- a/python/tvm/relay/backend/graph_runtime_codegen.py
+++ b/python/tvm/relay/backend/graph_executor_codegen.py
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 """
-A compiler from a Relay expression to TVM's graph runtime.
+A compiler from a Relay expression to TVM's graph executor.
 
 The compiler is built from a few pieces.
 
@@ -29,9 +29,9 @@
 graph language is composed of Node, NodeRef, InputNode, OpNode.
 This "little language" represents programs in TVM's graph format.
 
-To connect to the graph runtime, we use a printer that converts our graph format
+To connect to the graph executor, we use a printer that converts our graph format
 into TVM's JSON format. The resulting string can be loaded by
-contrib.graph_runtime or any other TVM runtime compatible systems.
+contrib.graph_executor or any other TVM runtime compatible systems.
 """
 from tvm.runtime.ndarray import empty
 from tvm.relay import _build_module
@@ -39,11 +39,11 @@
 from tvm.tir import expr as _expr
 
 
-class GraphRuntimeCodegen(object):
+class GraphExecutorCodegen(object):
     """The compiler from Relay to the TVM runtime system."""
 
     def __init__(self, mod, target):
-        self._mod = _build_module._GraphRuntimeCodegen()
+        self._mod = _build_module._GraphExecutorCodegen()
         self._init = self._mod["init"]
         self._codegen = self._mod["codegen"]
         self._get_graph_json = self._mod["get_graph_json"]
diff --git a/python/tvm/relay/backend/graph_runtime_factory.py b/python/tvm/relay/backend/graph_executor_factory.py
similarity index 88%
rename from python/tvm/relay/backend/graph_runtime_factory.py
rename to python/tvm/relay/backend/graph_executor_factory.py
index e92ae710ca0b0..d6959d22e5c87 100644
--- a/python/tvm/relay/backend/graph_runtime_factory.py
+++ b/python/tvm/relay/backend/graph_executor_factory.py
@@ -14,16 +14,16 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-"""Graph runtime factory."""
+"""Graph executor factory."""
 import warnings
 from ..._ffi.base import string_types
 from ..._ffi.registry import get_global_func
 from ...runtime import ndarray


-class GraphRuntimeFactoryModule:
-    """Graph runtime factory module.
-    This is a module of graph runtime factory
+class GraphExecutorFactoryModule:
+    """Graph executor factory module.
+    This is a module of graph executor factory
 
     Parameters
     ----------
@@ -43,7 +43,7 @@ class GraphRuntimeFactoryModule:
 
     def __init__(self, ir_mod, target, graph_json_str, libmod, libmod_name, params):
         assert isinstance(graph_json_str, string_types)
-        fcreate = get_global_func("tvm.graph_runtime_factory.create")
+        fcreate = get_global_func("tvm.graph_executor_factory.create")
         args = []
         for k, v in params.items():
             args.append(k)
@@ -77,9 +77,9 @@ def __getitem__(self, item):
 
     def __iter__(self):
         warnings.warn(
-            "legacy graph runtime behavior of producing json / lib / params will be "
+            "legacy graph executor behavior of producing json / lib / params will be "
            "removed in the next release."
- " Please see documents of tvm.contrib.graph_runtime.GraphModule for the " + " Please see documents of tvm.contrib.graph_executor.GraphModule for the " " new recommended usage.", DeprecationWarning, 2, diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index efe495e816a22..4795a2d386857 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. """ -Construct the necessary state for the TVM graph runtime +Construct the necessary state for the TVM graph executor from a Relay expression. """ import warnings @@ -27,13 +27,13 @@ from tvm.tir import expr as tvm_expr from .. import nd as _nd, autotvm, register_func from ..target import Target -from ..contrib import graph_runtime as _graph_rt +from ..contrib import graph_executor as _graph_rt from . import _build_module from . import ty as _ty from . import expr as _expr from . import function as _function from .transform import InferType -from .backend import graph_runtime_factory as _graph_runtime_factory +from .backend import graph_executor_factory as _graph_executor_factory from .backend import interpreter as _interpreter from .backend.vm import VMExecutor @@ -70,7 +70,7 @@ def _convert_param_map(params): class BuildModule(object): - """Build an IR module to run on TVM graph runtime. This class is used + """Build an IR module to run on TVM graph executor. This class is used to expose the `RelayBuildModule` APIs implemented in C++. """ @@ -110,8 +110,8 @@ def build(self, mod, target=None, target_host=None, params=None): Returns ------- - factory_module : tvm.relay.backend.graph_runtime_factory.GraphRuntimeFactoryModule - The runtime factory for the TVM graph runtime. + factory_module : tvm.relay.backend.graph_executor_factory.GraphExecutorFactoryModule + The runtime factory for the TVM graph executor. """ target = _update_target(target) @@ -211,7 +211,7 @@ def _build_module_no_factory(mod, target=None, target_host=None, params=None, mo def build(ir_mod, target=None, target_host=None, params=None, mod_name="default"): # fmt: off # pylint: disable=line-too-long - """Helper function that builds a Relay function to run on TVM graph runtime. + """Helper function that builds a Relay function to run on TVM graph executor. Parameters ---------- @@ -241,7 +241,7 @@ def build(ir_mod, target=None, target_host=None, params=None, mod_name="default" Returns ------- graph_json : str - The json string that can be accepted by graph runtime. + The json string that can be accepted by graph executor. mod : tvm.Module The module containing necessary libraries. 
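
Read together with the `__iter__` deprecation above, the recommended calling convention is roughly the following sketch; `mod` and `params` are placeholders for a frontend-imported model:

.. code-block:: python

    import tvm
    from tvm import relay
    from tvm.contrib import graph_executor

    # `mod` and `params` are placeholders for a frontend-imported model.
    lib = relay.build(mod, target="llvm", params=params)
    # Legacy tuple-style unpacking (graph_json, lib_mod, params = lib)
    # still works but now emits the DeprecationWarning above.
    dev = tvm.cpu(0)
    gmod = graph_executor.GraphModule(lib["default"](dev))
    gmod.run()
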
@@ -281,10 +281,10 @@ def build(ir_mod, target=None, target_host=None, params=None, mod_name="default"
     with tophub_context:
         bld_mod = BuildModule()
         graph_json, runtime_mod, params = bld_mod.build(ir_mod, target, target_host, params)
-        runtime_mod = _graph_runtime_factory.GraphRuntimeFactoryModule(
+        executor_factory = _graph_executor_factory.GraphExecutorFactoryModule(
             ir_mod, target, graph_json, runtime_mod, mod_name, params
         )
-        return runtime_mod
+        return executor_factory
 
 
 def optimize(mod, target=None, params=None):
@@ -392,7 +392,9 @@ def _make_executor(self, expr=None):
         self.mod = InferType()(self.mod)
         ret_type = self.mod["main"].checked_type.ret_type
         if _ty.is_dynamic(ret_type):
-            raise ValueError("Graph Runtime only supports static graphs, got output type", ret_type)
+            raise ValueError(
+                "Graph Executor only supports static graphs, got output type", ret_type
+            )
         mod = build(self.mod, target=self.target)
         gmodule = _graph_rt.GraphModule(mod["default"](self.device))
@@ -444,7 +446,7 @@ def create_executor(kind="debug", mod=None, device=None, target="llvm"):
     ----------
     kind : str
         The type of executor. Available options are `debug` for the
-        interpreter, `graph` for the graph runtime, and `vm` for the virtual
+        interpreter, `graph` for the graph executor, and `vm` for the virtual
         machine.
 
     mod : :py:class:`~tvm.IRModule`
diff --git a/python/tvm/relay/frontend/common.py b/python/tvm/relay/frontend/common.py
index 7bb70cec5ee5f..c2546205c5719 100644
--- a/python/tvm/relay/frontend/common.py
+++ b/python/tvm/relay/frontend/common.py
@@ -530,13 +530,13 @@ def infer_value(input_val, params, mod=None):
     try:
         # TODO(kevinthesun): Use VM for all cases.
         # pylint: disable=import-outside-toplevel
-        from tvm.contrib import graph_runtime
+        from tvm.contrib import graph_executor
 
         func = _function.Function(analysis.free_vars(input_val), input_val)
         with tvm.transform.PassContext(opt_level=0):
             lib = tvm.relay.build(func, target="llvm", params=params)
         dev = tvm.cpu(0)
-        m = graph_runtime.GraphModule(lib["default"](dev))
+        m = graph_executor.GraphModule(lib["default"](dev))
         m.run()
         return m.get_output(0)
     except Exception:
diff --git a/python/tvm/relay/op/contrib/arm_compute_lib.py b/python/tvm/relay/op/contrib/arm_compute_lib.py
index fabb639845b6b..17fdbf941e08c 100644
--- a/python/tvm/relay/op/contrib/arm_compute_lib.py
+++ b/python/tvm/relay/op/contrib/arm_compute_lib.py
@@ -30,7 +30,7 @@
 
 
 def is_arm_compute_runtime_enabled():
-    """Check if the ACL graph runtime is present.
+    """Check if the ACL graph executor is present.
 
     Returns
     -------
diff --git a/python/tvm/relay/op/contrib/tensorrt.py b/python/tvm/relay/op/contrib/tensorrt.py
index afdea97123424..a36b66c8f0dd3 100644
--- a/python/tvm/relay/op/contrib/tensorrt.py
+++ b/python/tvm/relay/op/contrib/tensorrt.py
@@ -29,7 +29,7 @@
 
 
 def is_tensorrt_runtime_enabled():
-    """Check if the TensorRT graph runtime is present.
+    """Check if the TensorRT graph executor is present.
     Returns
     -------
     ret: bool
diff --git a/python/tvm/relay/quantize/_calibrate.py b/python/tvm/relay/quantize/_calibrate.py
index 20afa1909ac96..a906a98dccd45 100644
--- a/python/tvm/relay/quantize/_calibrate.py
+++ b/python/tvm/relay/quantize/_calibrate.py
@@ -29,7 +29,7 @@
 from .. import expr as _expr
 from .. import analysis as _analysis
 from .. import build_module as _build_module
-from ...contrib import graph_runtime
+from ...contrib import graph_executor
 from .kl_divergence import _find_scale_by_kl
 
 
@@ -46,7 +46,7 @@ def _get_profile_runtime(mod):
     with tvm.transform.PassContext(opt_level=3):
         lib = _build_module.build(func, target=target)
 
-    runtime = graph_runtime.GraphModule(lib["default"](dev))
+    runtime = graph_executor.GraphModule(lib["default"](dev))
 
     return runtime
 
diff --git a/rust/tvm-graph-rt/Cargo.toml b/rust/tvm-graph-rt/Cargo.toml
index 13837f62695d4..5c492393a75e7 100644
--- a/rust/tvm-graph-rt/Cargo.toml
+++ b/rust/tvm-graph-rt/Cargo.toml
@@ -19,7 +19,7 @@
 name = "tvm-graph-rt"
 version = "0.1.0"
 license = "Apache-2.0"
-description = "A static graph runtime for TVM."
+description = "A static graph executor for TVM."
 repository = "https://github.com/apache/tvm"
 readme = "README.md"
 keywords = ["tvm"]
diff --git a/rust/tvm-graph-rt/src/graph.rs b/rust/tvm-graph-rt/src/graph.rs
index 0174b0cacf3ee..de2e7dddff5cd 100644
--- a/rust/tvm-graph-rt/src/graph.rs
+++ b/rust/tvm-graph-rt/src/graph.rs
@@ -39,7 +39,7 @@ use crate::{errors::*, Module, Storage, Tensor};
 // @see `kTVMNDArrayMagic` in `ndarray.h`
 const _NDARRAY_MAGIC: u64 = 0xDD5E_40F0_96B4_A13F;
 
-// @see `kTVMNDArrayListMagic` in `graph_runtime.h`
+// @see `kTVMNDArrayListMagic` in `graph_executor.h`
 const _NDARRAY_LIST_MAGIC: u64 = 0xF7E5_8D4F_0504_9CB7;
 
 /// A TVM computation graph.
diff --git a/rust/tvm/README.md b/rust/tvm/README.md
index 75fabe7d9a1b0..b518f93195b7a 100644
--- a/rust/tvm/README.md
+++ b/rust/tvm/README.md
@@ -37,7 +37,7 @@ The Rust bindings are composed of a few crates:
 - The [tvm_rt](https://tvm.apache.org/docs/api/rust/tvm_rt/index.html) crate which exposes Rust
   bindings to the TVM runtime APIs.
 - The [tvm_sys] crate which provides raw bindings and linkage to the TVM C++ library.
-- The [tvm_graph_rt] crate which implements a version of the TVM graph runtime in Rust vs. C++.
+- The [tvm_graph_rt] crate which implements a version of the TVM graph executor in Rust vs. C++.
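
The Rust crates resolve the same packed functions that this series re-registers on the C++ side, so the rename is visible from any frontend. As a sketch, the creator can be fetched by its new registry name (the old name was `tvm.graph_runtime.create`):

.. code-block:: python

    import tvm

    # The graph executor creator is registered by the C++ runtime; this
    # patch series renames it from "tvm.graph_runtime.create".
    fcreate = tvm._ffi.get_global_func("tvm.graph_executor.create")
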
These crates have been recently refactored and reflect a much different philosophy than
previous bindings, as well as much increased support for more of the TVM API including
diff --git a/rust/tvm/examples/resnet/src/build_resnet.py b/rust/tvm/examples/resnet/src/build_resnet.py
index 6c99dee22bf54..13c499b54deb5 100644
--- a/rust/tvm/examples/resnet/src/build_resnet.py
+++ b/rust/tvm/examples/resnet/src/build_resnet.py
@@ -29,7 +29,7 @@
 from tvm import te
 from tvm import relay, runtime
 from tvm.relay import testing
-from tvm.contrib import graph_runtime, cc
+from tvm.contrib import graph_executor, cc
 from PIL import Image
 from tvm.contrib.download import download_testdata
 from mxnet.gluon.model_zoo.vision import get_model
@@ -141,7 +141,7 @@ def test_build(build_dir):
     params = bytearray(open(osp.join(build_dir, "deploy_param.params"), "rb").read())
     input_data = get_cat_image()
     dev = tvm.cpu()
-    module = graph_runtime.create(graph, lib, dev)
+    module = graph_executor.create(graph, lib, dev)
     module.load_params(params)
     module.run(data=input_data)
     out = module.get_output(0).asnumpy()
@@ -151,7 +151,7 @@ def test_build(build_dir):
 
 
 if __name__ == "__main__":
-    logger.info("Compiling the model to graph runtime.")
+    logger.info("Compiling the model to graph executor.")
     build(build_dir)
     logger.info("Testing the model's prediction on test data.")
     test_build(build_dir)
diff --git a/rust/tvm/src/lib.rs b/rust/tvm/src/lib.rs
index 047b2cf220e08..81abe338bd1b9 100644
--- a/rust/tvm/src/lib.rs
+++ b/rust/tvm/src/lib.rs
@@ -24,7 +24,7 @@
 //! One particular use case is that given optimized deep learning model artifacts,
 //! (compiled with TVM) which include a shared library
 //! `lib.so`, `graph.json` and a byte-array `param.params`, one can load them
-//! in Rust idiomatically to create a TVM Graph Runtime and
+//! in Rust idiomatically to create a TVM Graph Executor and
 //! run the model for some inputs and get the
 //! desired predictions *all in Rust*.
 //!
diff --git a/rust/tvm/src/runtime/graph_rt.rs b/rust/tvm/src/runtime/graph_rt.rs
index cba2875c99522..421a00386cf57 100644
--- a/rust/tvm/src/runtime/graph_rt.rs
+++ b/rust/tvm/src/runtime/graph_rt.rs
@@ -22,19 +22,19 @@ use std::convert::TryInto;
 use crate::runtime::Function;
 use crate::{runtime::function::Result, runtime::ByteArray, Device, Module, NDArray};
 
-/// An instance of the C++ graph runtime.
+/// An instance of the C++ graph executor.
 ///
 /// An efficient and light weight runtime for static deep learning models.
 pub struct GraphRt {
-    /// The backing graph runtime module which exposes a set of packed functions
+    /// The backing graph executor module which exposes a set of packed functions
     /// which can be invoked by a client.
     ///
-    /// In the graph runtime module, it exposes create, load_params, set_input, get_output, and run.
+    /// In the graph executor module, it exposes create, load_params, set_input, get_output, and run.
     module: Module,
 }
 
 impl GraphRt {
-    /// Create a graph runtime directly from a runtime module.
+    /// Create a graph executor directly from a runtime module.
     pub fn from_module(module: Module, dev: Device) -> Result<Self> {
         let default: Box<dyn Fn(Device) -> Result<Module>> = module.get_function("default", false)?.into();
 
@@ -44,9 +44,9 @@ impl GraphRt {
         })
     }
 
-    /// Create a graph runtime from the deprecated graph, lib, dev triple.
+    /// Create a graph executor from the deprecated graph, lib, dev triple.
pub fn create_from_parts(graph: &str, lib: Module, dev: Device) -> Result { - let runtime_create_fn = Function::get("tvm.graph_runtime.create").unwrap(); + let runtime_create_fn = Function::get("tvm.graph_executor.create").unwrap(); let runtime_create_fn_ret = runtime_create_fn.invoke(vec![ graph.into(), @@ -55,9 +55,9 @@ impl GraphRt { // NOTE you must pass the device id in as i32 because that's what TVM expects (dev.device_id as i32).into(), ]); - let graph_runtime_module: Module = runtime_create_fn_ret?.try_into()?; + let graph_executor_module: Module = runtime_create_fn_ret?.try_into()?; Ok(Self { - module: graph_runtime_module, + module: graph_executor_module, }) } @@ -92,13 +92,13 @@ impl GraphRt { Ok(()) } - /// Extract the ith output from the graph runtime and returns it. + /// Extract the ith output from the graph executor and returns it. pub fn get_output(&mut self, i: i64) -> Result { let get_output_fn = self.module.get_function("get_output", false)?; get_output_fn.invoke(vec![i.into()])?.try_into() } - /// Extract the ith output from the graph runtime and write the results into output. + /// Extract the ith output from the graph executor and write the results into output. pub fn get_output_into(&mut self, i: i64, output: NDArray) -> Result<()> { let get_output_fn = self.module.get_function("get_output", false)?; get_output_fn.invoke(vec![i.into(), output.into()])?; diff --git a/src/relay/analysis/get_calibration_data.cc b/src/relay/analysis/get_calibration_data.cc index 70fe2a68f21e5..12bab1e38ddd3 100644 --- a/src/relay/analysis/get_calibration_data.cc +++ b/src/relay/analysis/get_calibration_data.cc @@ -36,7 +36,7 @@ namespace relay { /*! * \brief This function returns a module that will be used by - * the relay graph runtime for collecting the calibration data. + * the relay graph executor for collecting the calibration data. * To do that, we first make all inputs and outputs of each * function into the final output (i.e., the final output is a * tuple of tensors). Then, we change the compiler attribute of @@ -106,7 +106,7 @@ IRModule GetCalibrateModule(IRModule module) { } } } - // reset the attribute of functions for running graph runtime + // reset the attribute of functions for running graph executor for (const auto& pair : glob_funcs) { if (auto* fn = pair.second.as()) { auto func = GetRef(fn); diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index 08846925bede7..3995d5ab3568d 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -19,7 +19,7 @@ /*! * \file relay/backend/build_module.cc - * \brief Code generation for TVM's graph runtime. + * \brief Code generation for TVM's graph executor. */ #include #include @@ -60,7 +60,7 @@ struct BuildOutput { struct GraphCodegen { public: GraphCodegen() { - auto pf = GetPackedFunc("relay.build_module._GraphRuntimeCodegen"); + auto pf = GetPackedFunc("relay.build_module._GraphExecutorCodegen"); mod = (*pf)(); } ~GraphCodegen() {} @@ -228,7 +228,7 @@ class RelayBuildModule : public runtime::ModuleNode { const char* type_key() const final { return "RelayBuildModule"; } /*! 
- * \brief Build relay IRModule for graph runtime + * \brief Build relay IRModule for graph executor * * \param mod Relay IRModule * \param target Target device diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc index f492b70565ace..0777b19ec557a 100644 --- a/src/relay/backend/compile_engine.cc +++ b/src/relay/backend/compile_engine.cc @@ -262,7 +262,7 @@ class ScheduleGetter : public backend::MemoizedExprTranslator> ICHECK(tuple_type) << "Expect output to be a tuple type"; ICHECK_EQ(tuple_type->fields.size(), outputs.size()); } - // Set the name to `__copy`. It will be detected in graph runtime to perform + // Set the name to `__copy`. It will be detected in graph executor to perform // data copy across devices. if (op == device_copy_op_) { readable_name_stream_.str(std::string()); diff --git a/src/relay/backend/contrib/arm_compute_lib/codegen.cc b/src/relay/backend/contrib/arm_compute_lib/codegen.cc index e0669ae64bdb2..8098c8d512741 100644 --- a/src/relay/backend/contrib/arm_compute_lib/codegen.cc +++ b/src/relay/backend/contrib/arm_compute_lib/codegen.cc @@ -376,12 +376,12 @@ runtime::Module ACLCompiler(const ObjectRef& ref) { TVM_REGISTER_GLOBAL("relay.ext.arm_compute_lib").set_body_typed(ACLCompiler); /*! - * \brief Check whether ACL graph runtime is used. + * \brief Check whether ACL graph executor is used. * - * \return True if ACL graph runtime is enabled, False if not. + * \return True if ACL graph executor is enabled, False if not. */ inline constexpr bool IsACLRuntimeEnabled() { -#if TVM_GRAPH_RUNTIME_ARM_COMPUTE_LIB +#if TVM_GRAPH_EXECUTOR_ARM_COMPUTE_LIB return true; #else return false; diff --git a/src/relay/backend/contrib/tensorrt/codegen.cc b/src/relay/backend/contrib/tensorrt/codegen.cc index 059dbc192a045..e121b6010ad8b 100644 --- a/src/relay/backend/contrib/tensorrt/codegen.cc +++ b/src/relay/backend/contrib/tensorrt/codegen.cc @@ -32,7 +32,7 @@ #include "../../utils.h" #include "../codegen_json/codegen_json.h" -#if TVM_GRAPH_RUNTIME_TENSORRT +#if TVM_GRAPH_EXECUTOR_TENSORRT #include "NvInfer.h" #endif @@ -217,15 +217,15 @@ runtime::Module TensorRTCompiler(const ObjectRef& ref) { TVM_REGISTER_GLOBAL("relay.ext.tensorrt").set_body_typed(TensorRTCompiler); /*! - * \brief Check whether TensorRT graph runtime is enabled. + * \brief Check whether TensorRT graph executor is enabled. * \return True if enabled, False if not. */ inline constexpr bool IsTensorRTRuntimeEnabled() { -#if TVM_GRAPH_RUNTIME_TENSORRT +#if TVM_GRAPH_EXECUTOR_TENSORRT return true; #else return false; -#endif // TVM_GRAPH_RUNTIME_TENSORRT +#endif // TVM_GRAPH_EXECUTOR_TENSORRT } /*! @@ -234,11 +234,11 @@ inline constexpr bool IsTensorRTRuntimeEnabled() { * runtime is not enabled. */ Array GetTensorRTVersion() { -#if TVM_GRAPH_RUNTIME_TENSORRT +#if TVM_GRAPH_EXECUTOR_TENSORRT return {Integer(NV_TENSORRT_MAJOR), Integer(NV_TENSORRT_MINOR), Integer(NV_TENSORRT_PATCH)}; #else return {}; -#endif // TVM_GRAPH_RUNTIME_TENSORRT +#endif // TVM_GRAPH_EXECUTOR_TENSORRT } TVM_REGISTER_GLOBAL("relay.op.is_tensorrt_runtime_enabled") diff --git a/src/relay/backend/graph_runtime_codegen.cc b/src/relay/backend/graph_executor_codegen.cc similarity index 96% rename from src/relay/backend/graph_runtime_codegen.cc rename to src/relay/backend/graph_executor_codegen.cc index 7ed1504951048..72989b5ba46aa 100644 --- a/src/relay/backend/graph_runtime_codegen.cc +++ b/src/relay/backend/graph_executor_codegen.cc @@ -19,7 +19,7 @@ /*! 
* \file relay/backend/graph_codegen.cc - * \brief Graph runtime codegen + * \brief Graph executor codegen */ #include @@ -181,10 +181,10 @@ class GraphOpNode : public GraphNode { const std::string op_type_name_{"tvm_op"}; }; -/*! \brief Code generator for graph runtime */ -class GraphRuntimeCodegen : public backend::MemoizedExprTranslator> { +/*! \brief Code generator for graph executor */ +class GraphExecutorCodegen : public backend::MemoizedExprTranslator> { public: - GraphRuntimeCodegen(runtime::Module* mod, const TargetsMap& targets) : mod_(mod) { + GraphExecutorCodegen(runtime::Module* mod, const TargetsMap& targets) : mod_(mod) { compile_engine_ = CompileEngine::Global(); targets_ = targets; } @@ -541,7 +541,7 @@ class GraphRuntimeCodegen : public backend::MemoizedExprTranslator& sptr_to_self) { if (name == "init") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { @@ -573,8 +573,8 @@ class GraphRuntimeCodegenModule : public runtime::ModuleNode { ICHECK(dev_type); targets[dev_type->value] = it.second; } - codegen_ = - std::make_shared(reinterpret_cast(mod), targets); + codegen_ = std::make_shared(reinterpret_cast(mod), + targets); }); } else if (name == "codegen") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { @@ -619,19 +619,19 @@ class GraphRuntimeCodegenModule : public runtime::ModuleNode { } } - const char* type_key() const final { return "RelayGraphRuntimeCodegenModule"; } + const char* type_key() const final { return "RelayGraphExecutorCodegenModule"; } private: - std::shared_ptr codegen_; + std::shared_ptr codegen_; LoweredOutput output_; }; runtime::Module CreateGraphCodegenMod() { - auto ptr = make_object(); + auto ptr = make_object(); return runtime::Module(ptr); } -TVM_REGISTER_GLOBAL("relay.build_module._GraphRuntimeCodegen") +TVM_REGISTER_GLOBAL("relay.build_module._GraphExecutorCodegen") .set_body([](TVMArgs args, TVMRetValue* rv) { *rv = CreateGraphCodegenMod(); }); } // namespace backend diff --git a/src/relay/backend/graph_plan_memory.cc b/src/relay/backend/graph_plan_memory.cc index 26bc77aa6ec53..4260f052d2c01 100644 --- a/src/relay/backend/graph_plan_memory.cc +++ b/src/relay/backend/graph_plan_memory.cc @@ -20,7 +20,7 @@ /*! * \file relay/backend/graph_plan_memory.cc * \brief Memory index assignment pass for executing - * the program in the graph runtime. + * the program in the graph executor. */ #include #include diff --git a/src/relay/transforms/partition_graph.cc b/src/relay/transforms/partition_graph.cc index 404c7efb10b08..94891c3c98ea7 100644 --- a/src/relay/transforms/partition_graph.cc +++ b/src/relay/transforms/partition_graph.cc @@ -428,7 +428,7 @@ IRModule RemoveDefaultAnnotations(IRModule module) { * could be a tuple output. Such tuple outputs needs to be flattened * otherwise the function would create tuples of tuples. Moreover, tuple * of tuples are valid relay, however they are not currently supported by - * graph runtime or relay VM. + * graph executor or relay VM. 
*/ // New annotations would be required to be added for each flattened output diff --git a/src/runtime/contrib/arm_compute_lib/acl_runtime.cc b/src/runtime/contrib/arm_compute_lib/acl_runtime.cc index ed8f6adbd0831..6562d1bfc62dc 100644 --- a/src/runtime/contrib/arm_compute_lib/acl_runtime.cc +++ b/src/runtime/contrib/arm_compute_lib/acl_runtime.cc @@ -28,7 +28,7 @@ #include "../json/json_node.h" #include "../json/json_runtime.h" -#ifdef TVM_GRAPH_RUNTIME_ARM_COMPUTE_LIB +#ifdef TVM_GRAPH_EXECUTOR_ARM_COMPUTE_LIB #include #include #include @@ -82,7 +82,7 @@ class ACLRuntime : public JSONRuntimeBase { BuildEngine(); } -#ifdef TVM_GRAPH_RUNTIME_ARM_COMPUTE_LIB +#ifdef TVM_GRAPH_EXECUTOR_ARM_COMPUTE_LIB /*! * \brief Unpack inputs and outputs and run inference on a given layer. * @@ -518,12 +518,12 @@ class ACLRuntime : public JSONRuntimeBase { #else void Run() override { LOG(FATAL) << "Cannot call run on Arm Compute Library module without runtime enabled. " - << "Please build with USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME."; + << "Please build with USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR."; } void BuildEngine() { LOG(WARNING) << "Arm Compute Library engine is not initialized. " - << "Please build with USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME."; + << "Please build with USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR."; } #endif }; diff --git a/src/runtime/contrib/tensorrt/tensorrt_runtime.cc b/src/runtime/contrib/tensorrt/tensorrt_runtime.cc index e6eb28c10af69..21031c67863fb 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_runtime.cc +++ b/src/runtime/contrib/tensorrt/tensorrt_runtime.cc @@ -32,7 +32,7 @@ #include "../json/json_node.h" #include "../json/json_runtime.h" -#ifdef TVM_GRAPH_RUNTIME_TENSORRT +#ifdef TVM_GRAPH_EXECUTOR_TENSORRT #include "NvInfer.h" #include "tensorrt_builder.h" #endif @@ -108,7 +108,7 @@ class TensorRTRuntime : public JSONRuntimeBase { } } -#ifdef TVM_GRAPH_RUNTIME_TENSORRT +#ifdef TVM_GRAPH_EXECUTOR_TENSORRT /*! \brief Destroy engines and contexts. */ ~TensorRTRuntime() { for (auto& it : trt_engine_cache_) { diff --git a/src/runtime/crt/Makefile b/src/runtime/crt/Makefile index d707d0c63b810..8d3acab1858bf 100644 --- a/src/runtime/crt/Makefile +++ b/src/runtime/crt/Makefile @@ -67,8 +67,8 @@ endef LIBS = \ src/runtime/crt/common \ - src/runtime/crt/graph_runtime \ - src/runtime/crt/graph_runtime_module \ + src/runtime/crt/graph_executor \ + src/runtime/crt/graph_executor_module \ src/runtime/crt/memory \ src/runtime/crt/utvm_rpc_common \ src/runtime/crt/utvm_rpc_server diff --git a/src/runtime/crt/common/crt_runtime_api.c b/src/runtime/crt/common/crt_runtime_api.c index 6634cfa7db89c..e7fa7bcb5d5e2 100644 --- a/src/runtime/crt/common/crt_runtime_api.c +++ b/src/runtime/crt/common/crt_runtime_api.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/runtime/crt/graph_runtime/graph_runtime.c b/src/runtime/crt/graph_executor/graph_executor.c similarity index 76% rename from src/runtime/crt/graph_runtime/graph_runtime.c rename to src/runtime/crt/graph_executor/graph_executor.c index f0a1eb2da870b..2fe9e73aeddcd 100644 --- a/src/runtime/crt/graph_runtime/graph_runtime.c +++ b/src/runtime/crt/graph_executor/graph_executor.c @@ -20,12 +20,12 @@ // LINT_C_FILE /*! 
- * \file graph_runtime.c - * \brief implement graph runtime in pure C + * \file graph_executor.c + * \brief implement graph executor in pure C */ #include -#include +#include #include #include #include @@ -49,7 +49,7 @@ uint32_t Shape_Accumulate(int64_t* shape, uint32_t ndim) { return accum; } -int NodeEntry_Load(TVMGraphRuntimeNodeEntry* entry, JSONReader* reader) { +int NodeEntry_Load(TVMGraphExecutorNodeEntry* entry, JSONReader* reader) { int status = 0; reader->BeginArray(reader); if (!(reader->NextArrayItem(reader))) { @@ -74,8 +74,8 @@ int NodeEntry_Load(TVMGraphRuntimeNodeEntry* entry, JSONReader* reader) { return status; } -void TVMGraphRuntimeNode_LoadAttrs(TVMGraphRuntimeNode* node, JSONReader* reader, - TVMOpParam* param) { +void TVMGraphExecutorNode_LoadAttrs(TVMGraphExecutorNode* node, JSONReader* reader, + TVMOpParam* param) { int bitmask = 0; char key[20], value[120]; memset(param, 0, sizeof(TVMOpParam)); @@ -109,7 +109,7 @@ void TVMGraphRuntimeNode_LoadAttrs(TVMGraphRuntimeNode* node, JSONReader* reader } } -int TVMGraphRuntimeNode_Load(TVMGraphRuntimeNode* node, JSONReader* reader) { +int TVMGraphExecutorNode_Load(TVMGraphExecutorNode* node, JSONReader* reader) { int status = 0; reader->BeginObject(reader); int bitmask = 0; @@ -138,8 +138,8 @@ int TVMGraphRuntimeNode_Load(TVMGraphRuntimeNode* node, JSONReader* reader) { break; } DLDevice dev = {kDLCPU, 0}; - tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(TVMGraphRuntimeNodeEntry) * num_inputs, - dev, (void**)&node->inputs); + tvm_crt_error_t err = TVMPlatformMemoryAllocate( + sizeof(TVMGraphExecutorNodeEntry) * num_inputs, dev, (void**)&node->inputs); if (err != kTvmErrorNoError) { fprintf(stderr, "memory allocate error: %08x", err); return -1; @@ -150,7 +150,7 @@ int TVMGraphRuntimeNode_Load(TVMGraphRuntimeNode* node, JSONReader* reader) { return -1; } - TVMGraphRuntimeNodeEntry* inputs = node->inputs + count; + TVMGraphExecutorNodeEntry* inputs = node->inputs + count; reader->BeginArray(reader); if (!reader->NextArrayItem(reader)) { fprintf(stderr, "invalid json format\n"); @@ -181,7 +181,7 @@ int TVMGraphRuntimeNode_Load(TVMGraphRuntimeNode* node, JSONReader* reader) { } else if (!strcmp(key, "attr") || !strcmp(key, "attrs")) { TVMOpParam param; - TVMGraphRuntimeNode_LoadAttrs(node, reader, ¶m); + TVMGraphExecutorNode_LoadAttrs(node, reader, ¶m); memcpy(&node->param, ¶m, sizeof(param)); } else if (!strcmp(key, "control_deps")) { fprintf(stderr, "do not support key %s", key); @@ -201,15 +201,15 @@ int TVMGraphRuntimeNode_Load(TVMGraphRuntimeNode* node, JSONReader* reader) { return status; } -TVMGraphRuntimeNode TVMGraphRuntimeNodeCreate() { - TVMGraphRuntimeNode node; - memset(&node, 0, sizeof(TVMGraphRuntimeNode)); - node.LoadAttrs = TVMGraphRuntimeNode_LoadAttrs; - node.Load = TVMGraphRuntimeNode_Load; +TVMGraphExecutorNode TVMGraphExecutorNodeCreate() { + TVMGraphExecutorNode node; + memset(&node, 0, sizeof(TVMGraphExecutorNode)); + node.LoadAttrs = TVMGraphExecutorNode_LoadAttrs; + node.Load = TVMGraphExecutorNode_Load; return node; } -int TVMGraphRuntimeNodeRelease(TVMGraphRuntimeNode* node) { +int TVMGraphExecutorNodeRelease(TVMGraphExecutorNode* node) { if (!node) { return 0; } @@ -225,7 +225,7 @@ int TVMGraphRuntimeNodeRelease(TVMGraphRuntimeNode* node) { return 0; } -int TVMGraphRuntimeGraphAttr_Load(TVMGraphRuntimeGraphAttr* attr, JSONReader* reader) { +int TVMGraphExecutorGraphAttr_Load(TVMGraphExecutorGraphAttr* attr, JSONReader* reader) { int status = 0; int bitmask = 0; char key[16], type[16]; @@ 
-520,7 +520,7 @@ int TVMGraphRuntimeGraphAttr_Load(TVMGraphRuntimeGraphAttr* attr, JSONReader* re return status; } -int TVMGraphRuntimeGraphAttr_Release(TVMGraphRuntimeGraphAttr* attr) { +int TVMGraphExecutorGraphAttr_Release(TVMGraphExecutorGraphAttr* attr) { if (!attr) { return 0; } @@ -568,7 +568,7 @@ int TVMGraphRuntimeGraphAttr_Release(TVMGraphRuntimeGraphAttr* attr) { return 0; } -int TVMGraphRuntime_Load(TVMGraphRuntime* runtime, JSONReader* reader) { +int TVMGraphExecutor_Load(TVMGraphExecutor* executor, JSONReader* reader) { int status = 0; reader->BeginObject(reader); int bitmask = 0; @@ -583,30 +583,30 @@ int TVMGraphRuntime_Load(TVMGraphRuntime* runtime, JSONReader* reader) { break; } DLDevice dev = {kDLCPU, 0}; - tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(TVMGraphRuntimeNode) * num_items, dev, - (void**)&runtime->nodes); + tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(TVMGraphExecutorNode) * num_items, dev, + (void**)&executor->nodes); if (err != kTvmErrorNoError) { fprintf(stderr, "memory allocate error: %08x", err); status = -1; break; } while (reader->NextArrayItem(reader)) { - if (runtime->nodes_count == num_items) { + if (executor->nodes_count == num_items) { fprintf(stderr, "array too big\n"); status = -1; return status; } - TVMGraphRuntimeNode* node = runtime->nodes + runtime->nodes_count; - status = TVMGraphRuntimeNode_Load(node, reader); + TVMGraphExecutorNode* node = executor->nodes + executor->nodes_count; + status = TVMGraphExecutorNode_Load(node, reader); if (status != 0) { - fprintf(stderr, "failed to load an element in `nodes` field in graph runtime node.\n"); + fprintf(stderr, "failed to load an element in `nodes` field in graph executor node.\n"); break; #if TVM_CRT_DEBUG } else { - printf("loading: node (%u) %s loaded.\n", runtime->nodes_count, node->name); + printf("loading: node (%u) %s loaded.\n", executor->nodes_count, node->name); #endif // TVM_CRT_DEBUG } - runtime->nodes_count++; + executor->nodes_count++; } bitmask |= 1; } else if (!strcmp(key, "arg_nodes")) { @@ -619,21 +619,22 @@ int TVMGraphRuntime_Load(TVMGraphRuntime* runtime, JSONReader* reader) { } DLDevice dev = {kDLCPU, 0}; tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(uint32_t) * num_items, dev, - (void**)&runtime->input_nodes); + (void**)&executor->input_nodes); + if (err != kTvmErrorNoError) { fprintf(stderr, "memory allocate error: %08x", err); status = -1; break; } while (reader->NextArrayItem(reader)) { - if (runtime->input_nodes_count == num_items) { + if (executor->input_nodes_count == num_items) { fprintf(stderr, "array too big\n"); status = -1; return status; } - uint32_t* node = runtime->input_nodes + runtime->input_nodes_count; + uint32_t* node = executor->input_nodes + executor->input_nodes_count; reader->ReadUnsignedInteger(reader, node); - runtime->input_nodes_count++; + executor->input_nodes_count++; } bitmask |= 2; } else if (!strcmp(key, "node_row_ptr")) { @@ -646,22 +647,22 @@ int TVMGraphRuntime_Load(TVMGraphRuntime* runtime, JSONReader* reader) { } DLDevice dev = {kDLCPU, 0}; tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(uint32_t) * num_items, dev, - (void**)&runtime->node_row_ptr); + (void**)&executor->node_row_ptr); if (err != kTvmErrorNoError) { fprintf(stderr, "memory allocate error: %08x", err); status = -1; break; } while (reader->NextArrayItem(reader)) { - if (runtime->node_row_ptr_count == num_items) { + if (executor->node_row_ptr_count == num_items) { fprintf(stderr, "array too big\n"); status = -1; return status; } - 
uint32_t count = runtime->node_row_ptr_count; - uint32_t* node = runtime->node_row_ptr + count; + uint32_t count = executor->node_row_ptr_count; + uint32_t* node = executor->node_row_ptr + count; reader->ReadUnsignedInteger(reader, node); - runtime->node_row_ptr_count++; + executor->node_row_ptr_count++; } bitmask |= 4; } else if (!strcmp(key, "heads")) { @@ -673,32 +674,32 @@ int TVMGraphRuntime_Load(TVMGraphRuntime* runtime, JSONReader* reader) { break; } DLDevice dev = {kDLCPU, 0}; - tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(TVMGraphRuntimeNodeEntry) * num_items, - dev, (void**)&runtime->outputs); + tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(TVMGraphExecutorNodeEntry) * num_items, + dev, (void**)&executor->outputs); if (err != kTvmErrorNoError) { fprintf(stderr, "memory allocate error: %08x", err); status = -1; break; } while (reader->NextArrayItem(reader)) { - if (runtime->outputs_count == num_items) { + if (executor->outputs_count == num_items) { fprintf(stderr, "array too big\n"); status = -1; return status; } - TVMGraphRuntimeNodeEntry* entry = runtime->outputs + runtime->outputs_count; + TVMGraphExecutorNodeEntry* entry = executor->outputs + executor->outputs_count; status = NodeEntry_Load(entry, reader); if (status != 0) { - fprintf(stderr, "Fail to load an element in `heads` field in graph runtime node.\n"); + fprintf(stderr, "Fail to load an element in `heads` field in graph executor node.\n"); break; } - runtime->outputs_count++; + executor->outputs_count++; } bitmask |= 8; } else if (!strcmp(key, "attrs")) { - status = TVMGraphRuntimeGraphAttr_Load(&(runtime->attrs), reader); + status = TVMGraphExecutorGraphAttr_Load(&(executor->attrs), reader); if (status != 0) { - fprintf(stderr, "Fail to load an element in `heads` field in graph runtime node.\n"); + fprintf(stderr, "Fail to load an element in `heads` field in graph executor node.\n"); break; } bitmask |= 16; @@ -719,29 +720,31 @@ int TVMGraphRuntime_Load(TVMGraphRuntime* runtime, JSONReader* reader) { return status; } -uint32_t TVMGraphRuntime_GetEntryId(TVMGraphRuntime* runtime, uint32_t nid, uint32_t index) { - return runtime->node_row_ptr[nid] + index; +uint32_t TVMGraphExecutor_GetEntryId(TVMGraphExecutor* executor, uint32_t nid, uint32_t index) { + return executor->node_row_ptr[nid] + index; } /*! * \brief Get the number of input tensors allocated. - * \param runtime The graph runtime. + * \param executor The graph executor. * \return the number of input tensors allocated. */ -int TVMGraphRuntime_GetNumInputs(TVMGraphRuntime* runtime) { return runtime->input_nodes_count; } +int TVMGraphExecutor_GetNumInputs(TVMGraphExecutor* executor) { + return executor->input_nodes_count; +} /*! * \brief Get the input index given the name of input. - * \param runtime The graph runtime. + * \param executor The graph executor. * \param name The name of the input. * \return The index of input. */ -int TVMGraphRuntime_GetInputIndex(TVMGraphRuntime* runtime, const char* name) { +int TVMGraphExecutor_GetInputIndex(TVMGraphExecutor* executor, const char* name) { uint32_t i; int32_t rv = -1; - for (i = 0; i < runtime->input_nodes_count; ++i) { - uint32_t nid = runtime->input_nodes[i]; - if (!strcmp(runtime->nodes[nid].name, name)) { + for (i = 0; i < executor->input_nodes_count; ++i) { + uint32_t nid = executor->input_nodes[i]; + if (!strcmp(executor->nodes[nid].name, name)) { rv = i; break; } @@ -752,28 +755,28 @@ int TVMGraphRuntime_GetInputIndex(TVMGraphRuntime* runtime, const char* name) { /*! 
* \brief set input to the graph based on name. - * \param runtime The graph runtime. + * \param executor The graph executor. * \param name The name of the input. * \param data_in The input data. */ -void TVMGraphRuntime_SetInput(TVMGraphRuntime* runtime, const char* name, DLTensor* data_in) { - uint32_t index = TVMGraphRuntime_GetInputIndex(runtime, name); - if (index >= runtime->input_nodes_count) { +void TVMGraphExecutor_SetInput(TVMGraphExecutor* executor, const char* name, DLTensor* data_in) { + uint32_t index = TVMGraphExecutor_GetInputIndex(executor, name); + if (index >= executor->input_nodes_count) { fprintf(stderr, "given index is greater than num of input nodes.\n"); } - uint32_t eid = TVMGraphRuntime_GetEntryId(runtime, runtime->input_nodes[index], 0); - runtime->data_entry[eid].dl_tensor.data = data_in->data; + uint32_t eid = TVMGraphExecutor_GetEntryId(executor, executor->input_nodes[index], 0); + executor->data_entry[eid].dl_tensor.data = data_in->data; } /*! * \brief Load parameters from parameter blob. - * \param runtime The graph runtime. + * \param executor The graph executor. * \param param_blob A binary blob of parameter. * \param param_size The parameter size. * \return The result of this function execution. */ -int TVMGraphRuntime_LoadParams(TVMGraphRuntime* runtime, const char* param_blob, - const uint32_t param_size) { +int TVMGraphExecutor_LoadParams(TVMGraphExecutor* executor, const char* param_blob, + const uint32_t param_size) { int status = 0; const char* bptr = param_blob; uint64_t header, reserved; @@ -790,13 +793,13 @@ int TVMGraphRuntime_LoadParams(TVMGraphRuntime* runtime, const char* param_blob, char* names = NULL; DLDevice dev = {kDLCPU, 0}; tvm_crt_error_t err = - TVMPlatformMemoryAllocate(TVM_CRT_STRLEN_NAME * runtime->nodes_count, dev, (void**)&names); + TVMPlatformMemoryAllocate(TVM_CRT_STRLEN_NAME * executor->nodes_count, dev, (void**)&names); if (err != kTvmErrorNoError) { fprintf(stderr, "memory allocate error: %08x", err); status = -1; return status; } - memset(names, 0, TVM_CRT_STRLEN_NAME * runtime->nodes_count); + memset(names, 0, TVM_CRT_STRLEN_NAME * executor->nodes_count); uint64_t names_count; int idx; memcpy(&names_count, bptr, sizeof(names_count)); @@ -824,33 +827,33 @@ int TVMGraphRuntime_LoadParams(TVMGraphRuntime* runtime, const char* param_blob, } for (idx = 0; idx < size; idx++) { - int32_t in_idx = TVMGraphRuntime_GetInputIndex(runtime, names + TVM_CRT_STRLEN_NAME * idx); + int32_t in_idx = TVMGraphExecutor_GetInputIndex(executor, names + TVM_CRT_STRLEN_NAME * idx); CHECK_GT(in_idx, 0, "Found param for non-existent input: %s\n", names + TVM_CRT_STRLEN_NAME * idx); - uint32_t eid = TVMGraphRuntime_GetEntryId(runtime, runtime->input_nodes[in_idx], 0); - if (!(eid < runtime->data_entry_count)) { + uint32_t eid = TVMGraphExecutor_GetEntryId(executor, executor->input_nodes[in_idx], 0); + if (!(eid < executor->data_entry_count)) { fprintf(stderr, "`entry_id`=%d is greater than expected(%d).\n", eid, - runtime->data_entry_count); + executor->data_entry_count); status = -1; } - if (runtime->data_entry[eid].dl_tensor.shape) { - err = TVMPlatformMemoryFree(runtime->data_entry[eid].dl_tensor.shape, dev); + if (executor->data_entry[eid].dl_tensor.shape) { + err = TVMPlatformMemoryFree(executor->data_entry[eid].dl_tensor.shape, dev); if (err != kTvmErrorNoError) { status = -1; } - runtime->data_entry[eid].dl_tensor.shape = 0; + executor->data_entry[eid].dl_tensor.shape = 0; } - if (runtime->data_entry[eid].dl_tensor.data) { - err = 
TVMPlatformMemoryFree(runtime->data_entry[eid].dl_tensor.data, dev); + if (executor->data_entry[eid].dl_tensor.data) { + err = TVMPlatformMemoryFree(executor->data_entry[eid].dl_tensor.data, dev); if (err != kTvmErrorNoError) { status = -1; } - runtime->data_entry[eid].dl_tensor.data = 0; + executor->data_entry[eid].dl_tensor.data = 0; } - status |= TVMNDArray_Load(&(runtime->data_entry[eid]), &bptr); + status |= TVMNDArray_Load(&(executor->data_entry[eid]), &bptr); #if TVM_CRT_DEBUG - TVMNDArray* entry = &(runtime->data_entry[eid]); + TVMNDArray* entry = &(executor->data_entry[eid]); printf("loading: param %s loaded, in_idx=%d, eid=%d, ndim=%d, data[0]=%f\n", names + TVM_CRT_STRLEN_NAME * idx, in_idx, eid, entry->dl_tensor.ndim, ((float*)entry->dl_tensor.data)[0]); // NOLINT(*) @@ -869,38 +872,38 @@ int TVMGraphRuntime_LoadParams(TVMGraphRuntime* runtime, const char* param_blob, /*! * \brief Run all the operations one by one. - * \param runtime The graph runtime. + * \param executor The graph executor. */ -void TVMGraphRuntime_Run(TVMGraphRuntime* runtime) { +void TVMGraphExecutor_Run(TVMGraphExecutor* executor) { // setup the array and requirements. uint32_t idx; - for (idx = 0; idx < runtime->op_execs_count; ++idx) { - if (runtime->op_execs[idx].fexec) { + for (idx = 0; idx < executor->op_execs_count; ++idx) { + if (executor->op_execs[idx].fexec) { #if TVM_CRT_DEBUG - printf("calling: %s (%d)\n", runtime->op_execs[idx].name, idx); + printf("calling: %s (%d)\n", executor->op_execs[idx].name, idx); #endif // TVM_CRT_DEBUG - runtime->op_execs[idx].Call(&(runtime->op_execs[idx])); + executor->op_execs[idx].Call(&(executor->op_execs[idx])); } } } /*! * \brief Get the number of output tensors allocated. - * \param runtime The graph runtime. + * \param executor The graph executor. * \return the number of output tensors allocated. 
*/ -int TVMGraphRuntime_GetNumOutputs(TVMGraphRuntime* runtime) { return runtime->outputs_count; } +int TVMGraphExecutor_GetNumOutputs(TVMGraphExecutor* executor) { return executor->outputs_count; } -int TVMGraphRuntime_GetOutput(TVMGraphRuntime* runtime, const int32_t idx, DLTensor* out) { +int TVMGraphExecutor_GetOutput(TVMGraphExecutor* executor, const int32_t idx, DLTensor* out) { int status = 0; - uint32_t nid = runtime->outputs[idx].node_id; - uint32_t index = runtime->outputs[idx].index; - uint32_t eid = TVMGraphRuntime_GetEntryId(runtime, nid, index); + uint32_t nid = executor->outputs[idx].node_id; + uint32_t index = executor->outputs[idx].index; + uint32_t eid = TVMGraphExecutor_GetEntryId(executor, nid, index); // copy data section to allocated output tensor int32_t elem_bytes = out->dtype.bits / 8; int64_t size = Shape_Accumulate(out->shape, out->ndim); - DLTensor* tensor = &(runtime->data_entry[eid].dl_tensor); + DLTensor* tensor = &(executor->data_entry[eid].dl_tensor); CHECK(out->ndim == tensor->ndim); CHECK(out->dtype.bits == tensor->dtype.bits); CHECK(Shape_Accumulate(out->shape, out->ndim) == Shape_Accumulate(tensor->shape, tensor->ndim)); @@ -908,7 +911,7 @@ int TVMGraphRuntime_GetOutput(TVMGraphRuntime* runtime, const int32_t idx, DLTen return status; } -int TVMGraphRuntime_SetupStorage(TVMGraphRuntime* runtime) { +int TVMGraphExecutor_SetupStorage(TVMGraphExecutor* executor) { TVMPackedFunc lookup_linked_param; int lookup_linked_param_valid; uint32_t idx; @@ -919,12 +922,12 @@ int TVMGraphRuntime_SetupStorage(TVMGraphRuntime* runtime) { temp_args.tcodes[0] = kTVMArgInt; temp_args.values_count = 1; lookup_linked_param_valid = - (TVMPackedFunc_InitModuleFunc(&lookup_linked_param, runtime->module_handle, + (TVMPackedFunc_InitModuleFunc(&lookup_linked_param, executor->module_handle, "_lookup_linked_param", &temp_args) == 0); } // Grab saved optimization plan from graph. - TVMGraphRuntimeGraphAttr* attrs = &(runtime->attrs); + TVMGraphExecutorGraphAttr* attrs = &(executor->attrs); DLDataType* vtype = NULL; DLDevice alloc_dev = {kDLCPU, 0}; tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(DLDataType) * attrs->dltype_count, @@ -938,20 +941,20 @@ int TVMGraphRuntime_SetupStorage(TVMGraphRuntime* runtime) { } // Size and device type of each storage pool entry. - TVMGraphRuntimePoolEntry* pool_entry = NULL; - err = TVMPlatformMemoryAllocate(sizeof(TVMGraphRuntimePoolEntry) * runtime->nodes_count, + TVMGraphExecutorPoolEntry* pool_entry = NULL; + err = TVMPlatformMemoryAllocate(sizeof(TVMGraphExecutorPoolEntry) * executor->nodes_count, alloc_dev, (void**)&pool_entry); if (err != kTvmErrorNoError) { fprintf(stderr, "memory allocate error: %08x", err); return -1; } - memset(pool_entry, 0, sizeof(TVMGraphRuntimePoolEntry) * runtime->nodes_count); + memset(pool_entry, 0, sizeof(TVMGraphExecutorPoolEntry) * executor->nodes_count); uint32_t pool_entry_count = 0; // Find the maximum space size. for (idx = 0; idx < attrs->shape_count; idx++) { int storage_id = attrs->storage_id[idx]; // Use the fallback device if no device index is available. - int device_type = runtime->devices[0].device_type; + int device_type = executor->devices[0].device_type; uint32_t size = Shape_Accumulate(attrs->shape + idx * TVM_CRT_MAX_NDIM, attrs->ndim[idx]); DLDataType t = vtype[idx]; uint32_t bits = t.bits * t.lanes; @@ -967,15 +970,15 @@ int TVMGraphRuntime_SetupStorage(TVMGraphRuntime* runtime) { } // Allocate the space. 
-  err = TVMPlatformMemoryAllocate(sizeof(TVMGraphRuntimeStorageEntry) * pool_entry_count, alloc_dev,
-                                  (void**)&runtime->storage_pool);
+  err = TVMPlatformMemoryAllocate(sizeof(TVMGraphExecutorStorageEntry) * pool_entry_count,
+                                  alloc_dev, (void**)&executor->storage_pool);
   if (err != kTvmErrorNoError) {
     fprintf(stderr, "memory allocate error: %08x", err);
     return -1;
   }
   for (idx = 0; idx < pool_entry_count; idx++) {
-    TVMGraphRuntimePoolEntry pit = pool_entry[idx];
-    DLDevice dev = runtime->devices[0];
+    TVMGraphExecutorPoolEntry pit = pool_entry[idx];
+    DLDevice dev = executor->devices[0];
     uint8_t did_find_linked_param = 0;
     if (lookup_linked_param_valid) {
       lookup_linked_param.args.values[0].v_int64 = idx;
@@ -983,8 +986,8 @@ int TVMGraphRuntime_SetupStorage(TVMGraphRuntime* runtime) {
       void* linked_param_data = lookup_linked_param.ret_value.values[0].v_handle;
       if (linked_param_data != NULL) {
-        runtime->storage_pool[runtime->storage_pool_count].is_linked_param = 1;
-        DLTensor* tensor = &runtime->storage_pool[runtime->storage_pool_count].array.dl_tensor;
+        executor->storage_pool[executor->storage_pool_count].is_linked_param = 1;
+        DLTensor* tensor = &executor->storage_pool[executor->storage_pool_count].array.dl_tensor;
         tensor->data = linked_param_data;
         tensor->device = dev;
         tensor->ndim = attrs->ndim[pit.entry_id];
@@ -1001,28 +1004,28 @@ int TVMGraphRuntime_SetupStorage(TVMGraphRuntime* runtime) {
       };
       shape[0] = (pit.size + 3) / 4;
       int status = TVMNDArray_Empty(1, shape, dtype, dev,
-                                    &runtime->storage_pool[runtime->storage_pool_count].array);
+                                    &executor->storage_pool[executor->storage_pool_count].array);
       CHECK_EQ(status, 0, "fail to create storage_pool with idx=%d\n", idx);
     }
-    runtime->storage_pool_count++;
+    executor->storage_pool_count++;
   }
 
   // Assign the pooled entries. A unified memory pool is used to simplify
  // memory assignment for each node entry. The allocated memory on each device
  // is mapped to this pool.
- runtime->data_entry_count = runtime->node_row_ptr[runtime->node_row_ptr_count - 1]; - err = TVMPlatformMemoryAllocate(sizeof(TVMNDArray) * runtime->data_entry_count, alloc_dev, - (void**)&runtime->data_entry); + executor->data_entry_count = executor->node_row_ptr[executor->node_row_ptr_count - 1]; + err = TVMPlatformMemoryAllocate(sizeof(TVMNDArray) * executor->data_entry_count, alloc_dev, + (void**)&executor->data_entry); if (err != kTvmErrorNoError) { fprintf(stderr, "memory allocate error: %08x", err); return -1; } - for (idx = 0; idx < runtime->data_entry_count; ++idx) { + for (idx = 0; idx < executor->data_entry_count; ++idx) { uint32_t storage_id = attrs->storage_id[idx]; - CHECK(storage_id < runtime->storage_pool_count); - int status = TVMNDArray_CreateView(&(runtime->storage_pool[storage_id].array), + CHECK(storage_id < executor->storage_pool_count); + int status = TVMNDArray_CreateView(&(executor->storage_pool[storage_id].array), attrs->shape + idx * TVM_CRT_MAX_NDIM, attrs->ndim[idx], - vtype[idx], &runtime->data_entry[idx]); + vtype[idx], &executor->data_entry[idx]); CHECK_EQ(status, 0, "fail to create for node with idx=%d, storage_id=%u\n", idx, storage_id); } @@ -1042,32 +1045,32 @@ int TVMGraphRuntime_SetupStorage(TVMGraphRuntime* runtime) { return 0; } -int TVMGraphRuntime_SetupOpExecs(TVMGraphRuntime* runtime) { +int TVMGraphExecutor_SetupOpExecs(TVMGraphExecutor* executor) { int status = 0; uint32_t nid, idx; - runtime->op_execs_count = runtime->nodes_count; + executor->op_execs_count = executor->nodes_count; DLDevice dev = {kDLCPU, 0}; - tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(TVMPackedFunc) * runtime->op_execs_count, - dev, (void**)&runtime->op_execs); + tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(TVMPackedFunc) * executor->op_execs_count, + dev, (void**)&executor->op_execs); if (err != kTvmErrorNoError) { fprintf(stderr, "memory allocate error: %08x", err); status = -1; return status; } - for (nid = 0; nid < runtime->nodes_count; nid++) { - const TVMGraphRuntimeNode* inode = runtime->nodes + nid; + for (nid = 0; nid < executor->nodes_count; nid++) { + const TVMGraphExecutorNode* inode = executor->nodes + nid; if (strcmp(inode->op_type, "null")) { DLTensorPtr args[TVM_CRT_MAX_ARGS]; uint32_t args_count = 0; for (idx = 0; idx < inode->inputs_count; idx++) { - const TVMGraphRuntimeNodeEntry* entry = inode->inputs + idx; - uint32_t eid = TVMGraphRuntime_GetEntryId(runtime, entry->node_id, entry->index); - args[idx] = &(runtime->data_entry[eid].dl_tensor); + const TVMGraphExecutorNodeEntry* entry = inode->inputs + idx; + uint32_t eid = TVMGraphExecutor_GetEntryId(executor, entry->node_id, entry->index); + args[idx] = &(executor->data_entry[eid].dl_tensor); args_count++; } for (idx = 0; idx < inode->param.num_outputs; idx++) { - uint32_t eid = TVMGraphRuntime_GetEntryId(runtime, nid, idx); - args[args_count] = &(runtime->data_entry[eid].dl_tensor); + uint32_t eid = TVMGraphExecutor_GetEntryId(executor, nid, idx); + args[args_count] = &(executor->data_entry[eid].dl_tensor); args_count++; } if (strcmp(inode->op_type, "tvm_op")) { @@ -1085,9 +1088,9 @@ int TVMGraphRuntime_SetupOpExecs(TVMGraphRuntime* runtime) { printf("tvm_op: creating %s with node_id=%d\n", inode->param.func_name, nid); #endif // TVM_CRT_DEBUG TVMPackedFunc pf; - TVMGraphRuntime_CreateTVMOp(runtime, &(inode->param), args, args_count, inode->inputs_count, - &pf); - runtime->op_execs[nid] = pf; + TVMGraphExecutor_CreateTVMOp(executor, &(inode->param), args, args_count, 
inode->inputs_count,
+ &pf);
+ executor->op_execs[nid] = pf;
 }
 }
 return status;
}
@@ -1104,9 +1107,9 @@ typedef struct TVMOpArgs {
 uint32_t shape_data_count;
} TVMOpArgs;
-int32_t TVMGraphRuntime_CreateTVMOp(TVMGraphRuntime* runtime, const TVMOpParam* param,
- DLTensorPtr* args, const uint32_t args_count,
- uint32_t num_inputs, TVMPackedFunc* pf) {
+int32_t TVMGraphExecutor_CreateTVMOp(TVMGraphExecutor* executor, const TVMOpParam* param,
+ DLTensorPtr* args, const uint32_t args_count,
+ uint32_t num_inputs, TVMPackedFunc* pf) {
 int status = 0;
 uint32_t idx;
 TVMOpArgs arg_ptr;
@@ -1137,7 +1140,7 @@ int32_t TVMGraphRuntime_CreateTVMOp(TVMGraphRuntime* runtime, const TVMOpParam*
 }
 TVMArgs targs = TVMArgs_Create(arg_ptr.arg_values, arg_ptr.arg_tcodes, arg_ptr.arg_values_count);
- status = TVMPackedFunc_InitModuleFunc(pf, runtime->module_handle, param->func_name, &targs);
+ status = TVMPackedFunc_InitModuleFunc(pf, executor->module_handle, param->func_name, &targs);
 return status;
}
@@ -1151,28 +1154,28 @@ int32_t TVMGraphRuntime_CreateTVMOp(TVMGraphRuntime* runtime, const TVMOpParam*
 * executed on.
 * \return 0 on success.
 */
-int TVMGraphRuntime_Init(TVMGraphRuntime* runtime, const char* graph_json,
- TVMModuleHandle module_handle, const DLDevice* devs) {
+int TVMGraphExecutor_Init(TVMGraphExecutor* executor, const char* graph_json,
+ TVMModuleHandle module_handle, const DLDevice* devs) {
 JSONReader reader;
 tvm_crt_error_t err = JSONReader_Create(graph_json, &reader);
 if (err != kTvmErrorNoError) {
 return -1;
 }
- TVMGraphRuntime_Load(runtime, &reader);
+ TVMGraphExecutor_Load(executor, &reader);
 err = JSONReader_Release(&reader);
 if (err != kTvmErrorNoError) {
 return -1;
 }
- runtime->module_handle = module_handle;
- runtime->devices[0] = devs[0];
+ executor->module_handle = module_handle;
+ executor->devices[0] = devs[0];
 int status;
- status = TVMGraphRuntime_SetupStorage(runtime);
+ status = TVMGraphExecutor_SetupStorage(executor);
 if (status != 0) {
 return status;
 }
- status = TVMGraphRuntime_SetupOpExecs(runtime);
+ status = TVMGraphExecutor_SetupOpExecs(executor);
 if (status != 0) {
 return status;
 }
@@ -1184,74 +1187,74 @@ int TVMGraphRuntime_Init(TVMGraphRuntime* runtime, const char* graph_json,
 return status;
}
-int TVMGraphRuntime_Create(const char* sym_json, TVMModuleHandle module_handle,
- const DLDevice* devs, TVMGraphRuntime** runtime) {
+int TVMGraphExecutor_Create(const char* sym_json, TVMModuleHandle module_handle,
+ const DLDevice* devs, TVMGraphExecutor** executor) {
 DLDevice dev = {kDLCPU, 0};
- tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(TVMGraphRuntime), dev, (void**)runtime);
+ tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(TVMGraphExecutor), dev, (void**)executor);
 if (err != kTvmErrorNoError) {
 fprintf(stderr, "memory allocate error: %08x", err);
 return -1;
 }
- memset(*runtime, 0, sizeof(TVMGraphRuntime));
+ memset(*executor, 0, sizeof(TVMGraphExecutor));
 // init
- return TVMGraphRuntime_Init(*runtime, sym_json, module_handle, devs);
+ return TVMGraphExecutor_Init(*executor, sym_json, module_handle, devs);
}
-int TVMGraphRuntime_Release(TVMGraphRuntime** pptr) {
+int TVMGraphExecutor_Release(TVMGraphExecutor** pptr) {
 int status = 0;
 int32_t idx;
- TVMGraphRuntime* runtime = (TVMGraphRuntime*)(*pptr);
- for (idx = 0; idx < runtime->nodes_count; ++idx) {
- status = TVMGraphRuntimeNodeRelease(&(runtime->nodes[idx]));
+ TVMGraphExecutor* executor = (TVMGraphExecutor*)(*pptr);
+ for (idx = 0; idx < executor->nodes_count; ++idx) {
+ status =
TVMGraphExecutorNodeRelease(&(executor->nodes[idx])); if (status != 0) { return status; } } DLDevice dev = {kDLCPU, 0}; - status = TVMPlatformMemoryFree(runtime->nodes, dev); + status = TVMPlatformMemoryFree(executor->nodes, dev); if (status != 0) { return status; } - status = TVMGraphRuntimeGraphAttr_Release(&(runtime->attrs)); + status = TVMGraphExecutorGraphAttr_Release(&(executor->attrs)); if (status != 0) { return status; } - for (idx = 0; idx < runtime->storage_pool_count; ++idx) { - if (runtime->storage_pool[idx].is_linked_param == 0) { - status = TVMNDArray_Release(&(runtime->storage_pool[idx]).array); + for (idx = 0; idx < executor->storage_pool_count; ++idx) { + if (executor->storage_pool[idx].is_linked_param == 0) { + status = TVMNDArray_Release(&(executor->storage_pool[idx]).array); if (status != 0) { return status; } } } - for (idx = 0; idx < runtime->data_entry_count; ++idx) { - status = TVMPlatformMemoryFree(runtime->data_entry[idx].dl_tensor.shape, dev); + for (idx = 0; idx < executor->data_entry_count; ++idx) { + status = TVMPlatformMemoryFree(executor->data_entry[idx].dl_tensor.shape, dev); if (status != 0) { return status; } } - status = TVMPlatformMemoryFree(runtime->input_nodes, dev); + status = TVMPlatformMemoryFree(executor->input_nodes, dev); if (status != 0) { return status; } - status = TVMPlatformMemoryFree(runtime->node_row_ptr, dev); + status = TVMPlatformMemoryFree(executor->node_row_ptr, dev); if (status != 0) { return status; } - status = TVMPlatformMemoryFree(runtime->outputs, dev); + status = TVMPlatformMemoryFree(executor->outputs, dev); if (status != 0) { return status; } - status = TVMPlatformMemoryFree(runtime->storage_pool, dev); + status = TVMPlatformMemoryFree(executor->storage_pool, dev); if (status != 0) { return status; } - status = TVMPlatformMemoryFree(runtime->data_entry, dev); + status = TVMPlatformMemoryFree(executor->data_entry, dev); if (status != 0) { return status; } - status = TVMPlatformMemoryFree(runtime->op_execs, dev); + status = TVMPlatformMemoryFree(executor->op_execs, dev); if (status != 0) { return status; } diff --git a/src/runtime/crt/graph_runtime/load_json.c b/src/runtime/crt/graph_executor/load_json.c similarity index 99% rename from src/runtime/crt/graph_runtime/load_json.c rename to src/runtime/crt/graph_executor/load_json.c index 65dcb3eccb445..dd2faecdc5381 100644 --- a/src/runtime/crt/graph_runtime/load_json.c +++ b/src/runtime/crt/graph_executor/load_json.c @@ -25,7 +25,7 @@ */ #include #include -#include +#include #include #include diff --git a/src/runtime/crt/graph_runtime_module/graph_runtime_module.c b/src/runtime/crt/graph_executor_module/graph_executor_module.c similarity index 51% rename from src/runtime/crt/graph_runtime_module/graph_runtime_module.c rename to src/runtime/crt/graph_executor_module/graph_executor_module.c index 4a61b89528adb..7b2a25040d082 100644 --- a/src/runtime/crt/graph_runtime_module/graph_runtime_module.c +++ b/src/runtime/crt/graph_executor_module/graph_executor_module.c @@ -20,27 +20,27 @@ // LINT_C_FILE /*! - * \file graph_runtime_module.c - * \brief wrap graph_runtime into a TVMModule for use with RPC. + * \file graph_executor_module.c + * \brief wrap graph_executor into a TVMModule for use with RPC. 
*/ #include -#include -#include +#include +#include #include -#include "tvm/runtime/crt/internal/graph_runtime/graph_runtime.h" +#include "tvm/runtime/crt/internal/graph_executor/graph_executor.h" typedef struct { TVMModule mod; - TVMGraphRuntime* runtime; -} GraphRuntimeModule; + TVMGraphExecutor* executor; +} GraphExecutorModule; -static GraphRuntimeModule graph_runtime; +static GraphExecutorModule graph_executor; -int32_t TVMGraphRuntimeModule_Create(TVMValue* args, int* tcodes, int nargs, TVMValue* ret_values, - int* ret_tcodes, void* resource_handle) { - if (graph_runtime.runtime != NULL) { +int32_t TVMGraphExecutorModule_Create(TVMValue* args, int* tcodes, int nargs, TVMValue* ret_values, + int* ret_tcodes, void* resource_handle) { + if (graph_executor.executor != NULL) { return kTvmErrorGraphModuleAlreadyCreated; } @@ -59,16 +59,16 @@ int32_t TVMGraphRuntimeModule_Create(TVMValue* args, int* tcodes, int nargs, TVM DLDevice dev = {(DLDeviceType)args[2].v_int64, (int)args[3].v_int64}; int ret_value = - TVMGraphRuntime_Create(args[0].v_str, args[1].v_handle, &dev, &graph_runtime.runtime); + TVMGraphExecutor_Create(args[0].v_str, args[1].v_handle, &dev, &graph_executor.executor); if (ret_value != 0) { return ret_value; } TVMModuleHandle out; - ret_value = TVMModCreateFromCModule(&graph_runtime.mod, &out); + ret_value = TVMModCreateFromCModule(&graph_executor.mod, &out); if (ret_value != 0) { ret_tcodes[0] = kTVMNullptr; - TVMGraphRuntime_Release(&graph_runtime.runtime); + TVMGraphExecutor_Release(&graph_executor.executor); return ret_value; } @@ -77,8 +77,9 @@ int32_t TVMGraphRuntimeModule_Create(TVMValue* args, int* tcodes, int nargs, TVM return kTvmErrorNoError; } -int32_t TVMGraphRuntimeModule_GetInput(TVMValue* args, int* tcodes, int nargs, TVMValue* ret_values, - int* ret_tcodes, void* resource_handle) { +int32_t TVMGraphExecutorModule_GetInput(TVMValue* args, int* tcodes, int nargs, + TVMValue* ret_values, int* ret_tcodes, + void* resource_handle) { if (nargs != 1) { return kTvmErrorFunctionCallNumArguments; } @@ -87,45 +88,45 @@ int32_t TVMGraphRuntimeModule_GetInput(TVMValue* args, int* tcodes, int nargs, T return kTvmErrorFunctionCallWrongArgType; } - int index = TVMGraphRuntime_GetInputIndex(graph_runtime.runtime, args[0].v_str); + int index = TVMGraphExecutor_GetInputIndex(graph_executor.executor, args[0].v_str); if (index < 0) { return kTvmErrorGraphModuleNoSuchInput; } - uint32_t eid = TVMGraphRuntime_GetEntryId(graph_runtime.runtime, - graph_runtime.runtime->input_nodes[index], 0); - ret_values[0].v_handle = (void*)&graph_runtime.runtime->data_entry[eid].dl_tensor; + uint32_t eid = TVMGraphExecutor_GetEntryId(graph_executor.executor, + graph_executor.executor->input_nodes[index], 0); + ret_values[0].v_handle = (void*)&graph_executor.executor->data_entry[eid].dl_tensor; ret_tcodes[0] = kTVMNDArrayHandle; return 0; } -int32_t TVMGraphRuntimeModule_GetNumInputs(TVMValue* args, int* tcodes, int nargs, - TVMValue* ret_values, int* ret_tcodes, - void* resource_handle) { +int32_t TVMGraphExecutorModule_GetNumInputs(TVMValue* args, int* tcodes, int nargs, + TVMValue* ret_values, int* ret_tcodes, + void* resource_handle) { if (nargs != 0) { return kTvmErrorFunctionCallNumArguments; } - ret_values[0].v_int64 = TVMGraphRuntime_GetNumInputs(); + ret_values[0].v_int64 = TVMGraphExecutor_GetNumInputs(); ret_tcodes[0] = kTVMArgInt; return 0; } -int32_t TVMGraphRuntimeModule_GetNumOutputs(TVMValue* args, int* tcodes, int nargs, - TVMValue* ret_values, int* ret_tcodes, - void* 
resource_handle) { +int32_t TVMGraphExecutorModule_GetNumOutputs(TVMValue* args, int* tcodes, int nargs, + TVMValue* ret_values, int* ret_tcodes, + void* resource_handle) { if (nargs != 0) { return kTvmErrorFunctionCallNumArguments; } - ret_values[0].v_int64 = TVMGraphRuntime_GetNumOutputs(graph_runtime.runtime); + ret_values[0].v_int64 = TVMGraphExecutor_GetNumOutputs(graph_executor.executor); ret_tcodes[0] = kTVMArgInt; return 0; } -int32_t TVMGraphRuntimeModule_GetOutput(TVMValue* args, int* tcodes, int nargs, - TVMValue* ret_values, int* ret_tcodes, - void* resource_handle) { +int32_t TVMGraphExecutorModule_GetOutput(TVMValue* args, int* tcodes, int nargs, + TVMValue* ret_values, int* ret_tcodes, + void* resource_handle) { if (nargs != 1) { return kTvmErrorFunctionCallNumArguments; } @@ -135,22 +136,22 @@ int32_t TVMGraphRuntimeModule_GetOutput(TVMValue* args, int* tcodes, int nargs, } int output_index = args[0].v_int64; - if (output_index < 0 || output_index > TVMGraphRuntime_GetNumOutputs(graph_runtime.runtime)) { + if (output_index < 0 || output_index > TVMGraphExecutor_GetNumOutputs(graph_executor.executor)) { return kTvmErrorGraphModuleNoSuchInput; } - uint32_t nid = graph_runtime.runtime->outputs[output_index].node_id; - uint32_t index = graph_runtime.runtime->outputs[output_index].index; - uint32_t eid = TVMGraphRuntime_GetEntryId(graph_runtime.runtime, nid, index); + uint32_t nid = graph_executor.executor->outputs[output_index].node_id; + uint32_t index = graph_executor.executor->outputs[output_index].index; + uint32_t eid = TVMGraphExecutor_GetEntryId(graph_executor.executor, nid, index); - ret_values[0].v_handle = (void*)&(graph_runtime.runtime->data_entry[eid].dl_tensor); + ret_values[0].v_handle = (void*)&(graph_executor.executor->data_entry[eid].dl_tensor); ret_tcodes[0] = kTVMNDArrayHandle; return 0; } -int32_t TVMGraphRuntimeModule_LoadParams(TVMValue* args, int* tcodes, int nargs, - TVMValue* ret_values, int* ret_tcodes, - void* resource_handle) { +int32_t TVMGraphExecutorModule_LoadParams(TVMValue* args, int* tcodes, int nargs, + TVMValue* ret_values, int* ret_tcodes, + void* resource_handle) { if (nargs != 1) { return kTvmErrorFunctionCallNumArguments; } @@ -162,23 +163,24 @@ int32_t TVMGraphRuntimeModule_LoadParams(TVMValue* args, int* tcodes, int nargs, ret_tcodes[0] = kTVMNullptr; TVMByteArray* arr = (TVMByteArray*)args[0].v_handle; - return TVMGraphRuntime_LoadParams(graph_runtime.runtime, arr->data, arr->size); + return TVMGraphExecutor_LoadParams(graph_executor.executor, arr->data, arr->size); } -int32_t TVMGraphRuntimeModule_Run(TVMValue* args, int* tcodes, int nargs, TVMValue* ret_values, - int* ret_tcodes, void* resource_handle) { +int32_t TVMGraphExecutorModule_Run(TVMValue* args, int* tcodes, int nargs, TVMValue* ret_values, + int* ret_tcodes, void* resource_handle) { if (nargs != 0) { return kTvmErrorFunctionCallNumArguments; } - TVMGraphRuntime_Run(graph_runtime.runtime); + TVMGraphExecutor_Run(graph_executor.executor); ret_tcodes[0] = kTVMNullptr; return 0; } -int32_t TVMGraphRuntimeModule_SetInput(TVMValue* args, int* tcodes, int nargs, TVMValue* ret_values, - int* ret_tcodes, void* resource_handle) { +int32_t TVMGraphExecutorModule_SetInput(TVMValue* args, int* tcodes, int nargs, + TVMValue* ret_values, int* ret_tcodes, + void* resource_handle) { if (nargs != 2) { return kTvmErrorFunctionCallNumArguments; } @@ -187,26 +189,26 @@ int32_t TVMGraphRuntimeModule_SetInput(TVMValue* args, int* tcodes, int nargs, T return kTvmErrorFunctionCallWrongArgType; } 
- TVMGraphRuntime_SetInput(graph_runtime.runtime, args[0].v_str, (DLTensor*)args[1].v_handle); + TVMGraphExecutor_SetInput(graph_executor.executor, args[0].v_str, (DLTensor*)args[1].v_handle); ret_tcodes[0] = kTVMNullptr; return 0; } -int32_t TVMGraphRuntimeModule_NotImplemented(TVMValue* args, int* tcodes, int nargs, - TVMValue* ret_values, int* ret_tcodes, - void* resource_handle) { +int32_t TVMGraphExecutorModule_NotImplemented(TVMValue* args, int* tcodes, int nargs, + TVMValue* ret_values, int* ret_tcodes, + void* resource_handle) { return kTvmErrorFunctionCallNotImplemented; } -static const TVMBackendPackedCFunc graph_runtime_registry_funcs[] = { - &TVMGraphRuntimeModule_GetInput, &TVMGraphRuntimeModule_GetNumInputs, - &TVMGraphRuntimeModule_GetNumOutputs, &TVMGraphRuntimeModule_GetOutput, - &TVMGraphRuntimeModule_LoadParams, &TVMGraphRuntimeModule_Run, - &TVMGraphRuntimeModule_SetInput, &TVMGraphRuntimeModule_NotImplemented, +static const TVMBackendPackedCFunc graph_executor_registry_funcs[] = { + &TVMGraphExecutorModule_GetInput, &TVMGraphExecutorModule_GetNumInputs, + &TVMGraphExecutorModule_GetNumOutputs, &TVMGraphExecutorModule_GetOutput, + &TVMGraphExecutorModule_LoadParams, &TVMGraphExecutorModule_Run, + &TVMGraphExecutorModule_SetInput, &TVMGraphExecutorModule_NotImplemented, }; -static const TVMFuncRegistry graph_runtime_registry = { +static const TVMFuncRegistry graph_executor_registry = { "\x08get_input\0" "get_num_inputs\0" "get_num_outputs\0" @@ -215,11 +217,11 @@ static const TVMFuncRegistry graph_runtime_registry = { "run\0" "set_input\0" "share_params\0", - graph_runtime_registry_funcs}; + graph_executor_registry_funcs}; -tvm_crt_error_t TVMGraphRuntimeModule_Register() { - graph_runtime.mod.registry = &graph_runtime_registry; - graph_runtime.runtime = NULL; +tvm_crt_error_t TVMGraphExecutorModule_Register() { + graph_executor.mod.registry = &graph_executor_registry; + graph_executor.executor = NULL; - return TVMFuncRegisterGlobal("tvm.graph_runtime.create", &TVMGraphRuntimeModule_Create, 0); + return TVMFuncRegisterGlobal("tvm.graph_executor.create", &TVMGraphExecutorModule_Create, 0); } diff --git a/src/runtime/crt/host/main.cc b/src/runtime/crt/host/main.cc index 15e696b59f46a..e644554179289 100644 --- a/src/runtime/crt/host/main.cc +++ b/src/runtime/crt/host/main.cc @@ -34,8 +34,8 @@ #include "crt_config.h" -#ifdef TVM_HOST_USE_GRAPH_RUNTIME_MODULE -#include +#ifdef TVM_HOST_USE_GRAPH_EXECUTOR_MODULE +#include #endif using namespace std::chrono; @@ -131,9 +131,9 @@ int main(int argc, char** argv) { utvm_rpc_server_t rpc_server = UTvmRpcServerInit(&UTvmWriteFunc, nullptr); -#ifdef TVM_HOST_USE_GRAPH_RUNTIME_MODULE - CHECK_EQ(TVMGraphRuntimeModule_Register(), kTvmErrorNoError, - "failed to register GraphRuntime TVMModule"); +#ifdef TVM_HOST_USE_GRAPH_EXECUTOR_MODULE + CHECK_EQ(TVMGraphExecutorModule_Register(), kTvmErrorNoError, + "failed to register GraphExecutor TVMModule"); #endif if (TVMFuncRegisterGlobal("tvm.testing.reset_server", (TVMFunctionHandle)&testonly_reset_server, diff --git a/src/runtime/crt/include/tvm/runtime/crt/internal/graph_runtime/graph_runtime.h b/src/runtime/crt/include/tvm/runtime/crt/internal/graph_executor/graph_executor.h similarity index 58% rename from src/runtime/crt/include/tvm/runtime/crt/internal/graph_runtime/graph_runtime.h rename to src/runtime/crt/include/tvm/runtime/crt/internal/graph_executor/graph_executor.h index a64076ab61e00..47ef474778e06 100644 --- 
a/src/runtime/crt/include/tvm/runtime/crt/internal/graph_runtime/graph_runtime.h +++ b/src/runtime/crt/include/tvm/runtime/crt/internal/graph_executor/graph_executor.h @@ -18,41 +18,41 @@ */ /*! - * \file src/runtime/crt/include/tvm/runtime/crt/internal/graph_runtime/graph_runtime.h - * \brief Tiny graph runtime that can run graph containing only tvm PackedFunc. + * \file src/runtime/crt/include/tvm/runtime/crt/internal/graph_executor/graph_executor.h + * \brief Tiny graph executor that can run graph containing only tvm PackedFunc. */ -#ifndef TVM_RUNTIME_CRT_INCLUDE_TVM_RUNTIME_CRT_INTERNAL_GRAPH_RUNTIME_GRAPH_RUNTIME_H_ -#define TVM_RUNTIME_CRT_INCLUDE_TVM_RUNTIME_CRT_INTERNAL_GRAPH_RUNTIME_GRAPH_RUNTIME_H_ +#ifndef TVM_RUNTIME_CRT_INCLUDE_TVM_RUNTIME_CRT_INTERNAL_GRAPH_EXECUTOR_GRAPH_EXECUTOR_H_ +#define TVM_RUNTIME_CRT_INCLUDE_TVM_RUNTIME_CRT_INTERNAL_GRAPH_EXECUTOR_GRAPH_EXECUTOR_H_ -#include +#include #include -#include +#include #include // Memory pool entry. -typedef struct TVMGraphRuntimePoolEntry { +typedef struct TVMGraphExecutorPoolEntry { size_t size; int device_type; int entry_id; -} TVMGraphRuntimePoolEntry; +} TVMGraphExecutorPoolEntry; // Node entry -typedef struct TVMGraphRuntimeNodeEntry { +typedef struct TVMGraphExecutorNodeEntry { uint32_t node_id; uint32_t index; uint32_t version; // JSON Loader void (*Load)(JSONReader* reader); -} TVMGraphRuntimeNodeEntry; +} TVMGraphExecutorNodeEntry; // Storage entry. -typedef struct TVMGraphRuntimeStorageEntry { +typedef struct TVMGraphExecutorStorageEntry { uint8_t is_linked_param; TVMNDArray array; -} TVMGraphRuntimeStorageEntry; +} TVMGraphExecutorStorageEntry; // Node -typedef struct TVMGraphRuntimeNode { +typedef struct TVMGraphExecutorNode { // operator type in string char op_type[16]; // name of the op @@ -60,20 +60,20 @@ typedef struct TVMGraphRuntimeNode { // parameters TVMOpParam param; // inputs - TVMGraphRuntimeNodeEntry* inputs; + TVMGraphExecutorNodeEntry* inputs; // number of inputs size_t inputs_count; // control deps uint32_t control_deps[20]; // JSON Loader - void (*LoadAttrs)(struct TVMGraphRuntimeNode* node, JSONReader* reader, TVMOpParam* param); + void (*LoadAttrs)(struct TVMGraphExecutorNode* node, JSONReader* reader, TVMOpParam* param); // JSON Loader - int (*Load)(struct TVMGraphRuntimeNode* node, JSONReader* reader); -} TVMGraphRuntimeNode; + int (*Load)(struct TVMGraphExecutorNode* node, JSONReader* reader); +} TVMGraphExecutorNode; -typedef struct TVMGraphRuntime { +typedef struct TVMGraphExecutor { /*! \brief The graph nodes. */ - TVMGraphRuntimeNode* nodes; + TVMGraphExecutorNode* nodes; /*! \brief The graph nodes counter. */ uint32_t nodes_count; /*! \brief The argument nodes. */ @@ -83,18 +83,18 @@ typedef struct TVMGraphRuntime { uint32_t* node_row_ptr; uint32_t node_row_ptr_count; /*! \brief Output entries. */ - TVMGraphRuntimeNodeEntry* outputs; + TVMGraphExecutorNodeEntry* outputs; /*! \brief Output entries counter. */ uint32_t outputs_count; /*! \brief Additional graph attributes. */ - TVMGraphRuntimeGraphAttr attrs; + TVMGraphExecutorGraphAttr attrs; /*! \brief The code module that contains both host and device code. */ TVMModuleHandle module_handle; /*! \brief Execution context of all devices including the host. */ DLDevice devices[1]; uint32_t devices_count; /*! \brief Common storage pool for all devices. */ - TVMGraphRuntimeStorageEntry* storage_pool; + TVMGraphExecutorStorageEntry* storage_pool; uint32_t storage_pool_count; /*! \brief Data entry of each node. 
*/
 TVMNDArray* data_entry;
@@ -102,20 +102,20 @@ typedef struct TVMGraphRuntime {
 /*! \brief Operator on each node. */
 TVMPackedFunc* op_execs;
 uint32_t op_execs_count;
-} TVMGraphRuntime;
+} TVMGraphExecutor;
 typedef DLTensor* DLTensorPtr;
 // private functions
-uint32_t TVMGraphRuntime_GetEntryId(TVMGraphRuntime* runtime, uint32_t nid, uint32_t index);
-void TVMGraphRuntime_SetInput(TVMGraphRuntime* runtime, const char* name, DLTensor* data_in);
-int TVMGraphRuntime_LoadParams(TVMGraphRuntime* runtime, const char* param_blob,
- const uint32_t param_size);
-void TVMGraphRuntime_Run(TVMGraphRuntime* runtime);
-int TVMGraphRuntime_GetOutput(TVMGraphRuntime* runtime, const int32_t idx, DLTensor* out);
+uint32_t TVMGraphExecutor_GetEntryId(TVMGraphExecutor* executor, uint32_t nid, uint32_t index);
+void TVMGraphExecutor_SetInput(TVMGraphExecutor* executor, const char* name, DLTensor* data_in);
+int TVMGraphExecutor_LoadParams(TVMGraphExecutor* executor, const char* param_blob,
+ const uint32_t param_size);
+void TVMGraphExecutor_Run(TVMGraphExecutor* executor);
+int TVMGraphExecutor_GetOutput(TVMGraphExecutor* executor, const int32_t idx, DLTensor* out);
-int32_t TVMGraphRuntime_CreateTVMOp(TVMGraphRuntime* runtime, const TVMOpParam* param,
- DLTensorPtr* args, const uint32_t args_count,
- uint32_t num_inputs, TVMPackedFunc* pf);
+int32_t TVMGraphExecutor_CreateTVMOp(TVMGraphExecutor* executor, const TVMOpParam* param,
+ DLTensorPtr* args, const uint32_t args_count,
+ uint32_t num_inputs, TVMPackedFunc* pf);
-#endif // TVM_RUNTIME_CRT_INCLUDE_TVM_RUNTIME_CRT_INTERNAL_GRAPH_RUNTIME_GRAPH_RUNTIME_H_
+#endif // TVM_RUNTIME_CRT_INCLUDE_TVM_RUNTIME_CRT_INTERNAL_GRAPH_EXECUTOR_GRAPH_EXECUTOR_H_
diff --git a/src/runtime/crt/include/tvm/runtime/crt/internal/graph_runtime/load_json.h b/src/runtime/crt/include/tvm/runtime/crt/internal/graph_executor/load_json.h
similarity index 90%
rename from src/runtime/crt/include/tvm/runtime/crt/internal/graph_runtime/load_json.h
rename to src/runtime/crt/include/tvm/runtime/crt/internal/graph_executor/load_json.h
index af69506b06349..ac5adc842b625 100644
--- a/src/runtime/crt/include/tvm/runtime/crt/internal/graph_runtime/load_json.h
+++ b/src/runtime/crt/include/tvm/runtime/crt/internal/graph_executor/load_json.h
@@ -18,11 +18,11 @@
 */
/*!
- * \file src/runtime/crt/include/tvm/runtime/crt/internal/graph_runtime/load_json.h
+ * \file src/runtime/crt/include/tvm/runtime/crt/internal/graph_executor/load_json.h
 * \brief Lightweight JSON Reader that reads and saves into C++ data structs.
*/
-#ifndef TVM_RUNTIME_CRT_INCLUDE_TVM_RUNTIME_CRT_INTERNAL_GRAPH_RUNTIME_LOAD_JSON_H_
-#define TVM_RUNTIME_CRT_INCLUDE_TVM_RUNTIME_CRT_INTERNAL_GRAPH_RUNTIME_LOAD_JSON_H_
+#ifndef TVM_RUNTIME_CRT_INCLUDE_TVM_RUNTIME_CRT_INTERNAL_GRAPH_EXECUTOR_LOAD_JSON_H_
+#define TVM_RUNTIME_CRT_INCLUDE_TVM_RUNTIME_CRT_INTERNAL_GRAPH_EXECUTOR_LOAD_JSON_H_
 #include
 #include
@@ -38,9 +38,9 @@ enum {
 JSON_READ_TYPE_S32 = 6,
 JSON_READ_TYPE_F32 = 7,
 JSON_READ_TYPE_F64 = 8,
- JSON_READ_TYPE_GRAPH_RUNTIME_NODE = 9,
- JSON_READ_TYPE_GRAPH_RUNTIME_NODE_ENTRY = 10,
- JSON_READ_TYPE_GRAPH_RUNTIME_GRAPH_ATTR = 11
+ JSON_READ_TYPE_GRAPH_EXECUTOR_NODE = 9,
+ JSON_READ_TYPE_GRAPH_EXECUTOR_NODE_ENTRY = 10,
+ JSON_READ_TYPE_GRAPH_EXECUTOR_GRAPH_ATTR = 11
 };
 typedef struct Seq {
@@ -100,4 +100,4 @@ tvm_crt_error_t JSONReader_Create(const char* is, JSONReader* reader);
 */
 tvm_crt_error_t JSONReader_Release(JSONReader* reader);
-#endif // TVM_RUNTIME_CRT_INCLUDE_TVM_RUNTIME_CRT_INTERNAL_GRAPH_RUNTIME_LOAD_JSON_H_
+#endif // TVM_RUNTIME_CRT_INCLUDE_TVM_RUNTIME_CRT_INTERNAL_GRAPH_EXECUTOR_LOAD_JSON_H_
diff --git a/src/runtime/graph/cuda_graph/graph_runtime_cuda_graph.cc b/src/runtime/graph_executor/cuda_graph/graph_runtime_cuda_graph.cc
similarity index 76%
rename from src/runtime/graph/cuda_graph/graph_runtime_cuda_graph.cc
rename to src/runtime/graph_executor/cuda_graph/graph_runtime_cuda_graph.cc
index 3ad8453c4903b..53f225403be67 100644
--- a/src/runtime/graph/cuda_graph/graph_runtime_cuda_graph.cc
+++ b/src/runtime/graph_executor/cuda_graph/graph_runtime_cuda_graph.cc
@@ -18,28 +18,28 @@
 */
/*!
- * \file graph_runtime_cuda_graph.cc
+ * \file graph_executor_cuda_graph.cc
 */
 #include
 #include "../../cuda/cuda_common.h"
-#include "../graph_runtime.h"
+#include "../graph_executor.h"
 namespace tvm {
 namespace runtime {
/*!
- * \brief Graph runtime with CUDA Graph Support.
+ * \brief Graph executor with CUDA Graph Support.
 *
- * This is the extension of GraphRuntime class used for CUDA graph launch
+ * This is the extension of GraphExecutor class used for CUDA graph launch
 * instead of CUDA kernel launch. CUDA graph launch requires CUDA 10.0 or
 * above. Currently there are two ways of constructing CUDA graphs:
 * (1) Using the CUDA stream capture API to capture a series of operations on
 * a CUDA stream, which automatically generates a graph; (2) Building a graph
 * using the CUDA graph API manually. This implementation uses stream capture.
 */
-class GraphRuntimeCudaGraph : public GraphRuntime {
+class GraphExecutorCudaGraph : public GraphExecutor {
 public:
 /*!
 * \brief Begin CUDA graph capture on a stream; the stream enters capture mode.
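The capture/replay design described above can be driven entirely through the module's packed functions. A hedged host-side sketch follows, assuming the module also exposes a "start_capture" entry point (only "run_cuda_graph" and "end_capture" are visible in the next hunk) and that capture records one full run of the graph:

#include <tvm/runtime/module.h>
#include <tvm/runtime/packed_func.h>

// Capture one execution into a CUDA graph once, then replay it cheaply.
void RunWithCudaGraph(tvm::runtime::Module gmod, int iters) {
  tvm::runtime::PackedFunc start = gmod.GetFunction("start_capture");  // assumed name
  tvm::runtime::PackedFunc stop = gmod.GetFunction("end_capture");
  tvm::runtime::PackedFunc replay = gmod.GetFunction("run_cuda_graph");
  start();  // run once under stream capture, recording all kernel launches
  stop();   // end capture and instantiate the executable CUDA graph
  for (int i = 0; i < iters; ++i) {
    replay();  // one graph launch replaces many individual kernel launches
  }
}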
@@ -93,8 +93,8 @@ class GraphRuntimeCudaGraph : public GraphRuntime {
 cudaGraphExec_t cuda_graph_exec_;
};
-PackedFunc GraphRuntimeCudaGraph::GetFunction(const std::string& name,
- const ObjectPtr& sptr_to_self) {
+PackedFunc GraphExecutorCudaGraph::GetFunction(const std::string& name,
+ const ObjectPtr& sptr_to_self) {
 if (name == "run_cuda_graph") {
 return PackedFunc(
 [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { this->RunCudaGraph(); });
@@ -104,23 +104,24 @@ PackedFunc GraphRuntimeCudaGraph::GetFunction(const std::string& name,
 } else if (name == "end_capture") {
 return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { this->EndCapture(); });
 } else {
- return GraphRuntime::GetFunction(name, sptr_to_self);
+ return GraphExecutor::GetFunction(name, sptr_to_self);
 }
}
-Module GraphRuntimeCudaGraphCreate(const std::string& sym_json, const tvm::runtime::Module& m,
- const std::vector& devs,
- PackedFunc lookup_linked_param_func) {
- auto exec = make_object();
+Module GraphExecutorCudaGraphCreate(const std::string& sym_json, const tvm::runtime::Module& m,
+ const std::vector& devs,
+ PackedFunc lookup_linked_param_func) {
+ auto exec = make_object();
 exec->Init(sym_json, m, devs, lookup_linked_param_func);
 return Module(exec);
}
-TVM_REGISTER_GLOBAL("tvm.graph_runtime_cuda_graph.create")
+TVM_REGISTER_GLOBAL("tvm.graph_executor_cuda_graph.create")
 .set_body([](TVMArgs args, TVMRetValue* rv) {
- ICHECK_GE(args.num_args, 4) << "The expected number of arguments for graph_runtime.create is "
- "at least 4, but it has "
- << args.num_args;
+ ICHECK_GE(args.num_args, 4)
+ << "The expected number of arguments for graph_executor.create is "
+ "at least 4, but it has "
+ << args.num_args;
 PackedFunc lookup_linked_param_func;
 int dev_start_arg = 2;
 if (args[2].type_code() == kTVMPackedFuncHandle) {
@@ -128,8 +129,8 @@ TVM_REGISTER_GLOBAL("tvm.graph_runtime_cuda_graph.create")
 dev_start_arg++;
 }
- *rv = GraphRuntimeCudaGraphCreate(args[0], args[1], GetAllDevice(args, dev_start_arg),
- lookup_linked_param_func);
+ *rv = GraphExecutorCudaGraphCreate(args[0], args[1], GetAllDevice(args, dev_start_arg),
+ lookup_linked_param_func);
 });
} // namespace runtime
} // namespace tvm
diff --git a/src/runtime/graph/debug/graph_runtime_debug.cc b/src/runtime/graph_executor/debug/graph_executor_debug.cc
similarity index 89%
rename from src/runtime/graph/debug/graph_runtime_debug.cc
rename to src/runtime/graph_executor/debug/graph_executor_debug.cc
index fedaf4f890bcd..7c1e6960f9f53 100644
--- a/src/runtime/graph/debug/graph_runtime_debug.cc
+++ b/src/runtime/graph_executor/debug/graph_executor_debug.cc
@@ -18,7 +18,7 @@
 */
/*!
- * \file graph_runtime_debug.cc
+ * \file graph_executor_debug.cc
 */
 #include
 #include
@@ -29,18 +29,18 @@
 #include
 #include
-#include "../graph_runtime.h"
+#include "../graph_executor.h"
 namespace tvm {
 namespace runtime {
/*!
- * \brief Graph runtime with debug .
+ * \brief Graph executor with debug support.
 *
- * This is the extension of GraphRuntime class used for debugging
+ * This is the extension of GraphExecutor class used for debugging
 * TVM runtime PackedFunc API.
 */
-class GraphRuntimeDebug : public GraphRuntime {
+class GraphExecutorDebug : public GraphExecutor {
 public:
 /*!
 * \brief Run each operation in the graph and get the time per op for all ops.
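RunIndividual above is the per-operator profiling entry point. A hedged sketch of calling it from host code, assuming it is exposed under the packed-function name "run_individual" (consistent with the this->RunIndividual dispatch in the next hunk) and that it returns its report as a string:

#include <iostream>
#include <string>
#include <tvm/runtime/module.h>
#include <tvm/runtime/packed_func.h>

// Print the per-op timing report from a debug executor module.
void ProfileOps(tvm::runtime::Module dbg) {
  tvm::runtime::PackedFunc run_individual = dbg.GetFunction("run_individual");  // assumed name
  // 10 runs per measurement, 3 repeats, no minimum wall-clock per repeat
  std::string report = run_individual(10, 3, 0);
  std::cout << report << std::endl;
}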
@@ -58,7 +58,7 @@ class GraphRuntimeDebug : public GraphRuntime {
 */
 std::string RunIndividual(int number, int repeat, int min_repeat_ms) {
 // warmup run
- GraphRuntime::Run();
+ GraphExecutor::Run();
 std::string tkey = module_->type_key();
 std::vector time_sec_per_op(op_execs_.size(), 0);
 if (tkey == "rpc") {
@@ -128,8 +128,8 @@ class GraphRuntimeDebug : public GraphRuntime {
 << "Don't know how to run op type " << nodes_[index].op_type
 << " remotely over RPC right now";
- // NOTE: GraphRuntimeDebug expects graph nodes to have an "op" attribute of "tvm_op" or "null"
- // and "null" is a placeholder node for a parameter or input.
+ // NOTE: GraphExecutorDebug expects graph nodes to have an "op" attribute of "tvm_op" or
+ // "null" and "null" is a placeholder node for a parameter or input.
 return 0;
 }
@@ -235,8 +235,8 @@ class GraphRuntimeDebug : public GraphRuntime {
 * \param name The function which needs to be invoked.
 * \param sptr_to_self Packed function pointer.
 */
-PackedFunc GraphRuntimeDebug::GetFunction(const std::string& name,
- const ObjectPtr& sptr_to_self) {
+PackedFunc GraphExecutorDebug::GetFunction(const std::string& name,
+ const ObjectPtr& sptr_to_self) {
 // return member functions during query.
 if (name == "get_output_by_layer") {
 return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
@@ -261,26 +261,26 @@ PackedFunc GraphRuntimeDebug::GetFunction(const std::string& name,
 *rv = this->RunIndividual(number, repeat, min_repeat_ms);
 });
 } else {
- return GraphRuntime::GetFunction(name, sptr_to_self);
+ return GraphExecutor::GetFunction(name, sptr_to_self);
 }
}
/*!
- * \brief GraphRuntimeDebugCreate Get the function based on input.
+ * \brief GraphExecutorDebugCreate gets the function based on input.
 * \param sym_json The graph symbol in json format.
 * \param m Compiled module which will be loaded.
 * \param devs All devices.
*/ -Module GraphRuntimeDebugCreate(const std::string& sym_json, const tvm::runtime::Module& m, - const std::vector& devs, - PackedFunc lookup_linked_param_func) { - auto exec = make_object(); +Module GraphExecutorDebugCreate(const std::string& sym_json, const tvm::runtime::Module& m, + const std::vector& devs, + PackedFunc lookup_linked_param_func) { + auto exec = make_object(); exec->Init(sym_json, m, devs, lookup_linked_param_func); return Module(exec); } -TVM_REGISTER_GLOBAL("tvm.graph_runtime_debug.create").set_body([](TVMArgs args, TVMRetValue* rv) { - ICHECK_GE(args.num_args, 4) << "The expected number of arguments for graph_runtime.create is " +TVM_REGISTER_GLOBAL("tvm.graph_executor_debug.create").set_body([](TVMArgs args, TVMRetValue* rv) { + ICHECK_GE(args.num_args, 4) << "The expected number of arguments for graph_executor.create is " "at least 4, but it has " << args.num_args; PackedFunc lookup_linked_param_func; @@ -290,8 +290,8 @@ TVM_REGISTER_GLOBAL("tvm.graph_runtime_debug.create").set_body([](TVMArgs args, dev_start_arg++; } - *rv = GraphRuntimeDebugCreate(args[0], args[1], GetAllDevice(args, dev_start_arg), - lookup_linked_param_func); + *rv = GraphExecutorDebugCreate(args[0], args[1], GetAllDevice(args, dev_start_arg), + lookup_linked_param_func); }); } // namespace runtime } // namespace tvm diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph_executor/graph_executor.cc similarity index 88% rename from src/runtime/graph/graph_runtime.cc rename to src/runtime/graph_executor/graph_executor.cc index a4320ee3b9c03..c4d984fe9633c 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph_executor/graph_executor.cc @@ -18,9 +18,9 @@ */ /*! - * \file graph_runtime.cc + * \file graph_executor.cc */ -#include "graph_runtime.h" +#include "graph_executor.h" #include #include @@ -53,7 +53,7 @@ inline size_t GetDataAlignment(const DLTensor& arr) { /*! * \brief Run all the operations one by one. */ -void GraphRuntime::Run() { +void GraphExecutor::Run() { // setup the array and requirements. for (size_t i = 0; i < op_execs_.size(); ++i) { if (op_execs_[i]) op_execs_[i](); @@ -68,9 +68,9 @@ void GraphRuntime::Run() { * executed on. * \param lookup_linked_param_func Linked parameter lookup function. Default is nullptr. */ -void GraphRuntime::Init(const std::string& graph_json, tvm::runtime::Module module, - const std::vector& devs, - const PackedFunc lookup_linked_param_func) { +void GraphExecutor::Init(const std::string& graph_json, tvm::runtime::Module module, + const std::vector& devs, + const PackedFunc lookup_linked_param_func) { std::istringstream is(graph_json); dmlc::JSONReader reader(&is); this->Load(&reader); @@ -94,7 +94,7 @@ void GraphRuntime::Init(const std::string& graph_json, tvm::runtime::Module modu * \param name The name of the input. * \return The index of input. */ -int GraphRuntime::GetInputIndex(const std::string& name) { +int GraphExecutor::GetInputIndex(const std::string& name) { auto it = input_map_.find(name); if (it != input_map_.end()) { return it->second; @@ -106,7 +106,7 @@ int GraphRuntime::GetInputIndex(const std::string& name) { * \param index The input index. * \param data_in The input data. 
*/ -void GraphRuntime::SetInput(int index, DLTensor* data_in) { +void GraphExecutor::SetInput(int index, DLTensor* data_in) { ICHECK_LT(static_cast(index), input_nodes_.size()); uint32_t eid = this->entry_id(input_nodes_[index], 0); data_entry_[eid].CopyFrom(data_in); @@ -116,7 +116,7 @@ void GraphRuntime::SetInput(int index, DLTensor* data_in) { * \param index The input index. * \param data_ref The input data that is referred. */ -void GraphRuntime::SetInputZeroCopy(int index, DLTensor* data_ref) { +void GraphExecutor::SetInputZeroCopy(int index, DLTensor* data_ref) { ICHECK_LT(static_cast(index), input_nodes_.size()); uint32_t eid = this->entry_id(input_nodes_[index], 0); const DLTensor* old_t = data_entry_[eid].operator->(); @@ -141,20 +141,20 @@ void GraphRuntime::SetInputZeroCopy(int index, DLTensor* data_ref) { * * \return The number of outputs from graph. */ -int GraphRuntime::NumOutputs() const { return outputs_.size(); } +int GraphExecutor::NumOutputs() const { return outputs_.size(); } /*! * \brief Get the number of inputs * * \return The number of inputs to the graph. */ -int GraphRuntime::NumInputs() const { return input_nodes_.size(); } +int GraphExecutor::NumInputs() const { return input_nodes_.size(); } /*! * \brief Return NDArray for given input index. * \param index The input index. * * \return NDArray corresponding to given input node index. */ -NDArray GraphRuntime::GetInput(int index) const { +NDArray GraphExecutor::GetInput(int index) const { ICHECK_LT(static_cast(index), input_nodes_.size()); uint32_t eid = this->entry_id(input_nodes_[index], 0); return data_entry_[eid]; @@ -165,7 +165,7 @@ NDArray GraphRuntime::GetInput(int index) const { * * \return NDArray corresponding to given output node index. */ -NDArray GraphRuntime::GetOutput(int index) const { +NDArray GraphExecutor::GetOutput(int index) const { ICHECK_LT(static_cast(index), outputs_.size()); uint32_t eid = this->entry_id(outputs_[index]); return data_entry_[eid]; @@ -175,7 +175,7 @@ NDArray GraphRuntime::GetOutput(int index) const { * \param index The output index. * \param data_out the output data. */ -void GraphRuntime::CopyOutputTo(int index, DLTensor* data_out) { +void GraphExecutor::CopyOutputTo(int index, DLTensor* data_out) { ICHECK_LT(static_cast(index), outputs_.size()); uint32_t eid = this->entry_id(outputs_[index]); @@ -193,12 +193,12 @@ void GraphRuntime::CopyOutputTo(int index, DLTensor* data_out) { * \brief Load parameters from parameter blob. * \param param_blob A binary blob of parameter. 
*/ -void GraphRuntime::LoadParams(const std::string& param_blob) { +void GraphExecutor::LoadParams(const std::string& param_blob) { dmlc::MemoryStringStream strm(const_cast(¶m_blob)); this->LoadParams(&strm); } -void GraphRuntime::LoadParams(dmlc::Stream* strm) { +void GraphExecutor::LoadParams(dmlc::Stream* strm) { Map params = ::tvm::runtime::LoadParams(strm); for (auto& p : params) { int in_idx = GetInputIndex(p.first); @@ -208,7 +208,7 @@ void GraphRuntime::LoadParams(dmlc::Stream* strm) { } } -void GraphRuntime::ShareParams(const GraphRuntime& other, dmlc::Stream* strm) { +void GraphExecutor::ShareParams(const GraphExecutor& other, dmlc::Stream* strm) { uint64_t header, reserved; ICHECK(strm->Read(&header)) << "Invalid parameters file format"; ICHECK(header == kTVMNDArrayListMagic) << "Invalid parameters file format"; @@ -233,13 +233,13 @@ void GraphRuntime::ShareParams(const GraphRuntime& other, dmlc::Stream* strm) { this->SetupOpExecs(); } -void GraphRuntime::LinkedNDArrayDeleter(Object* container) { +void GraphExecutor::LinkedNDArrayDeleter(Object* container) { // container is the NDArray::Container which needs to get deleted. // The data member points to global const memory, so it does not need deleting. delete static_cast(container); } -void GraphRuntime::DefaultLookupLinkedParam(TVMArgs args, TVMRetValue* rv) { +void GraphExecutor::DefaultLookupLinkedParam(TVMArgs args, TVMRetValue* rv) { Module mod = args[0]; int64_t storage_id = args[1]; DLTensor* template_tensor = args[2]; @@ -266,11 +266,11 @@ void GraphRuntime::DefaultLookupLinkedParam(TVMArgs args, TVMRetValue* rv) { std::unique_ptr container{new NDArray::Container( static_cast(opaque_handle), shape_vec, template_tensor->dtype, dev)}; - container->SetDeleter(GraphRuntime::LinkedNDArrayDeleter); + container->SetDeleter(GraphExecutor::LinkedNDArrayDeleter); *rv = NDArray(GetObjectPtr(container.release())); } -void GraphRuntime::SetupStorage() { +void GraphExecutor::SetupStorage() { // Grab saved optimization plan from graph. std::vector vtype; for (const std::string& s_type : attrs_.dltype) { @@ -352,7 +352,7 @@ void GraphRuntime::SetupStorage() { } } -void GraphRuntime::SetupOpExecs() { +void GraphExecutor::SetupOpExecs() { op_execs_.resize(this->GetNumOfNodes()); input_dltensors_.resize(num_node_entries()); std::unordered_set input_node_eids; @@ -389,9 +389,10 @@ void GraphRuntime::SetupOpExecs() { } } -std::pair, std::shared_ptr > GraphRuntime::CreateTVMOp( - const TVMOpParam& param, const std::vector& args, size_t num_inputs) { - std::shared_ptr arg_ptr = std::make_shared(); +std::pair, std::shared_ptr > +GraphExecutor::CreateTVMOp(const TVMOpParam& param, const std::vector& args, + size_t num_inputs) { + std::shared_ptr arg_ptr = std::make_shared(); // setup address. arg_ptr->args = args; if (param.flatten_data) { @@ -438,8 +439,8 @@ std::pair, std::shared_ptr > GraphRu return {fexec, arg_ptr}; } -PackedFunc GraphRuntime::GetFunction(const std::string& name, - const ObjectPtr& sptr_to_self) { +PackedFunc GraphExecutor::GetFunction(const std::string& name, + const ObjectPtr& sptr_to_self) { // Return member functions during query. 
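Before the dispatch below, a hedged sketch of how a host program typically drives these entry points; "set_input" and "share_params" appear in the branches that follow, while "run" and "get_output" are assumed from the executor's standard interface, and gmod is assumed to come from the "tvm.graph_executor.create" global registered at the end of this file:

#include <tvm/runtime/module.h>
#include <tvm/runtime/ndarray.h>
#include <tvm/runtime/packed_func.h>

// Minimal set_input -> run -> get_output round trip on an executor module.
void RunOnce(tvm::runtime::Module gmod, tvm::runtime::NDArray input) {
  gmod.GetFunction("set_input")(0, input);  // inputs may also be set by name
  gmod.GetFunction("run")();
  tvm::runtime::NDArray out = gmod.GetFunction("get_output")(0);  // assumed name
  (void)out;  // first output entry, on the device it was computed on
}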
 if (name == "set_input") {
 return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
@@ -494,20 +495,20 @@ PackedFunc GraphRuntime::GetFunction(
 } else if (name == "share_params") {
 return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
 const auto& module = args[0].operator Module();
- ICHECK_EQ(module.operator->()->type_key(), std::string("GraphRuntime"));
+ ICHECK_EQ(module.operator->()->type_key(), std::string("GraphExecutor"));
 const auto& param_blob = args[1].operator std::string();
 dmlc::MemoryStringStream strm(const_cast(&param_blob));
- this->ShareParams(dynamic_cast(*module.operator->()), &strm);
+ this->ShareParams(dynamic_cast(*module.operator->()), &strm);
 });
 } else {
 return PackedFunc();
 }
}
-Module GraphRuntimeCreate(const std::string& sym_json, const tvm::runtime::Module& m,
- const std::vector& devs,
- const PackedFunc lookup_linked_param_func) {
- auto exec = make_object();
+Module GraphExecutorCreate(const std::string& sym_json, const tvm::runtime::Module& m,
+ const std::vector& devs,
+ const PackedFunc lookup_linked_param_func) {
+ auto exec = make_object();
 exec->Init(sym_json, m, devs, lookup_linked_param_func);
 return Module(exec);
}
@@ -531,8 +532,8 @@ std::vector GetAllDevice(const TVMArgs& args, int dev_start_arg) {
 // execution support yet. For heterogeneous execution, at least 5 arguments will
 // be passed in. The third one is the number of devices.
 // Eventually, we will only probably pass Device for all the languages.
-TVM_REGISTER_GLOBAL("tvm.graph_runtime.create").set_body([](TVMArgs args, TVMRetValue* rv) {
- ICHECK_GE(args.num_args, 4) << "The expected number of arguments for graph_runtime.create is "
+TVM_REGISTER_GLOBAL("tvm.graph_executor.create").set_body([](TVMArgs args, TVMRetValue* rv) {
+ ICHECK_GE(args.num_args, 4) << "The expected number of arguments for graph_executor.create is "
 "at least 4, but it has "
 << args.num_args;
 PackedFunc lookup_linked_param_func;
 int dev_start_arg = 2;
 if (args[2].type_code() == kTVMPackedFuncHandle) {
@@ -542,7 +543,7 @@ TVM_REGISTER_GLOBAL("tvm.graph_runtime.create").set_body([](TVMArgs args, TVMRet
 dev_start_arg++;
 }
 const auto& devices = GetAllDevice(args, dev_start_arg);
- *rv = GraphRuntimeCreate(args[0], args[1], devices, lookup_linked_param_func);
+ *rv = GraphExecutorCreate(args[0], args[1], devices, lookup_linked_param_func);
});
} // namespace runtime
} // namespace tvm
diff --git a/src/runtime/graph/graph_runtime.h b/src/runtime/graph_executor/graph_executor.h
similarity index 95%
rename from src/runtime/graph/graph_runtime.h
rename to src/runtime/graph_executor/graph_executor.h
index aeaee9e3483ed..37a47f6971e69 100644
--- a/src/runtime/graph/graph_runtime.h
+++ b/src/runtime/graph_executor/graph_executor.h
@@ -18,12 +18,12 @@
 */
/*!
- * \brief Tiny graph runtime that can run graph
+ * \brief Tiny graph executor that can run a graph
 * containing only tvm PackedFunc.
- * \file graph_runtime.h
+ * \file graph_executor.h
 */
-#ifndef TVM_RUNTIME_GRAPH_GRAPH_RUNTIME_H_
-#define TVM_RUNTIME_GRAPH_GRAPH_RUNTIME_H_
+#ifndef TVM_RUNTIME_GRAPH_EXECUTOR_GRAPH_EXECUTOR_H_
+#define TVM_RUNTIME_GRAPH_EXECUTOR_GRAPH_EXECUTOR_H_
 #include
 #include
@@ -56,12 +56,12 @@ struct TVMOpParam {
 };
/*!
- * \brief Tiny graph runtime.
+ * \brief Tiny graph executor.
 *
 * This runtime can be accessed in various languages via the
 * TVM runtime PackedFunc API.
*/ -class TVM_DLL GraphRuntime : public ModuleNode { +class TVM_DLL GraphExecutor : public ModuleNode { struct OpArgs { std::vector args; std::vector arg_values; @@ -81,7 +81,7 @@ class TVM_DLL GraphRuntime : public ModuleNode { /*! * \return The type key of the executor. */ - const char* type_key() const final { return "GraphRuntime"; } + const char* type_key() const final { return "GraphExecutor"; } void Run(); /*! @@ -162,12 +162,12 @@ class TVM_DLL GraphRuntime : public ModuleNode { void LoadParams(const std::string& param_blob); /*! - * \brief Share parameters from pre-existing GraphRuntime instance. - * \param other A GraphRuntime instance, previously with |LoadParams| called with the + * \brief Share parameters from pre-existing GraphExecutor instance. + * \param other A GraphExecutor instance, previously with |LoadParams| called with the * identical input |param_blob|. * \param strm The input stream. */ - void ShareParams(const GraphRuntime& other, dmlc::Stream* strm); + void ShareParams(const GraphExecutor& other, dmlc::Stream* strm); /*! * \brief Get total number of nodes. @@ -430,4 +430,4 @@ std::vector GetAllDevice(const TVMArgs& args, int dev_start_arg); } // namespace runtime } // namespace tvm -#endif // TVM_RUNTIME_GRAPH_GRAPH_RUNTIME_H_ +#endif // TVM_RUNTIME_GRAPH_EXECUTOR_GRAPH_EXECUTOR_H_ diff --git a/src/runtime/graph/graph_runtime_factory.cc b/src/runtime/graph_executor/graph_executor_factory.cc similarity index 65% rename from src/runtime/graph/graph_runtime_factory.cc rename to src/runtime/graph_executor/graph_executor_factory.cc index a6f87bac8a6e1..a6cef931421b1 100644 --- a/src/runtime/graph/graph_runtime_factory.cc +++ b/src/runtime/graph_executor/graph_executor_factory.cc @@ -18,11 +18,11 @@ */ /*! - * \file graph_runtime_factory.cc - * \brief Graph runtime factory implementations + * \file graph_executor_factory.cc + * \brief Graph executor factory implementations */ -#include "./graph_runtime_factory.h" +#include "./graph_executor_factory.h" #include #include @@ -34,7 +34,7 @@ namespace tvm { namespace runtime { -GraphRuntimeFactory::GraphRuntimeFactory( +GraphExecutorFactory::GraphExecutorFactory( const std::string& graph_json, const std::unordered_map& params, const std::string& module_name) { @@ -43,7 +43,7 @@ GraphRuntimeFactory::GraphRuntimeFactory( module_name_ = module_name; } -PackedFunc GraphRuntimeFactory::GetFunction( +PackedFunc GraphExecutorFactory::GetFunction( const std::string& name, const tvm::runtime::ObjectPtr& sptr_to_self) { if (name == module_name_) { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { @@ -51,7 +51,7 @@ PackedFunc GraphRuntimeFactory::GetFunction( for (int i = 0; i < args.num_args; ++i) { devices.emplace_back(args[i].operator Device()); } - *rv = this->RuntimeCreate(devices); + *rv = this->ExecutorCreate(devices); }); } else if (name == "debug_create") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { @@ -62,13 +62,13 @@ PackedFunc GraphRuntimeFactory::GetFunction( for (int i = 1; i < args.num_args; ++i) { devices.emplace_back(args[i].operator Device()); } - *rv = this->DebugRuntimeCreate(devices); + *rv = this->DebugExecutorCreate(devices); }); } else if (name == "remove_params") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { std::unordered_map empty_params{}; auto exec = - make_object(this->graph_json_, empty_params, this->module_name_); + make_object(this->graph_json_, empty_params, this->module_name_); exec->Import(this->imports_[0]); *rv = 
Module(exec);
 });
 } else if (name == "cuda_graph_create") {
 return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
 std::vector devices;
 for (int i = 0; i < args.num_args; ++i) {
 devices.emplace_back(args[i].operator Device());
 }
- *rv = this->CudaGraphRuntimeCreate(devices);
+ *rv = this->CudaGraphExecutorCreate(devices);
 });
 } else {
 return PackedFunc();
 }
}
-void GraphRuntimeFactory::SaveToBinary(dmlc::Stream* stream) {
+void GraphExecutorFactory::SaveToBinary(dmlc::Stream* stream) {
 stream->Write(graph_json_);
 std::vector names;
 std::vector arrays;
@@ -103,19 +103,19 @@ void GraphRuntimeFactory::SaveToBinary(dmlc::Stream* stream) {
 stream->Write(module_name_);
}
-Module GraphRuntimeFactory::RuntimeCreate(const std::vector& devs) {
- auto exec = make_object();
+Module GraphExecutorFactory::ExecutorCreate(const std::vector& devs) {
+ auto exec = make_object();
 exec->Init(this->graph_json_, this->imports_[0], devs, PackedFunc());
 // set params
 SetParams(exec.get(), this->params_);
 return Module(exec);
}
-Module GraphRuntimeFactory::DebugRuntimeCreate(const std::vector& devs) {
- const PackedFunc* pf = tvm::runtime::Registry::Get("tvm.graph_runtime_debug.create");
- ICHECK(pf != nullptr) << "Cannot find function tvm.graph_runtime_debug.create in registry. "
- "Do you enable debug graph runtime build?";
- // Debug runtime create packed function will call GetAllContexs, so we unpack the devs.
+Module GraphExecutorFactory::DebugExecutorCreate(const std::vector& devs) {
+ const PackedFunc* pf = tvm::runtime::Registry::Get("tvm.graph_executor_debug.create");
+ ICHECK(pf != nullptr) << "Cannot find function tvm.graph_executor_debug.create in registry. "
+ "Did you enable the debug graph executor build?";
+ // Debug executor create packed function will call GetAllContexts, so we unpack the devs.
 std::vector unpacked_devs;
 for (const auto& dev : devs) {
 unpacked_devs.emplace_back(dev.device_type);
@@ -133,15 +133,15 @@ Module GraphRuntimeFactory::DebugRuntimeCreate(const std::vector& devs)
 TVMRetValue rv;
 pf->CallPacked(TVMArgs(values.data(), codes.data(), args_size), &rv);
 Module mod = rv.operator Module();
- // debug graph runtime is one child class of graph runtime.
- SetParams(const_cast(mod.as()), this->params_);
+ // debug graph executor is a child class of graph executor.
+ SetParams(const_cast(mod.as()), this->params_);
 return mod;
}
-Module GraphRuntimeFactory::CudaGraphRuntimeCreate(const std::vector& devs) {
- const PackedFunc* pf = tvm::runtime::Registry::Get("tvm.graph_runtime_cuda_graph.create");
- ICHECK(pf != nullptr) << "Cannot find function tvm.graph_runtime_cuda_graph.create in registry. "
- "Did you set(USE_GRAPH_RUNTIME_CUGRAPH=ON)?";
+Module GraphExecutorFactory::CudaGraphExecutorCreate(const std::vector& devs) {
+ const PackedFunc* pf = tvm::runtime::Registry::Get("tvm.graph_executor_cuda_graph.create");
+ ICHECK(pf != nullptr) << "Cannot find function tvm.graph_executor_cuda_graph.create in registry. 
" + "Did you set(USE_GRAPH_EXECUTOR_CUGRAPH=ON)?"; std::vector unpacked_devs; for (const auto& dev : devs) { unpacked_devs.emplace_back(dev.device_type); @@ -159,11 +159,11 @@ Module GraphRuntimeFactory::CudaGraphRuntimeCreate(const std::vector& de TVMRetValue rv; pf->CallPacked(TVMArgs(values.data(), codes.data(), args_size), &rv); Module mod = rv.operator Module(); - SetParams(const_cast(mod.as()), this->params_); + SetParams(const_cast(mod.as()), this->params_); return mod; } -Module GraphRuntimeFactoryModuleLoadBinary(void* strm) { +Module GraphExecutorFactoryModuleLoadBinary(void* strm) { dmlc::Stream* stream = static_cast(strm); std::string graph_json; std::unordered_map params; @@ -180,30 +180,31 @@ Module GraphRuntimeFactoryModuleLoadBinary(void* strm) { params[names[i]] = temp; } ICHECK(stream->Read(&module_name)); - auto exec = make_object(graph_json, params, module_name); + auto exec = make_object(graph_json, params, module_name); return Module(exec); } -TVM_REGISTER_GLOBAL("tvm.graph_runtime_factory.create").set_body([](TVMArgs args, TVMRetValue* rv) { - ICHECK_GE(args.num_args, 3) << "The expected number of arguments for " - "graph_runtime_factory.create needs at least 3, " - "but it has " - << args.num_args; - // The argument order is graph_json, module, module_name, param0_name, param0_tensor, - // [param1_name, param1_tensor], ... - ICHECK_EQ((args.size() - 3) % 2, 0); - std::unordered_map params; - for (size_t i = 3; i < static_cast(args.size()); i += 2) { - std::string name = args[i].operator String(); - params[name] = args[i + 1].operator tvm::runtime::NDArray(); - } - auto exec = make_object(args[0], params, args[2]); - exec->Import(args[1]); - *rv = Module(exec); -}); +TVM_REGISTER_GLOBAL("tvm.graph_executor_factory.create") + .set_body([](TVMArgs args, TVMRetValue* rv) { + ICHECK_GE(args.num_args, 3) << "The expected number of arguments for " + "graph_executor_factory.create needs at least 3, " + "but it has " + << args.num_args; + // The argument order is graph_json, module, module_name, param0_name, param0_tensor, + // [param1_name, param1_tensor], ... + ICHECK_EQ((args.size() - 3) % 2, 0); + std::unordered_map params; + for (size_t i = 3; i < static_cast(args.size()); i += 2) { + std::string name = args[i].operator String(); + params[name] = args[i + 1].operator tvm::runtime::NDArray(); + } + auto exec = make_object(args[0], params, args[2]); + exec->Import(args[1]); + *rv = Module(exec); + }); -TVM_REGISTER_GLOBAL("runtime.module.loadbinary_GraphRuntimeFactory") - .set_body_typed(GraphRuntimeFactoryModuleLoadBinary); +TVM_REGISTER_GLOBAL("runtime.module.loadbinary_GraphExecutorFactory") + .set_body_typed(GraphExecutorFactoryModuleLoadBinary); } // namespace runtime } // namespace tvm diff --git a/src/runtime/graph/graph_runtime_factory.h b/src/runtime/graph_executor/graph_executor_factory.h similarity index 68% rename from src/runtime/graph/graph_runtime_factory.h rename to src/runtime/graph_executor/graph_executor_factory.h index 86958218a0f75..46346cbea002a 100644 --- a/src/runtime/graph/graph_runtime_factory.h +++ b/src/runtime/graph_executor/graph_executor_factory.h @@ -18,12 +18,12 @@ */ /*! - * \file tvm/runtime/graph_runtime_factory.h - * \brief Graph runtime factory creating graph runtime. + * \file tvm/runtime/graph_executor/graph_executor_factory.h + * \brief Graph executor factory creating graph executor. 
*/ -#ifndef TVM_RUNTIME_GRAPH_GRAPH_RUNTIME_FACTORY_H_ -#define TVM_RUNTIME_GRAPH_GRAPH_RUNTIME_FACTORY_H_ +#ifndef TVM_RUNTIME_GRAPH_EXECUTOR_GRAPH_EXECUTOR_FACTORY_H_ +#define TVM_RUNTIME_GRAPH_EXECUTOR_GRAPH_EXECUTOR_FACTORY_H_ #include #include @@ -37,22 +37,22 @@ #include #include -#include "./graph_runtime.h" +#include "./graph_executor.h" namespace tvm { namespace runtime { -class TVM_DLL GraphRuntimeFactory : public runtime::ModuleNode { +class TVM_DLL GraphExecutorFactory : public runtime::ModuleNode { public: /*! - * \brief Construct the GraphRuntimeFactory. + * \brief Construct the GraphExecutorFactory. * \param graph_json The execution graph. * \param params The params of graph. * \param module_name The module name of graph. */ - GraphRuntimeFactory(const std::string& graph_json, - const std::unordered_map& params, - const std::string& module_name = "default"); + GraphExecutorFactory(const std::string& graph_json, + const std::unordered_map& params, + const std::string& module_name = "default"); /*! * \brief Get member function to front-end @@ -65,7 +65,7 @@ class TVM_DLL GraphRuntimeFactory : public runtime::ModuleNode { /*! * \return The type key of the executor. */ - const char* type_key() const override { return "GraphRuntimeFactory"; } + const char* type_key() const override { return "GraphExecutorFactory"; } /*! * \brief Save the module to binary stream. @@ -74,35 +74,35 @@ class TVM_DLL GraphRuntimeFactory : public runtime::ModuleNode { void SaveToBinary(dmlc::Stream* stream) override; /*! - * \brief Create a specific runtime module + * \brief Create a specific executor module * \param devs The device of the host and devices where graph nodes will be * executed on. - * \return created runtime module + * \return created executor module */ - Module RuntimeCreate(const std::vector& devs); + Module ExecutorCreate(const std::vector& devs); /*! - * \brief Create a specific debug runtime module + * \brief Create a specific debug executor module * \param devs The device of the host and devices where graph nodes will be * executed on. - * \return created debug runtime module + * \return created debug executor module */ - Module DebugRuntimeCreate(const std::vector& devs); + Module DebugExecutorCreate(const std::vector& devs); /*! - * \brief Create a specific cuda graph runtime module + * \brief Create a specific cuda graph executor module * \param devs The device of the host and devices where graph nodes will be * executed on. - * \return created cuda graph runtime module + * \return created cuda graph executor module */ - Module CudaGraphRuntimeCreate(const std::vector& devs); + Module CudaGraphExecutorCreate(const std::vector& devs); /*! * \brief Set params. - * \param graph_runtime The graph runtime we want to set the params into. + * \param graph_executor The graph executor we want to set the params into. * \param params The graph params value we want to set. 
*/
- void SetParams(GraphRuntime* graph_runtime,
+ void SetParams(GraphExecutor* graph_executor,
 const std::unordered_map& params) const {
 std::unordered_map value = params;
 // upload big arrays first to avoid memory issues in rpc mode
@@ -117,9 +117,9 @@ class TVM_DLL GraphRuntimeFactory : public runtime::ModuleNode {
 return lhs_size > rhs_size;
 });
 for (const auto& key : keys) {
- int in_idx = graph_runtime->GetInputIndex(key);
+ int in_idx = graph_executor->GetInputIndex(key);
 if (in_idx >= 0) {
- graph_runtime->SetInput(in_idx, const_cast(value[key].operator->()));
+ graph_executor->SetInput(in_idx, const_cast(value[key].operator->()));
 }
 }
 }
@@ -136,4 +136,4 @@ class TVM_DLL GraphRuntimeFactory : public runtime::ModuleNode {
} // namespace runtime
} // namespace tvm
-#endif // TVM_RUNTIME_GRAPH_GRAPH_RUNTIME_FACTORY_H_
+#endif // TVM_RUNTIME_GRAPH_EXECUTOR_GRAPH_EXECUTOR_FACTORY_H_
diff --git a/src/runtime/metadata_module.cc b/src/runtime/metadata_module.cc
index 665c72cc5e0d1..4a1d89ce1a1fd 100644
--- a/src/runtime/metadata_module.cc
+++ b/src/runtime/metadata_module.cc
@@ -21,7 +21,7 @@
 * \file src/runtime/metadata_module.cc
 * \brief A wrapper for initializing imported modules using metadata. This
 * module is intended to be used by various runtimes in the TVM stack, i.e.
- * graph runtime, relay VM, AOT runtime, and various user defined runtimes. It
+ * graph executor, relay VM, AOT runtime, and various user defined runtimes. It
 * paves the way to separate the code and metadata, which makes compilation
 * and/or interpretation more convenient. In addition, the clear separation of
 * code and metadata significantly reduces the efforts for handling external
diff --git a/src/runtime/micro/standalone/utvm_graph_runtime.cc b/src/runtime/micro/standalone/utvm_graph_executor.cc
similarity index 96%
rename from src/runtime/micro/standalone/utvm_graph_runtime.cc
rename to src/runtime/micro/standalone/utvm_graph_executor.cc
index 897a4f4b5c0ce..920faa134cf5c 100644
--- a/src/runtime/micro/standalone/utvm_graph_runtime.cc
+++ b/src/runtime/micro/standalone/utvm_graph_executor.cc
@@ -17,7 +17,7 @@
 * under the License.
*/ -#include "utvm_graph_runtime.h" +#include "utvm_graph_executor.h" #include @@ -226,7 +226,7 @@ void* DSOModule::GetSymbol(const char* name) const { return f; } -MicroGraphRuntime::MicroGraphRuntime(const std::string& graph_json, DSOModule* module) { +MicroGraphExecutor::MicroGraphExecutor(const std::string& graph_json, DSOModule* module) { assert(module); module_ = module; picojson::value v; @@ -240,28 +240,28 @@ MicroGraphRuntime::MicroGraphRuntime(const std::string& graph_json, DSOModule* m SetupOpExecs(); } -MicroGraphRuntime::~MicroGraphRuntime() {} +MicroGraphExecutor::~MicroGraphExecutor() {} -void MicroGraphRuntime::Run() { +void MicroGraphExecutor::Run() { for (size_t i = 0; i < op_execs_.size(); ++i) { if (op_execs_[i]) op_execs_[i](); } } -void MicroGraphRuntime::SetInput(int index, DLTensor* data_in) { +void MicroGraphExecutor::SetInput(int index, DLTensor* data_in) { assert(static_cast(index) < input_nodes_.size()); uint32_t eid = this->entry_id(input_nodes_[index], 0); data_entry_[eid].CopyFrom(data_in); } -void MicroGraphRuntime::CopyOutputTo(int index, DLTensor* data_out) { +void MicroGraphExecutor::CopyOutputTo(int index, DLTensor* data_out) { assert(static_cast(index) < outputs_.size()); uint32_t eid = this->entry_id(outputs_[index]); const NDArray& data = data_entry_[eid]; data.CopyTo(data_out); } -void MicroGraphRuntime::SetupStorage() { +void MicroGraphExecutor::SetupStorage() { // Grab saved optimization plan from graph. DynArray vtype(attrs_.dltype.size()); for (size_t i = 0; i < attrs_.dltype.size(); ++i) { @@ -373,7 +373,7 @@ std::function CreateTVMOp(const DSOModule& module, const TVMOpParam& par return fexec; } -void MicroGraphRuntime::SetupOpExecs() { +void MicroGraphExecutor::SetupOpExecs() { op_execs_.resize(nodes_.size()); // setup the array and requirements. for (uint32_t nid = 0; nid < nodes_.size(); ++nid) { diff --git a/src/runtime/micro/standalone/utvm_graph_runtime.h b/src/runtime/micro/standalone/utvm_graph_executor.h similarity index 90% rename from src/runtime/micro/standalone/utvm_graph_runtime.h rename to src/runtime/micro/standalone/utvm_graph_executor.h index 3a2519c98fd4e..afede6a7b30ac 100644 --- a/src/runtime/micro/standalone/utvm_graph_runtime.h +++ b/src/runtime/micro/standalone/utvm_graph_executor.h @@ -17,8 +17,8 @@ * under the License. */ -#ifndef TVM_RUNTIME_MICRO_STANDALONE_UTVM_GRAPH_RUNTIME_H_ -#define TVM_RUNTIME_MICRO_STANDALONE_UTVM_GRAPH_RUNTIME_H_ +#ifndef TVM_RUNTIME_MICRO_STANDALONE_UTVM_GRAPH_EXECUTOR_H_ +#define TVM_RUNTIME_MICRO_STANDALONE_UTVM_GRAPH_EXECUTOR_H_ #include @@ -116,12 +116,12 @@ class NDArray { DLDevice device_; }; -// Minimal GraphRuntime implementation -class MicroGraphRuntime { +// Minimal GraphExecutor implementation +class MicroGraphExecutor { public: - // Construct a GraphRuntime with the given graph and DSOModule. - MicroGraphRuntime(const std::string& graph_json, DSOModule* module); - ~MicroGraphRuntime(); + // Construct a GraphExecutor with the given graph and DSOModule. 
+ MicroGraphExecutor(const std::string& graph_json, DSOModule* module); + ~MicroGraphExecutor(); // Run the graph void Run(); // Set the input at `index` to a copy of the tensor `data_in` @@ -164,4 +164,4 @@ class MicroGraphRuntime { } // namespace micro } // namespace tvm -#endif // TVM_RUNTIME_MICRO_STANDALONE_UTVM_GRAPH_RUNTIME_H_ +#endif // TVM_RUNTIME_MICRO_STANDALONE_UTVM_GRAPH_EXECUTOR_H_ diff --git a/src/runtime/micro/standalone/utvm_runtime.cc b/src/runtime/micro/standalone/utvm_runtime.cc index 73d616b6d482c..585da9300128f 100644 --- a/src/runtime/micro/standalone/utvm_runtime.cc +++ b/src/runtime/micro/standalone/utvm_runtime.cc @@ -20,28 +20,28 @@ #include -#include "utvm_graph_runtime.h" +#include "utvm_graph_executor.h" void* UTVMRuntimeCreate(const char* json, size_t json_len, void* module) { - return new tvm::micro::MicroGraphRuntime(std::string(json, json + json_len), - reinterpret_cast(module)); + return new tvm::micro::MicroGraphExecutor(std::string(json, json + json_len), + reinterpret_cast(module)); } void UTVMRuntimeDestroy(void* handle) { - delete reinterpret_cast(handle); + delete reinterpret_cast(handle); } void UTVMRuntimeSetInput(void* handle, int index, void* tensor) { - reinterpret_cast(handle)->SetInput( + reinterpret_cast(handle)->SetInput( index, reinterpret_cast(tensor)); } void UTVMRuntimeRun(void* handle) { - reinterpret_cast(handle)->Run(); + reinterpret_cast(handle)->Run(); } void UTVMRuntimeGetOutput(void* handle, int index, void* tensor) { - reinterpret_cast(handle)->CopyOutputTo( + reinterpret_cast(handle)->CopyOutputTo( index, reinterpret_cast(tensor)); } void* UTVMRuntimeDSOModuleCreate(const char* so, size_t so_len) { diff --git a/src/support/libinfo.cc b/src/support/libinfo.cc index d6c8f17995960..ea3a22e8ab01b 100644 --- a/src/support/libinfo.cc +++ b/src/support/libinfo.cc @@ -76,12 +76,12 @@ #define TVM_INFO_USE_STACKVM_RUNTIME "NOT-FOUND" #endif -#ifndef TVM_INFO_USE_GRAPH_RUNTIME -#define TVM_INFO_USE_GRAPH_RUNTIME "NOT-FOUND" +#ifndef TVM_INFO_USE_GRAPH_EXECUTOR +#define TVM_INFO_USE_GRAPH_EXECUTOR "NOT-FOUND" #endif -#ifndef TVM_INFO_USE_GRAPH_RUNTIME_DEBUG -#define TVM_INFO_USE_GRAPH_RUNTIME_DEBUG "NOT-FOUND" +#ifndef TVM_INFO_USE_GRAPH_EXECUTOR_DEBUG +#define TVM_INFO_USE_GRAPH_EXECUTOR_DEBUG "NOT-FOUND" #endif #ifndef TVM_INFO_USE_OPENMP @@ -200,8 +200,8 @@ #define TVM_INFO_USE_ARM_COMPUTE_LIB "NOT-FOUND" #endif -#ifndef TVM_INFO_USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME -#define TVM_INFO_USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME "NOT-FOUND" +#ifndef TVM_INFO_USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR +#define TVM_INFO_USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR "NOT-FOUND" #endif #ifndef TVM_INFO_INDEX_DEFAULT_I64 @@ -234,8 +234,8 @@ TVM_DLL Map GetLibInfo() { {"USE_LLVM", TVM_INFO_USE_LLVM}, {"LLVM_VERSION", TVM_INFO_LLVM_VERSION}, {"USE_STACKVM_RUNTIME", TVM_INFO_USE_STACKVM_RUNTIME}, - {"USE_GRAPH_RUNTIME", TVM_INFO_USE_GRAPH_RUNTIME}, - {"USE_GRAPH_RUNTIME_DEBUG", TVM_INFO_USE_GRAPH_RUNTIME_DEBUG}, + {"USE_GRAPH_EXECUTOR", TVM_INFO_USE_GRAPH_EXECUTOR}, + {"USE_GRAPH_EXECUTOR_DEBUG", TVM_INFO_USE_GRAPH_EXECUTOR_DEBUG}, {"USE_OPENMP", TVM_INFO_USE_OPENMP}, {"USE_RELAY_DEBUG", TVM_INFO_USE_RELAY_DEBUG}, {"USE_RTTI", TVM_INFO_USE_RTTI}, @@ -265,7 +265,7 @@ TVM_DLL Map GetLibInfo() { {"USE_COREML", TVM_INFO_USE_COREML}, {"USE_TARGET_ONNX", TVM_INFO_USE_TARGET_ONNX}, {"USE_ARM_COMPUTE_LIB", TVM_INFO_USE_ARM_COMPUTE_LIB}, - {"USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME", TVM_INFO_USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME}, + {"USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR", 
TVM_INFO_USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR}, {"INDEX_DEFAULT_I64", TVM_INFO_INDEX_DEFAULT_I64}, {"TVM_CXX_COMPILER_PATH", TVM_CXX_COMPILER_PATH}}; return result; diff --git a/src/target/metadata_module.cc b/src/target/metadata_module.cc index 0b30d42c876ce..8184e9189c4b4 100644 --- a/src/target/metadata_module.cc +++ b/src/target/metadata_module.cc @@ -35,7 +35,7 @@ namespace codegen { /*! * \brief Create a metadata module wrapper. The helper is used by different - * codegens, such as graph runtime codegen and the vm compiler. + * codegens, such as graph executor codegen and the vm compiler. * * \param params The metadata for initialization of all modules. * \param target_module the internal module that is compiled by tvm. diff --git a/tests/azure-pipelines/main.yml b/tests/azure-pipelines/main.yml index 094c1df12739c..49d488aba5fdb 100644 --- a/tests/azure-pipelines/main.yml +++ b/tests/azure-pipelines/main.yml @@ -35,7 +35,7 @@ jobs: cmakeArgs: > -DUSE_SORT=ON -DUSE_RPC=ON - -DUSE_GRAPH_RUNTIME=ON + -DUSE_GRAPH_EXECUTOR=ON .. - task: MSBuild@1 inputs: @@ -56,7 +56,7 @@ jobs: cmakeArgs: > -DUSE_SORT=ON -DUSE_RPC=ON - -DUSE_GRAPH_RUNTIME=ON + -DUSE_GRAPH_EXECUTOR=ON .. - task: MSBuild@1 inputs: @@ -75,7 +75,7 @@ jobs: cmakeArgs: > -DUSE_SORT=ON -DUSE_RPC=ON - -DUSE_GRAPH_RUNTIME=ON + -DUSE_GRAPH_EXECUTOR=ON .. - script: cd build.common && make -j`sysctl -n hw.ncpu` displayName: Build the project diff --git a/tests/cpp/build_module_test.cc b/tests/cpp/build_module_test.cc index ed50e3c86e855..e9373936e0d47 100644 --- a/tests/cpp/build_module_test.cc +++ b/tests/cpp/build_module_test.cc @@ -163,16 +163,16 @@ TEST(BuildModule, Heterogeneous) { pc[i] = i - 1.0; } - // Initialize graph runtime. + // Initialize graph executor. int cpu_dev_ty = static_cast(kDLCPU); int cpu_dev_id = 0; int gpu_dev_ty = static_cast(kDLGPU); int gpu_dev_id = 0; - const runtime::PackedFunc* graph_runtime = - tvm::runtime::Registry::Get("tvm.graph_runtime.create"); + const runtime::PackedFunc* graph_executor = + tvm::runtime::Registry::Get("tvm.graph_executor.create"); runtime::Module mod = - (*graph_runtime)(json, module, cpu_dev_ty, cpu_dev_id, gpu_dev_ty, gpu_dev_id); + (*graph_executor)(json, module, cpu_dev_ty, cpu_dev_id, gpu_dev_ty, gpu_dev_id); // test FFI for module. 
auto test_ffi = PackedFunc([](TVMArgs args, TVMRetValue* rv) { diff --git a/tests/cpp/relay_build_module_test.cc b/tests/cpp/relay_build_module_test.cc index 74e5289c5093b..b7b5abfd697d2 100644 --- a/tests/cpp/relay_build_module_test.cc +++ b/tests/cpp/relay_build_module_test.cc @@ -124,7 +124,7 @@ TEST(Relay, BuildModule) { tvm::runtime::Module mod = mod_f(); // run auto dev = A->device; - auto pfr = tvm::runtime::Registry::Get("tvm.graph_runtime.create"); + auto pfr = tvm::runtime::Registry::Get("tvm.graph_executor.create"); ICHECK(mod.defined()) << "Module must be defined"; tvm::runtime::Module run_mod = (*pfr)(json, mod, (int)dev.device_type, (int)dev.device_id); auto set_input_f = run_mod.GetFunction("set_input_zero_copy", false); diff --git a/tests/micro/test_runtime_micro_on_arm.py b/tests/micro/test_runtime_micro_on_arm.py index 45ca8e74323c5..7d19d9510062b 100644 --- a/tests/micro/test_runtime_micro_on_arm.py +++ b/tests/micro/test_runtime_micro_on_arm.py @@ -19,7 +19,7 @@ import numpy as np import tvm from tvm import te -from tvm.contrib import graph_runtime, utils +from tvm.contrib import graph_executor, utils from tvm import relay import tvm.micro as micro from tvm.micro import create_micro_mod @@ -36,7 +36,7 @@ def relay_micro_build(func, dev_config, params=None): - """Create a graph runtime module with a micro device context from a Relay function. + """Create a graph executor module with a micro device context from a Relay function. Parameters ---------- @@ -52,7 +52,7 @@ def relay_micro_build(func, dev_config, params=None): Return ------ mod : tvm.runtime.Module - graph runtime module for the target device + graph executor module for the target device """ with tvm.transform.PassContext( disabled_pass={"FuseOps"}, config={"tir.disable_vectorize": True} @@ -60,7 +60,7 @@ def relay_micro_build(func, dev_config, params=None): graph, c_mod, params = relay.build(func, target=TARGET, params=params) micro_mod = micro.create_micro_mod(c_mod, dev_config) ctx = tvm.micro_dev(0) - mod = graph_runtime.create(graph, micro_mod, ctx) + mod = graph_executor.create(graph, micro_mod, ctx) mod.set_input(**params) return mod @@ -171,8 +171,8 @@ def test_workspace_add(): tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 2.0) -def test_graph_runtime(): - """Test a program which uses the graph runtime.""" +def test_graph_executor(): + """Test a program which uses the graph executor.""" if not tvm.runtime.enabled("micro_dev"): return shape = (1024,) @@ -347,9 +347,9 @@ def test_inactive_session_use(): print() print("finished workspace add test") input("[press enter to continue]") - test_graph_runtime() + test_graph_executor() print() - print("finished graph runtime test") + print("finished graph executor test") input("[press enter to continue]") test_conv2d() print() diff --git a/tests/micro/zephyr/test_zephyr.py b/tests/micro/zephyr/test_zephyr.py index 003cd54bba90a..b4731f16d99f5 100644 --- a/tests/micro/zephyr/test_zephyr.py +++ b/tests/micro/zephyr/test_zephyr.py @@ -198,7 +198,7 @@ def test_relay(platform, west_cmd): graph, mod, params = tvm.relay.build(func, target=target) with _make_session(model, target, zephyr_board, west_cmd, mod) as session: - graph_mod = tvm.micro.create_local_graph_runtime( + graph_mod = tvm.micro.create_local_graph_executor( graph, session.get_system_lib(), session.device ) graph_mod.set_input(**params) @@ -239,7 +239,7 @@ def test_onnx(platform, west_cmd): graph = lowered.get_json() with _make_session(model, target, zephyr_board, west_cmd, lowered.lib) as 
session: - graph_mod = tvm.micro.create_local_graph_runtime( + graph_mod = tvm.micro.create_local_graph_executor( graph, session.get_system_lib(), session.device ) @@ -318,7 +318,7 @@ def check_result(relay_mod, model, zephyr_board, west_cmd, map_inputs, out_shape graph, mod, params = tvm.relay.build(relay_mod, target=target) with _make_session(model, target, zephyr_board, west_cmd, mod) as session: - rt_mod = tvm.micro.create_local_graph_runtime( + rt_mod = tvm.micro.create_local_graph_executor( graph, session.get_system_lib(), session.device ) rt_mod.set_input(**params) diff --git a/tests/python/contrib/test_arm_compute_lib/infrastructure.py b/tests/python/contrib/test_arm_compute_lib/infrastructure.py index 9a9bf69958f5c..35f345cea78a1 100644 --- a/tests/python/contrib/test_arm_compute_lib/infrastructure.py +++ b/tests/python/contrib/test_arm_compute_lib/infrastructure.py @@ -24,7 +24,7 @@ import tvm from tvm import relay from tvm import rpc -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm.relay.op.contrib import arm_compute_lib from tvm.contrib import utils from tvm.autotvm.measure import request_remote @@ -214,7 +214,7 @@ def build_and_run( raise Exception(err_msg) lib = update_lib(lib, device.device, device.cross_compile) - gen_module = graph_runtime.GraphModule(lib["default"](device.device.cpu(0))) + gen_module = graph_executor.GraphModule(lib["default"](device.device.cpu(0))) gen_module.set_input(**inputs) out = [] for _ in range(no_runs): diff --git a/tests/python/contrib/test_bnns/infrastructure.py b/tests/python/contrib/test_bnns/infrastructure.py index 0107de54a04f8..d046ee9ad0dde 100644 --- a/tests/python/contrib/test_bnns/infrastructure.py +++ b/tests/python/contrib/test_bnns/infrastructure.py @@ -25,7 +25,7 @@ import tvm from tvm import relay from tvm import rpc -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm.relay.op.contrib.bnns import partition_for_bnns from tvm.contrib import utils from tvm.autotvm.measure import request_remote @@ -171,7 +171,7 @@ def build_and_run( raise Exception(err_msg) lib = update_lib(lib, device.device, device.cross_compile) - gen_module = graph_runtime.GraphModule(lib["default"](device.device.cpu(0))) + gen_module = graph_executor.GraphModule(lib["default"](device.device.cpu(0))) gen_module.set_input(**inputs) out = [] for _ in range(no_runs): diff --git a/tests/python/contrib/test_bnns/test_onnx_topologies.py b/tests/python/contrib/test_bnns/test_onnx_topologies.py index 7c8dfa6b5e9a8..25c4bc4833333 100644 --- a/tests/python/contrib/test_bnns/test_onnx_topologies.py +++ b/tests/python/contrib/test_bnns/test_onnx_topologies.py @@ -21,7 +21,7 @@ import tvm from tvm import relay from tvm.relay import transform -from tvm.contrib import utils, graph_runtime +from tvm.contrib import utils, graph_executor from tvm.contrib.download import download_testdata from tvm.relay.op.contrib.bnns import partition_for_bnns @@ -118,7 +118,7 @@ def run(mod, target, simplify=True, with_bnns=False): dev = tvm.cpu(0) loaded_lib = tvm.runtime.load_module(path_dso) - module = graph_runtime.GraphModule(loaded_lib["default"](dev)) + module = graph_executor.GraphModule(loaded_lib["default"](dev)) module.run() return module.get_output(0).asnumpy() diff --git a/tests/python/contrib/test_coreml_codegen.py b/tests/python/contrib/test_coreml_codegen.py index f8baf9f3530d5..b93c489fdac6d 100644 --- a/tests/python/contrib/test_coreml_codegen.py +++ b/tests/python/contrib/test_coreml_codegen.py @@ -105,7 
+105,7 @@ def test_compile_and_run(): with relay.build_config(opt_level=3): lib = relay.build(_create_graph_annotated(), target=target) - m = tvm.contrib.graph_runtime.GraphModule(lib["default"](dev)) + m = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) shape = (10, 10) x_data = np.random.rand(*shape).astype("float32") diff --git a/tests/python/contrib/test_ethosn/infrastructure.py b/tests/python/contrib/test_ethosn/infrastructure.py index 791ef4a2a1059..59021cf86211a 100644 --- a/tests/python/contrib/test_ethosn/infrastructure.py +++ b/tests/python/contrib/test_ethosn/infrastructure.py @@ -20,7 +20,7 @@ from __future__ import absolute_import, print_function import tvm from tvm import relay -from tvm.contrib import utils, graph_runtime, download +from tvm.contrib import utils, graph_executor, download from hashlib import md5 from itertools import zip_longest, combinations import numpy as np @@ -211,7 +211,7 @@ def run(lib, inputs, outputs, npu=True): lib_path = temp.relpath(lib_name) lib.export_library(lib_path) lib = tvm.runtime.load_module(lib_path) - module = graph_runtime.GraphModule(lib["default"](tvm.cpu())) + module = graph_executor.GraphModule(lib["default"](tvm.cpu())) module.set_input(**inputs) module.run() out = [module.get_output(i) for i in range(outputs)] diff --git a/tests/python/contrib/test_tensorrt.py b/tests/python/contrib/test_tensorrt.py index 0e25ca24b2a60..2bef7be65938f 100644 --- a/tests/python/contrib/test_tensorrt.py +++ b/tests/python/contrib/test_tensorrt.py @@ -24,7 +24,7 @@ from tvm import relay, runtime from tvm.relay.op.contrib import tensorrt -from tvm.contrib import graph_runtime, utils +from tvm.contrib import graph_executor, utils from tvm.runtime.vm import VirtualMachine from tvm.relay import Any, GlobalVar, transform from tvm.relay.expr_functor import ExprVisitor @@ -252,7 +252,7 @@ def test_tensorrt_not_compatible(): results = exec.evaluate()(x_data) -def test_tensorrt_serialize_graph_runtime(): +def test_tensorrt_serialize_graph_executor(): if skip_codegen_test(): return import mxnet as mx @@ -273,7 +273,7 @@ def compile_graph(mod, params): return graph, lib, params def run_graph(graph, lib, params): - mod_ = graph_runtime.create(graph, lib, device=tvm.gpu(0)) + mod_ = graph_executor.create(graph, lib, device=tvm.gpu(0)) mod_.load_params(params) mod_.run(data=i_data) res = mod_.get_output(0) @@ -296,7 +296,7 @@ def load_graph(): lib = tvm.runtime.load_module(tmpdir.relpath("compiled.so")) return graph, lib, params - # Test serialization with graph runtime + # Test serialization with graph executor graph, lib, graph_params = compile_graph(mod, params) save_graph(graph, lib, graph_params) loaded_graph, loaded_lib, loaded_params = load_graph() diff --git a/tests/python/contrib/test_vitis_ai/infrastructure.py b/tests/python/contrib/test_vitis_ai/infrastructure.py index acc538822c3cd..501ee255c1435 100644 --- a/tests/python/contrib/test_vitis_ai/infrastructure.py +++ b/tests/python/contrib/test_vitis_ai/infrastructure.py @@ -34,7 +34,7 @@ from tvm.relay.op.contrib.vitis_ai import annotation from tvm.relay.build_module import bind_params_by_name from tvm.contrib.target import vitis_ai -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm.contrib import utils @@ -154,7 +154,7 @@ def verify_result( lib = build_module(mod, target, params=params, dpu_target=dpu_target, tvm_ops=tvm_ops) lib = update_lib(lib) - rt_mod = graph_runtime.GraphModule(lib["default"](tvm.cpu())) + rt_mod = 
graph_executor.GraphModule(lib["default"](tvm.cpu())) for name, data in map_inputs.items(): rt_mod.set_input(name, data) diff --git a/tests/python/frontend/caffe/test_forward.py b/tests/python/frontend/caffe/test_forward.py index 2e8807564239b..d0f87fcc21c73 100644 --- a/tests/python/frontend/caffe/test_forward.py +++ b/tests/python/frontend/caffe/test_forward.py @@ -36,7 +36,7 @@ import tvm from tvm import relay -from tvm.contrib import utils, graph_runtime +from tvm.contrib import utils, graph_executor from tvm.contrib.download import download_testdata CURRENT_DIR = os.path.join(os.path.expanduser("~"), ".tvm_test_data", "caffe_test") @@ -205,7 +205,7 @@ def _run_tvm(data, proto_file, blob_file): with tvm.transform.PassContext(opt_level=3): lib = relay.build(mod, target=target, target_host=target_host, params=params) dtype = "float32" - m = graph_runtime.GraphModule(lib["default"](dev)) + m = graph_executor.GraphModule(lib["default"](dev)) if isinstance(data, (tuple, list)): for idx, d in enumerate(data): m.set_input("data" + str(idx), tvm.nd.array(d.astype(dtype))) diff --git a/tests/python/frontend/caffe2/test_forward.py b/tests/python/frontend/caffe2/test_forward.py index 879613f6bd08b..1081b087c4689 100644 --- a/tests/python/frontend/caffe2/test_forward.py +++ b/tests/python/frontend/caffe2/test_forward.py @@ -17,7 +17,7 @@ import numpy as np import tvm from tvm import te -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm import relay from model_zoo import c2_squeezenet, c2_resnet50, c2_vgg19 from caffe2.python import workspace, core @@ -42,7 +42,7 @@ def get_tvm_output(model, input_data, target, device, output_shape, output_dtype with tvm.transform.PassContext(opt_level=3): lib = relay.build(mod, target, params=params) - m = graph_runtime.GraphModule(lib["default"](device)) + m = graph_executor.GraphModule(lib["default"](device)) # set inputs m.set_input(input_names, tvm.nd.array(input_data.astype(input_data.dtype))) diff --git a/tests/python/frontend/coreml/test_forward.py b/tests/python/frontend/coreml/test_forward.py index 2c1295b28381b..c227c3955c5be 100644 --- a/tests/python/frontend/coreml/test_forward.py +++ b/tests/python/frontend/coreml/test_forward.py @@ -21,7 +21,7 @@ import tvm from tvm import te -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm import topi import tvm.topi.testing from tvm import relay @@ -37,7 +37,7 @@ def get_tvm_output( ): with tvm.transform.PassContext(opt_level=3): lib = relay.build(func, target, params=params) - m = graph_runtime.GraphModule(lib["default"](device)) + m = graph_executor.GraphModule(lib["default"](device)) # set inputs m.set_input(input_name, tvm.nd.array(x.astype(dtype))) m.run() @@ -88,9 +88,9 @@ def run_tvm_graph( with tvm.transform.PassContext(opt_level=3): lib = relay.build(mod, target, params=params) - from tvm.contrib import graph_runtime + from tvm.contrib import graph_executor - m = graph_runtime.GraphModule(lib["default"](device)) + m = graph_executor.GraphModule(lib["default"](device)) # set inputs if isinstance(input_data, list): for i, e in enumerate(input_name): diff --git a/tests/python/frontend/darknet/test_forward.py b/tests/python/frontend/darknet/test_forward.py index 72457bf5cd469..3bb8e93d3d221 100644 --- a/tests/python/frontend/darknet/test_forward.py +++ b/tests/python/frontend/darknet/test_forward.py @@ -24,7 +24,7 @@ import numpy as np import tvm from tvm import te -from tvm.contrib import graph_runtime +from tvm.contrib import 
graph_executor from tvm.contrib.download import download_testdata download_testdata.__test__ = False @@ -80,7 +80,7 @@ def _get_tvm_output(net, data, build_dtype="float32", states=None): # Execute on TVM dev = tvm.cpu(0) - m = graph_runtime.GraphModule(lib["default"](dev)) + m = graph_executor.GraphModule(lib["default"](dev)) # set inputs m.set_input("data", tvm.nd.array(data.astype(dtype))) if states: diff --git a/tests/python/frontend/keras/test_forward.py b/tests/python/frontend/keras/test_forward.py index bdd7d7b73d53e..c7f734b891ddb 100644 --- a/tests/python/frontend/keras/test_forward.py +++ b/tests/python/frontend/keras/test_forward.py @@ -18,7 +18,7 @@ import tvm from tvm import te from tvm import relay -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor import keras import tvm.testing @@ -89,7 +89,7 @@ def get_tvm_output(xs, target, dev, dtype="float32"): mod, params = relay.frontend.from_keras(keras_model, shape_dict, layout=layout) with tvm.transform.PassContext(opt_level=2): lib = relay.build(mod, target, params=params) - m = graph_runtime.GraphModule(lib["default"](dev)) + m = graph_executor.GraphModule(lib["default"](dev)) for name, x in zip(keras_model.input_names, xs): m.set_input(name, tvm.nd.array(x.astype(dtype))) m.run() diff --git a/tests/python/frontend/mxnet/test_forward.py b/tests/python/frontend/mxnet/test_forward.py index 19b31d6fd3ed4..c4e8e804b15ac 100644 --- a/tests/python/frontend/mxnet/test_forward.py +++ b/tests/python/frontend/mxnet/test_forward.py @@ -19,7 +19,7 @@ import tvm from tvm import te -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm import relay import mxnet as mx @@ -78,7 +78,7 @@ def get_tvm_output(symbol, x, args, auxs, target, dev, dtype="float32"): ) with tvm.transform.PassContext(opt_level=3): lib = relay.build(mod, target, params=params) - m = graph_runtime.GraphModule(lib["default"](dev)) + m = graph_executor.GraphModule(lib["default"](dev)) # set inputs m.set_input("data", tvm.nd.array(x.astype(dtype))) m.run() @@ -802,7 +802,7 @@ def verify(val, shape, dtype): mx_sym = mx.sym.full(shape, val, dtype=dtype) mod, _ = relay.frontend.from_mxnet(mx_sym, {}) for target, dev in tvm.testing.enabled_targets(): - # Skip testing graph runtime because this op will be optimized out + # Skip testing graph executor because this op will be optimized out # by constant folding. 
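        # The debug interpreter evaluates the expression directly, so the
        # op is still exercised there.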
for kind in ["debug"]: intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) @@ -994,7 +994,7 @@ def verify( mod, params = relay.frontend.from_mxnet(mx_sym, shape=shape_dict, arg_params=mx_params) for target, dev in tvm.testing.enabled_targets(): - # only test graph runtime because debug runtime is too slow + # only test graph executor because debug runtime is too slow for kind in ["graph"]: intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(**inputs, **params) diff --git a/tests/python/frontend/mxnet/test_qnn_ops_utils.py b/tests/python/frontend/mxnet/test_qnn_ops_utils.py index fa5c5d9715ccb..a200e06ed2d04 100644 --- a/tests/python/frontend/mxnet/test_qnn_ops_utils.py +++ b/tests/python/frontend/mxnet/test_qnn_ops_utils.py @@ -18,7 +18,7 @@ import numpy as np import tvm from tvm import relay -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm.relay.frontend.mxnet_qnn_op_utils import ( dequantize_mxnet_min_max, quantize_mxnet_min_max, @@ -41,7 +41,7 @@ def dequantize_test_driver(in_dtype, quant_args, in_data, verify_output_data): mod = tvm.IRModule.from_expr(mod) with tvm.transform.PassContext(opt_level=3): graph, lib, params = relay.build(mod, "llvm", params=None) - rt_mod = graph_runtime.create(graph, lib, device=tvm.cpu(0)) + rt_mod = graph_executor.create(graph, lib, device=tvm.cpu(0)) rt_mod.set_input(input_data=in_data) rt_mod.set_input(**params) rt_mod.run() @@ -120,7 +120,7 @@ def quantize_test_driver(out_dtype, quant_args, in_data, verify_output_data): mod = tvm.IRModule.from_expr(mod) with tvm.transform.PassContext(opt_level=3): graph, lib, params = relay.build(mod, "llvm", params=None) - rt_mod = graph_runtime.create(graph, lib, device=tvm.cpu(0)) + rt_mod = graph_executor.create(graph, lib, device=tvm.cpu(0)) rt_mod.set_input(input_data=in_data) rt_mod.set_input(**params) rt_mod.run() diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index 772d834bbe273..04b6c94a5f530 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -23,7 +23,7 @@ import tvm.topi.testing import tvm from tvm import relay -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor import scipy import tvm.testing @@ -78,7 +78,7 @@ def get_tvm_output( with tvm.transform.PassContext(opt_level=1): graph, lib, params = relay.build(mod, target, params=params) - m = graph_runtime.create(graph, lib, device) + m = graph_executor.create(graph, lib, device) # set inputs if isinstance(input_data, list): for i, e in enumerate(input_names): diff --git a/tests/python/frontend/pytorch/qnn_test.py b/tests/python/frontend/pytorch/qnn_test.py index 29c69abba542c..5b0b65f7b1281 100644 --- a/tests/python/frontend/pytorch/qnn_test.py +++ b/tests/python/frontend/pytorch/qnn_test.py @@ -49,7 +49,7 @@ def get_tvm_runtime(script_module, input_name, ishape): # also not to make CI too slow lib = relay.build(mod, target="llvm", params=params) - runtime = tvm.contrib.graph_runtime.GraphModule(lib["default"](tvm.cpu(0))) + runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](tvm.cpu(0))) return runtime diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py index 6416043068da3..9ec52987c3549 100644 --- a/tests/python/frontend/pytorch/test_forward.py +++ b/tests/python/frontend/pytorch/test_forward.py @@ -27,7 +27,7 @@ from torch.nn import functional as F 
import tvm from tvm import relay -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm.contrib.nvcc import have_fp16 import tvm.testing from packaging import version as package_version @@ -208,7 +208,7 @@ def verify_model(model_name, input_data=[], custom_convert_map={}, rtol=1e-5, at with tvm.transform.PassContext(opt_level=3): for target, dev in tvm.testing.enabled_targets(): relay_graph, relay_lib, relay_params = relay.build(mod, target=target, params=params) - relay_model = graph_runtime.create(relay_graph, relay_lib, dev) + relay_model = graph_executor.create(relay_graph, relay_lib, dev) relay_model.set_input(**relay_params) for name, inp in compiled_input.items(): relay_model.set_input(name, inp) @@ -3590,7 +3590,7 @@ def test_forward_pretrained_bert_base_uncased(): # -------------- dev = tvm.device(target, 0) - relay_model = graph_runtime.create(relay_graph, relay_lib, dev) + relay_model = graph_executor.create(relay_graph, relay_lib, dev) relay_model.set_input(**relay_params) relay_model.set_input(input_1, tokens_tensor) relay_model.set_input(input_2, segments_tensors) diff --git a/tests/python/frontend/tensorflow/test_bn_dynamic.py b/tests/python/frontend/tensorflow/test_bn_dynamic.py index 81a2db4704e62..4eb0d01ef102e 100644 --- a/tests/python/frontend/tensorflow/test_bn_dynamic.py +++ b/tests/python/frontend/tensorflow/test_bn_dynamic.py @@ -66,9 +66,9 @@ def verify_fused_batch_norm(shape): mod, params = relay.frontend.from_tensorflow(constant_graph, outputs=["output"]) with tvm.transform.PassContext(opt_level=3): graph, lib, params = relay.build(mod, target=device, params=params) - from tvm.contrib import graph_runtime + from tvm.contrib import graph_executor - m = graph_runtime.create(graph, lib, dev) + m = graph_executor.create(graph, lib, dev) m.set_input(**params) m.set_input("input", data) m.run() diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index 4accd16a25672..53f424b922e22 100644 --- a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -110,7 +110,7 @@ def run_tvm_graph( target="llvm", out_names=None, opt_level=3, - mode="graph_runtime", + mode="graph_executor", cuda_layout="NCHW", layout=None, disabled_pass=None, @@ -165,9 +165,9 @@ def run_tvm_graph( else: with tvm.transform.PassContext(opt_level=opt_level, disabled_pass=disabled_pass): graph, lib, params = relay.build(mod, target, target_host, params) - from tvm.contrib import graph_runtime + from tvm.contrib import graph_executor - m = graph_runtime.create(graph, lib, dev) + m = graph_executor.create(graph, lib, dev) # set inputs for e, i in zip(input_node, input_data): if e != "": @@ -207,7 +207,7 @@ def compare_tf_with_tvm( init_global_variables=False, no_gpu=False, opt_level=3, - mode="graph_runtime", + mode="graph_executor", cuda_layout="NCHW", add_shapes_to_graph_def=True, targets=None, @@ -3856,10 +3856,10 @@ def _get_tvm_graph_module(graph_def): target = "llvm" with tvm.transform.PassContext(opt_level=0): graph, lib, params = relay.build(mod, target, params=params) - from tvm.contrib import graph_runtime + from tvm.contrib import graph_executor dev = tvm.cpu(0) - return params, graph_runtime.create(graph, lib, dev) + return params, graph_executor.create(graph, lib, dev) def _do_tvm_sample(model, data, in_states, params, num_samples): """Sampled from the model""" @@ -4073,7 +4073,7 @@ def test_forward_floor(): def test_forward_relu(): ishape = (1, 3, 10, 
10)
    inp_array = np.random.uniform(-5, 5, size=ishape).astype(np.float32)
-    for mode in ["graph_runtime", "vm"]:
+    for mode in ["graph_executor", "vm"]:
        with tf.Graph().as_default():
            in1 = tf.placeholder(shape=inp_array.shape, dtype=inp_array.dtype)
            tf.nn.relu(in1)
@@ -4083,7 +4083,7 @@ def test_forward_leaky_relu():
    ishape = (1, 3, 10, 10)
    inp_array = np.random.uniform(-5, 5, size=ishape).astype(np.float32)
-    for mode in ["graph_runtime", "vm"]:
+    for mode in ["graph_executor", "vm"]:
        with tf.Graph().as_default():
            in1 = tf.placeholder(shape=inp_array.shape, dtype=inp_array.dtype)
            tf.nn.leaky_relu(in1, alpha=0.4)
diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py
index 05e0f076e0793..b02d246d1ae5d 100644
--- a/tests/python/frontend/tflite/test_forward.py
+++ b/tests/python/frontend/tflite/test_forward.py
@@ -160,7 +160,7 @@ def run_tvm_graph(
    num_output=1,
    target="llvm",
    out_names=None,
-    mode="graph_runtime",
+    mode="graph_executor",
 ):
    """ Generic function to compile on relay and execute on tvm """
    # TFLite.Model.Model has changed to TFLite.Model from 1.14 to 2.1
@@ -208,9 +208,9 @@ def run_tvm_graph(
        lib = relay.build(mod, target, params=params)

    dev = tvm.device(target, 0)
-    from tvm.contrib import graph_runtime
+    from tvm.contrib import graph_executor

-    m = graph_runtime.GraphModule(lib["default"](dev))
+    m = graph_executor.GraphModule(lib["default"](dev))
    # set inputs
    for i, e in enumerate(input_node):
        m.set_input(e, tvm.nd.array(input_data[i].astype(input_data[i].dtype)))
@@ -264,7 +264,7 @@ def compare_tflite_with_tvm(
    out_names=None,
    quantized=False,
    input_range=None,
-    mode="graph_runtime",
+    mode="graph_executor",
    experimental_new_converter=False,
 ):
    """Generic function to generate and compare TFLite and TVM output"""
diff --git a/tests/python/nightly/quantization/test_quantization_accuracy.py b/tests/python/nightly/quantization/test_quantization_accuracy.py
index 4ecd5dce46498..57fa49e93a04e 100644
--- a/tests/python/nightly/quantization/test_quantization_accuracy.py
+++ b/tests/python/nightly/quantization/test_quantization_accuracy.py
@@ -98,7 +98,7 @@ def eval_acc(
    with tvm.transform.PassContext(opt_level=3):
        graph, lib, params = relay.build(model, target)
    # create runtime module
-    m = tvm.contrib.graph_runtime.create(graph, lib, device)
+    m = tvm.contrib.graph_executor.create(graph, lib, device)
    m.set_input(**params)

    # setup evaluation metric
diff --git a/tests/python/relay/benchmarking/benchmark_vm.py b/tests/python/relay/benchmarking/benchmark_vm.py
index c4695f971e97f..44ce9be766d24 100644
--- a/tests/python/relay/benchmarking/benchmark_vm.py
+++ b/tests/python/relay/benchmarking/benchmark_vm.py
@@ -19,7 +19,7 @@

 import tvm
 from tvm import te
-from tvm.contrib import graph_runtime
+from tvm.contrib import graph_executor
 from tvm import relay
 from tvm.runtime import container
 from tvm.runtime import vm as vm_rt
@@ -36,25 +36,25 @@ def benchmark_execution(
    dtype="float32",
    model="unknown",
 ):
-    def get_graph_runtime_output(
+    def get_graph_executor_output(
        mod, data, params, target, dev, dtype="float32", number=2, repeat=20
    ):
        with tvm.transform.PassContext(opt_level=3):
            lib = relay.build(mod, target, params=params)

-        m = graph_runtime.GraphModule(lib["default"](dev))
+        m = graph_executor.GraphModule(lib["default"](dev))
        # set inputs
        m.set_input("data", data)
        m.run()
        out = m.get_output(0, tvm.nd.empty(out_shape, dtype))

        if measure:
-            print("Evaluate graph runtime inference cost of {} on "
"{}".format(model, repr(dev))) + print("Evaluate graph executor inference cost of {} on " "{}".format(model, repr(dev))) ftimer = m.module.time_evaluator("run", dev, number=1, repeat=20) # Measure in millisecond. prof_res = np.array(ftimer().results) * 1000 print( - "Mean graph runtime inference time (std dev): %.2f ms (%.2f ms)" + "Mean graph executor inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res)) ) @@ -82,7 +82,7 @@ def get_vm_output(mod, data, params, target, dev, dtype="float32", number=2, rep data = np.random.uniform(size=data_shape).astype(dtype) for target, dev in testing.enabled_targets(): - tvm_out = get_graph_runtime_output( + tvm_out = get_graph_executor_output( mod, tvm.nd.array(data.astype(dtype)), params, target, dev, dtype ) vm_out = get_vm_output(mod, tvm.nd.array(data.astype(dtype)), params, target, dev, dtype) diff --git a/tests/python/relay/test_auto_scheduler_layout_rewrite_networks.py b/tests/python/relay/test_auto_scheduler_layout_rewrite_networks.py index 95f1177da024d..8466fc1700b02 100644 --- a/tests/python/relay/test_auto_scheduler_layout_rewrite_networks.py +++ b/tests/python/relay/test_auto_scheduler_layout_rewrite_networks.py @@ -21,7 +21,7 @@ import tvm from tvm import relay, auto_scheduler -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor import tvm.testing @@ -169,7 +169,7 @@ def tune_and_check(mod, data, weight): def get_output(data, lib): dev = tvm.cpu() - module = graph_runtime.GraphModule(lib["default"](dev)) + module = graph_executor.GraphModule(lib["default"](dev)) module.set_input("data", data) module.run() diff --git a/tests/python/relay/test_auto_scheduler_tuning.py b/tests/python/relay/test_auto_scheduler_tuning.py index af930e7f7f19a..1250543a13aee 100644 --- a/tests/python/relay/test_auto_scheduler_tuning.py +++ b/tests/python/relay/test_auto_scheduler_tuning.py @@ -20,7 +20,7 @@ import numpy as np from tvm import auto_scheduler, relay -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor import tvm.testing from test_auto_scheduler_task_extraction import get_network @@ -70,7 +70,7 @@ def tune_network(network, target): # Check the correctness def get_output(data, lib): dev = tvm.gpu() - module = graph_runtime.GraphModule(lib["default"](dev)) + module = graph_executor.GraphModule(lib["default"](dev)) module.set_input("data", data) module.run() return module.get_output(0).asnumpy() diff --git a/tests/python/relay/test_backend_graph_runtime.py b/tests/python/relay/test_backend_graph_executor.py similarity index 97% rename from tests/python/relay/test_backend_graph_runtime.py rename to tests/python/relay/test_backend_graph_executor.py index 6f75c43579a72..b9553d79c3b67 100644 --- a/tests/python/relay/test_backend_graph_runtime.py +++ b/tests/python/relay/test_backend_graph_executor.py @@ -18,7 +18,7 @@ import tvm from tvm import relay -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm.relay.op import add import tvm.testing @@ -102,7 +102,7 @@ def test_with_params(): y_data = np.random.rand(1, 5).astype("float32") params = {"y": y_data} graph, lib, params = relay.build(tvm.IRModule.from_expr(func), "llvm", params=params) - mod = graph_runtime.create(graph, lib, device=tvm.cpu(0)) + mod = graph_executor.create(graph, lib, device=tvm.cpu(0)) mod.set_input(**params) mod.set_input(x=x_data) mod.run() @@ -174,7 +174,7 @@ def unit_numpy(X, W): for target, dev in tvm.testing.enabled_targets(): with 
tvm.transform.PassContext(opt_level=2):
            graph, lib, params = relay.build(tvm.IRModule.from_expr(z), target)
-            m = graph_runtime.create(graph, lib, dev)
+            m = graph_executor.create(graph, lib, dev)
            m.set_input("X", tvm.nd.array(x.astype(dtype)))
            m.set_input("y", tvm.nd.array(y.astype(dtype)))
            m.set_input(**params)
@@ -194,7 +194,7 @@ def test_compile_nested_tuples():
    func = relay.Function([x], out)

    graph, lib, _ = relay.build(tvm.IRModule.from_expr(func), "llvm")
-    mod = graph_runtime.create(graph, lib, device=tvm.cpu(0))
+    mod = graph_executor.create(graph, lib, device=tvm.cpu(0))

    x_data = np.random.uniform(size=(10,)).astype(np.float32)
    mod.set_input(x=x_data)
diff --git a/tests/python/relay/test_cpp_build_module.py b/tests/python/relay/test_cpp_build_module.py
index 6f13533f93185..7d2209a34835b 100644
--- a/tests/python/relay/test_cpp_build_module.py
+++ b/tests/python/relay/test_cpp_build_module.py
@@ -48,7 +48,7 @@ def test_basic_build():
    assert mod["main"] == func_in_mod, "relay.build changed module in-place"

    # test
-    rt = tvm.contrib.graph_runtime.GraphModule(lib["default"](dev))
+    rt = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
    rt.set_input("a", A)
    rt.run()
    out = rt.get_output(0)
@@ -85,7 +85,7 @@ def test_fp16_build():
        g_json, mmod, params = relay.build(func, "cuda", params=params)

    # test
-    rt = tvm.contrib.graph_runtime.create(g_json, mmod, dev)
+    rt = tvm.contrib.graph_executor.create(g_json, mmod, dev)
    rt.load_params(runtime.save_param_dict(params))
    rt.run()
    out = rt.get_output(0)
@@ -114,7 +114,7 @@ def test_fp16_conversion(target, dev):
            g_json, mmod, params = relay.build(tvm.IRModule.from_expr(func), target)

        # test
-        rt = tvm.contrib.graph_runtime.create(g_json, mmod, dev)
+        rt = tvm.contrib.graph_executor.create(g_json, mmod, dev)
        rt.set_input("x", X)
        rt.run()
        out = rt.get_output(0)
diff --git a/tests/python/relay/test_external_codegen.py b/tests/python/relay/test_external_codegen.py
index 9241f8e827459..9f6d88e47f0b5 100644
--- a/tests/python/relay/test_external_codegen.py
+++ b/tests/python/relay/test_external_codegen.py
@@ -63,11 +63,11 @@ def check_vm_result():
        out = vm.run(**map_inputs)
        tvm.testing.assert_allclose(out.asnumpy(), result, rtol=tol, atol=tol)

-    def check_graph_runtime_result():
+    def check_graph_executor_result():
        with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]):
            json, lib, _ = relay.build(mod, target=target)
        lib = update_lib(lib)
-        rt_mod = tvm.contrib.graph_runtime.create(json, lib, device)
+        rt_mod = tvm.contrib.graph_executor.create(json, lib, device)

        for name, data in map_inputs.items():
            rt_mod.set_input(name, data)
@@ -78,7 +78,7 @@ def check_graph_runtime_result():
        tvm.testing.assert_allclose(out.asnumpy(), result, rtol=tol, atol=tol)

    check_vm_result()
-    check_graph_runtime_result()
+    check_graph_executor_result()


def set_external_func_attr(func, compiler, ext_symbol):
@@ -336,7 +336,7 @@ def test_extern_dnnl_const():
def test_load_params_with_constants_in_ext_codegen():
    # After binding params and partitioning graph_module.get_params()
-    # might contain parameters that are not an graph runtime input but
+    # might contain parameters that are not a graph executor input but
    # for example constants in external function.
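    # (In the test below, the param "y" is absorbed into the partitioned
    # external function as a constant, so it shows up in
    # graph_module.get_params() even though the executor has no "y" input;
    # load_params() therefore has to tolerate such extra keys.)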
y_in = np.ones((1,)).astype("float32") params = {"y": y_in} @@ -353,7 +353,7 @@ def test_load_params_with_constants_in_ext_codegen(): graph_module = relay.build(mod, target="llvm", params=params) lib = update_lib(graph_module.get_lib()) - rt_mod = tvm.contrib.graph_runtime.create(graph_module.get_json(), lib, tvm.cpu(0)) + rt_mod = tvm.contrib.graph_executor.create(graph_module.get_json(), lib, tvm.cpu(0)) rt_mod.load_params(runtime.save_param_dict(graph_module.get_params())) diff --git a/tests/python/relay/test_json_runtime.py b/tests/python/relay/test_json_runtime.py index 79d1f710cf416..bf5676d096f1b 100644 --- a/tests/python/relay/test_json_runtime.py +++ b/tests/python/relay/test_json_runtime.py @@ -50,7 +50,7 @@ def check_result( compile_engine.get().clear() with tvm.transform.PassContext(opt_level=3): json, lib, param = relay.build(ref_mod, target=target, params=params) - rt_mod = tvm.contrib.graph_runtime.create(json, lib, device) + rt_mod = tvm.contrib.graph_executor.create(json, lib, device) for name, data in map_inputs.items(): rt_mod.set_input(name, data) @@ -70,11 +70,11 @@ def check_vm_result(): out = vm.run(**map_inputs) tvm.testing.assert_allclose(out.asnumpy(), ref_result, rtol=tol, atol=tol) - def check_graph_runtime_result(): + def check_graph_executor_result(): compile_engine.get().clear() with relay.build_config(opt_level=3): json, lib, param = relay.build(mod, target=target, params=params) - rt_mod = tvm.contrib.graph_runtime.create(json, lib, device) + rt_mod = tvm.contrib.graph_executor.create(json, lib, device) for name, data in map_inputs.items(): rt_mod.set_input(name, data) @@ -85,7 +85,7 @@ def check_graph_runtime_result(): tvm.testing.assert_allclose(out.asnumpy(), ref_result, rtol=tol, atol=tol) check_vm_result() - check_graph_runtime_result() + check_graph_executor_result() def test_conv2d(): diff --git a/tests/python/relay/test_op_fast_math.py b/tests/python/relay/test_op_fast_math.py index a82efb7ad2ada..7bcbc6839c4f6 100644 --- a/tests/python/relay/test_op_fast_math.py +++ b/tests/python/relay/test_op_fast_math.py @@ -22,7 +22,7 @@ import tvm.relay as relay from tvm import topi from tvm import te -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor def test_fastmath(): @@ -43,7 +43,7 @@ def test_apply(relay_op, name, f_numpy, low, high, step, dtype="float32"): assert lib.get_function(func_name) dev = tvm.cpu(0) - m = graph_runtime.create(graph, lib, dev) + m = graph_executor.create(graph, lib, dev) # Set inputs m.set_input("x", tvm.nd.array(a_np, dev)) m.set_input(**params) diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py index b9e7b075a9985..c5843758c3d2c 100644 --- a/tests/python/relay/test_op_level2.py +++ b/tests/python/relay/test_op_level2.py @@ -405,7 +405,7 @@ def run_test_conv2d_cuda( dev = tvm.device(target, 0) params = {"w": tvm.nd.array(kernel)} graph, lib, params = relay.build_module.build(mod, target=target, params=params) - module = tvm.contrib.graph_runtime.create(graph, lib, dev) + module = tvm.contrib.graph_executor.create(graph, lib, dev) module.set_input("x", tvm.nd.array(data)) module.set_input(**params) module.run() @@ -668,7 +668,7 @@ def run_test_conv3d_cuda( dev = tvm.device(target, 0) params = {"w": tvm.nd.array(kernel)} graph, lib, params = relay.build_module.build(mod, target=target, params=params) - module = tvm.contrib.graph_runtime.create(graph, lib, dev) + module = tvm.contrib.graph_executor.create(graph, lib, dev) module.set_input("x", tvm.nd.array(data)) 
module.set_input(**params) module.run() diff --git a/tests/python/relay/test_op_qnn_concatenate.py b/tests/python/relay/test_op_qnn_concatenate.py index a9207f67fc68d..453875301af9c 100644 --- a/tests/python/relay/test_op_qnn_concatenate.py +++ b/tests/python/relay/test_op_qnn_concatenate.py @@ -19,7 +19,7 @@ from tvm import te import numpy as np from tvm import relay -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor import tvm.topi.testing diff --git a/tests/python/relay/test_op_qnn_conv2_transpose.py b/tests/python/relay/test_op_qnn_conv2_transpose.py index 93776d265a49b..e4e02279efd6a 100644 --- a/tests/python/relay/test_op_qnn_conv2_transpose.py +++ b/tests/python/relay/test_op_qnn_conv2_transpose.py @@ -21,7 +21,7 @@ from tvm import relay from tvm.relay import transform from tvm.relay.testing import run_infer_type -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm.relay.testing.temp_op_attr import TempOpAttr @@ -191,7 +191,7 @@ def get_output(func, golden_inputs): golden_data, golden_weight = golden_inputs params = {"kernel": golden_weight} graph, lib, params = relay.build(func, "llvm", params=params) - mod = graph_runtime.create(graph, lib, device=tvm.cpu(0)) + mod = graph_executor.create(graph, lib, device=tvm.cpu(0)) mod.set_input("data", golden_data) mod.set_input(**params) mod.run() diff --git a/tests/python/relay/test_op_qnn_conv2d.py b/tests/python/relay/test_op_qnn_conv2d.py index 87b1f732e2bdd..928450312147b 100644 --- a/tests/python/relay/test_op_qnn_conv2d.py +++ b/tests/python/relay/test_op_qnn_conv2d.py @@ -21,7 +21,7 @@ from tvm import relay from tvm.relay import transform from tvm.relay.testing import run_infer_type -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm.relay.testing.temp_op_attr import TempOpAttr # We use llvm target for testing functionality. 
`llvm` points to an older Intel @@ -198,7 +198,7 @@ def get_output(func, golden_inputs): golden_data, golden_weight = golden_inputs params = {"kernel": golden_weight} graph, lib, params = relay.build(func, "llvm", params=params) - mod = graph_runtime.create(graph, lib, device=tvm.cpu(0)) + mod = graph_executor.create(graph, lib, device=tvm.cpu(0)) mod.set_input("data", golden_data) mod.set_input(**params) mod.run() @@ -722,7 +722,7 @@ def test_tflite_large_irregular(): with tvm.transform.PassContext(opt_level=2): params = {"kernel": golden_weight} graph, lib, params = relay.build(qnn_func, "llvm", params=params) - mod = graph_runtime.create(graph, lib, device=tvm.cpu(0)) + mod = graph_executor.create(graph, lib, device=tvm.cpu(0)) mod.set_input("data", golden_data) mod.set_input(**params) mod.run() @@ -767,7 +767,7 @@ def test_tflite_output_multiplier_greater_than_one(): with tvm.transform.PassContext(opt_level=2): params = {"kernel": golden_weight} graph, lib, params = relay.build(qnn_func, "llvm", params=params) - mod = graph_runtime.create(graph, lib, device=tvm.cpu(0)) + mod = graph_executor.create(graph, lib, device=tvm.cpu(0)) mod.set_input("data", golden_data) mod.set_input(**params) mod.run() @@ -830,7 +830,7 @@ def test_tflite_anistropic_strides(): with tvm.transform.PassContext(opt_level=2): params = {"kernel": golden_weight} graph, lib, params = relay.build(qnn_func, "llvm", params=params) - mod = graph_runtime.create(graph, lib, device=tvm.cpu(0)) + mod = graph_executor.create(graph, lib, device=tvm.cpu(0)) mod.set_input("data", golden_data) mod.set_input(**params) mod.run() diff --git a/tests/python/relay/test_op_qnn_dense.py b/tests/python/relay/test_op_qnn_dense.py index ef74ce3edcf61..c47ac6b35ec7c 100644 --- a/tests/python/relay/test_op_qnn_dense.py +++ b/tests/python/relay/test_op_qnn_dense.py @@ -19,7 +19,7 @@ from tvm import te import numpy as np from tvm import relay -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm.relay.testing.temp_op_attr import TempOpAttr @@ -211,7 +211,7 @@ def qnn_dense_driver(test_configuration): mod = relay.qnn.transform.CanonicalizeOps()(mod) with tvm.transform.PassContext(opt_level=2): graph, lib, params = relay.build(mod, "llvm", params=None) - mod = graph_runtime.create(graph, lib, device=tvm.cpu(0)) + mod = graph_executor.create(graph, lib, device=tvm.cpu(0)) mod.set_input(quantized_data_name, test_configuration[quantized_data_name]) mod.set_input(quantized_kernel_name, test_configuration[quantized_kernel_name]) if test_configuration[bias_name] is not None: diff --git a/tests/python/relay/test_op_qnn_dequantize.py b/tests/python/relay/test_op_qnn_dequantize.py index 85f97a9c073c5..ab398bbc1316a 100644 --- a/tests/python/relay/test_op_qnn_dequantize.py +++ b/tests/python/relay/test_op_qnn_dequantize.py @@ -19,7 +19,7 @@ from tvm import te import numpy as np from tvm import relay -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm.relay.testing import run_infer_type @@ -35,7 +35,7 @@ def dequantize_test_driver(in_dtype, quant_args, in_data, verify_output_data, ax mod = tvm.IRModule.from_expr(mod) with tvm.transform.PassContext(opt_level=3): graph, lib, params = relay.build(mod, "llvm", params=None) - rt_mod = graph_runtime.create(graph, lib, device=tvm.cpu(0)) + rt_mod = graph_executor.create(graph, lib, device=tvm.cpu(0)) rt_mod.set_input(input_data=in_data) rt_mod.set_input(**params) rt_mod.run() @@ -140,7 +140,7 @@ def test_dynamic_dequantize(): with 
relay.build_config(opt_level=3, disabled_pass=["AlterOpLayout"]): lib = relay.build(mod, target=target) - module = graph_runtime.GraphModule(lib["default"](dev)) + module = graph_executor.GraphModule(lib["default"](dev)) module.set_input(**{"x": data, "scale": scale, "zp": zp}) module.run() diff --git a/tests/python/relay/test_op_qnn_mul.py b/tests/python/relay/test_op_qnn_mul.py index 10721b5732348..8ff3ab5c3df2f 100644 --- a/tests/python/relay/test_op_qnn_mul.py +++ b/tests/python/relay/test_op_qnn_mul.py @@ -19,7 +19,7 @@ from tvm import te import numpy as np from tvm import relay -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor import tvm.topi.testing # "unquantize" a quantized tensor diff --git a/tests/python/relay/test_op_qnn_quantize.py b/tests/python/relay/test_op_qnn_quantize.py index e92344f7dcfaa..2ae688ef47843 100644 --- a/tests/python/relay/test_op_qnn_quantize.py +++ b/tests/python/relay/test_op_qnn_quantize.py @@ -19,7 +19,7 @@ from tvm import te import numpy as np from tvm import relay -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm.relay.testing import run_infer_type @@ -39,7 +39,7 @@ def quantize_test_driver(in_dtype, quant_args, axis, out_dtype, in_data, verify_ mod = tvm.IRModule.from_expr(mod) with tvm.transform.PassContext(opt_level=3): graph, lib, params = relay.build(mod, "llvm", params=None) - rt_mod = graph_runtime.create(graph, lib, device=tvm.cpu(0)) + rt_mod = graph_executor.create(graph, lib, device=tvm.cpu(0)) rt_mod.set_input(input_data=in_data) rt_mod.set_input(**params) rt_mod.run() @@ -155,7 +155,7 @@ def test_dynamic_quantize(): with relay.build_config(opt_level=3, disabled_pass=["AlterOpLayout"]): lib = relay.build(mod, target=target) - module = graph_runtime.GraphModule(lib["default"](dev)) + module = graph_executor.GraphModule(lib["default"](dev)) module.set_input(**{"x": data, "scale": scale, "zp": zp}) module.run() diff --git a/tests/python/relay/test_op_qnn_requantize.py b/tests/python/relay/test_op_qnn_requantize.py index 5aa3f7f7fdc8a..5e61fad7676d2 100644 --- a/tests/python/relay/test_op_qnn_requantize.py +++ b/tests/python/relay/test_op_qnn_requantize.py @@ -19,7 +19,7 @@ from tvm import te import numpy as np from tvm import relay -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor roundings = ["UPWARD", "TONEAREST"] @@ -28,7 +28,7 @@ def verify(mod, goldens): with tvm.transform.PassContext(opt_level=3): graph, lib, params = relay.build(mod, "llvm", params=None) golden_data, golden_output = goldens - rt_mod = graph_runtime.create(graph, lib, device=tvm.cpu(0)) + rt_mod = graph_executor.create(graph, lib, device=tvm.cpu(0)) rt_mod.set_input("quantized_data", golden_data) rt_mod.set_input(**params) rt_mod.run() diff --git a/tests/python/relay/test_op_qnn_simulated_dequantize.py b/tests/python/relay/test_op_qnn_simulated_dequantize.py index 266e9d73b3cbc..3aecd935b62be 100644 --- a/tests/python/relay/test_op_qnn_simulated_dequantize.py +++ b/tests/python/relay/test_op_qnn_simulated_dequantize.py @@ -19,7 +19,7 @@ from tvm import te import numpy as np from tvm import relay -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm.runtime.vm import VirtualMachine from tvm.topi.nn.qnn import SQNN_DTYPE_TO_CODE @@ -39,7 +39,7 @@ def dequantize_test_driver(in_dtype, quant_args, axis, in_data): mod = tvm.IRModule.from_expr(mod) with tvm.transform.PassContext(opt_level=3): graph, lib, params = relay.build(mod, "llvm", 
params=None) - rt_mod = graph_runtime.create(graph, lib, device=tvm.cpu(0)) + rt_mod = graph_executor.create(graph, lib, device=tvm.cpu(0)) rt_mod.set_input(input_data=in_data) rt_mod.set_input(**params) rt_mod.run() diff --git a/tests/python/relay/test_op_qnn_simulated_quantize.py b/tests/python/relay/test_op_qnn_simulated_quantize.py index 8a15a037d8bae..fd9d13168e01f 100644 --- a/tests/python/relay/test_op_qnn_simulated_quantize.py +++ b/tests/python/relay/test_op_qnn_simulated_quantize.py @@ -19,7 +19,7 @@ from tvm import te import numpy as np from tvm import relay -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm.runtime.vm import VirtualMachine from tvm.topi.nn.qnn import SQNN_DTYPE_TO_CODE @@ -47,7 +47,7 @@ def quantize_test_driver(in_dtype, quant_args, axis, out_dtype, in_data): mod = tvm.IRModule.from_expr(mod) with tvm.transform.PassContext(opt_level=3): graph, lib, params = relay.build(mod, "llvm", params=None) - rt_mod = graph_runtime.create(graph, lib, device=tvm.cpu(0)) + rt_mod = graph_executor.create(graph, lib, device=tvm.cpu(0)) rt_mod.set_input(input_data=in_data) rt_mod.set_input(**params) rt_mod.run() diff --git a/tests/python/relay/test_param_dict.py b/tests/python/relay/test_param_dict.py index 7bc4ab422a64c..2272883fc39c3 100644 --- a/tests/python/relay/test_param_dict.py +++ b/tests/python/relay/test_param_dict.py @@ -24,7 +24,7 @@ from tvm.relay.op import add from tvm import relay from tvm import rpc -from tvm.contrib import utils, graph_runtime +from tvm.contrib import utils, graph_executor def test_save_load(): @@ -60,7 +60,7 @@ def test_bigendian_rpc_param(): if host is None: return - def verify_graph_runtime(remote, target, shape, dtype): + def verify_graph_executor(remote, target, shape, dtype): x = relay.var("x") y = relay.const(1) z = relay.add(x, y) @@ -76,7 +76,7 @@ def verify_graph_runtime(remote, target, shape, dtype): remote.upload(path_dso) lib = remote.load_module("dev_lib.o") dev = remote.cpu(0) - mod = graph_runtime.create(graph, lib, dev) + mod = graph_executor.create(graph, lib, dev) mod.load_params(runtime.save_param_dict(params)) mod.run() out = mod.get_output(0, tvm.nd.empty(shape, dtype=dtype, device=dev)) @@ -86,7 +86,7 @@ def verify_graph_runtime(remote, target, shape, dtype): remote = rpc.connect(host, port) target = "llvm -mtriple=powerpc-linux-gnu" for dtype in ["float32", "float64", "int32", "int8"]: - verify_graph_runtime(remote, target, (10,), dtype) + verify_graph_executor(remote, target, (10,), dtype) if __name__ == "__main__": diff --git a/tests/python/relay/test_pass_annotate_target.py b/tests/python/relay/test_pass_annotate_target.py index 52deb4e89ef97..c756d74ff0bea 100644 --- a/tests/python/relay/test_pass_annotate_target.py +++ b/tests/python/relay/test_pass_annotate_target.py @@ -60,11 +60,11 @@ def check_vm_result(): out = vm.run(**map_inputs) tvm.testing.assert_allclose(out.asnumpy(), result, rtol=tol, atol=tol) - def check_graph_runtime_result(): + def check_graph_executor_result(): with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]): json, lib, param = relay.build(mod, target=target, params=params) lib = update_lib(lib) - rt_mod = tvm.contrib.graph_runtime.create(json, lib, device) + rt_mod = tvm.contrib.graph_executor.create(json, lib, device) for name, data in map_inputs.items(): rt_mod.set_input(name, data) @@ -76,7 +76,7 @@ def check_graph_runtime_result(): tvm.testing.assert_allclose(out.asnumpy(), result, rtol=tol, atol=tol) check_vm_result() - 
check_graph_runtime_result() + check_graph_executor_result() def test_extern_dnnl(): diff --git a/tests/python/relay/test_pass_annotation.py b/tests/python/relay/test_pass_annotation.py index 70f1a0aa52e5e..a9c31f5ccedd3 100644 --- a/tests/python/relay/test_pass_annotation.py +++ b/tests/python/relay/test_pass_annotation.py @@ -20,7 +20,7 @@ import tvm from tvm import relay -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm.relay.expr_functor import ExprMutator from tvm.relay import transform import tvm.testing @@ -31,7 +31,7 @@ def _trace(module, metadata, _): pass # import pdb; pdb.set_trace() -def check_graph_runtime( +def check_graph_executor( target, ref_res, device, func, params, config, opt_level, expected_index=None ): with tvm.transform.PassContext(opt_level=opt_level, config=config): @@ -41,7 +41,7 @@ def check_graph_runtime( if "device_index" in graph_json["attrs"]: device_index = graph_json["attrs"]["device_index"][1] assert device_index == expected_index - mod = graph_runtime.create(graph, lib, contexts) + mod = graph_executor.create(graph, lib, contexts) mod.set_input(**new_params) mod.run() res = mod.get_output(0).asnumpy() @@ -429,7 +429,7 @@ def expected(): check_annotated_graph(annotated_func, expected_func) opt_level = 1 config = {"relay.fallback_device_type": fallback_device.device_type} - check_graph_runtime( + check_graph_executor( target, ref_res, device, annotated_func, params, config, opt_level, expected_index ) opt_level = 2 @@ -465,7 +465,7 @@ def annotated(): check_annotated_graph(annotated_func, expected_func) opt_level = 1 config = {"relay.fallback_device_type": fallback_device.device_type} - check_graph_runtime(target, ref_res, device, annotated_func, params, config, opt_level) + check_graph_executor(target, ref_res, device, annotated_func, params, config, opt_level) opt_level = 2 check_vm_runtime(target, ref_res, device, annotated_func, params, config, opt_level) @@ -506,7 +506,7 @@ def expected(): opt_level = 1 config = {"relay.fallback_device_type": fallback_device.device_type} check_annotated_graph(annotated_func, expected_func) - check_graph_runtime( + check_graph_executor( target, ref_res, device, annotated_func, params, config, opt_level, expected_index ) opt_level = 2 @@ -520,7 +520,7 @@ def test_fallback_all_operators(device, tgt): expected_func = get_func() check_annotated_graph(annotated_func, expected_func) opt_level = 2 - check_graph_runtime(target, ref_res, device, annotated_func, params, {}, opt_level) + check_graph_executor(target, ref_res, device, annotated_func, params, {}, opt_level) check_vm_runtime(target, ref_res, device, annotated_func, params, {}, opt_level) test_fuse_log_add(dev, tgt) @@ -582,7 +582,7 @@ def expected(): opt_level = 0 config = {"relay.fallback_device_type": fallback_device.device_type} - check_graph_runtime( + check_graph_executor( target, ref_res, dev, annotated_func, params, config, opt_level, expected_index ) diff --git a/tests/python/relay/test_pass_legalize.py b/tests/python/relay/test_pass_legalize.py index 0d14f6611db9b..8a37da33a10f4 100644 --- a/tests/python/relay/test_pass_legalize.py +++ b/tests/python/relay/test_pass_legalize.py @@ -20,7 +20,7 @@ from tvm import te from tvm import relay -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm.relay import transform, analysis from tvm.relay.testing.temp_op_attr import TempOpAttr diff --git a/tests/python/relay/test_pass_legalize_tensorcore.py 
b/tests/python/relay/test_pass_legalize_tensorcore.py index 5ecda4ba07a89..f45e390472387 100644 --- a/tests/python/relay/test_pass_legalize_tensorcore.py +++ b/tests/python/relay/test_pass_legalize_tensorcore.py @@ -20,7 +20,7 @@ from tvm import te from tvm import topi from tvm import relay -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm.relay import transform, analysis from tvm.relay.testing.temp_op_attr import TempOpAttr diff --git a/tests/python/relay/test_pass_partition_graph.py b/tests/python/relay/test_pass_partition_graph.py index a5bc3ff2cead6..01a1e48f832a5 100644 --- a/tests/python/relay/test_pass_partition_graph.py +++ b/tests/python/relay/test_pass_partition_graph.py @@ -208,12 +208,12 @@ def check_vm_result(): for out, ref in zip(outs, results): tvm.testing.assert_allclose(out.asnumpy(), ref, rtol=tol, atol=tol) - def check_graph_runtime_result(): + def check_graph_executor_result(): compile_engine.get().clear() with tvm.transform.PassContext(opt_level=3): json, lib, param = relay.build(mod, target=target, params=params) lib = update_lib(lib) - rt_mod = tvm.contrib.graph_runtime.create(json, lib, device) + rt_mod = tvm.contrib.graph_executor.create(json, lib, device) for name, data in map_inputs.items(): rt_mod.set_input(name, data) @@ -229,7 +229,7 @@ def check_graph_runtime_result(): tvm.testing.assert_allclose(out.asnumpy(), results[idx], rtol=tol, atol=tol) check_vm_result() - check_graph_runtime_result() + check_graph_executor_result() def test_multi_node_compiler(): diff --git a/tests/python/relay/test_pass_qnn_legalize.py b/tests/python/relay/test_pass_qnn_legalize.py index 6a5c8f7cd6476..a30cd1e73e3f7 100644 --- a/tests/python/relay/test_pass_qnn_legalize.py +++ b/tests/python/relay/test_pass_qnn_legalize.py @@ -20,7 +20,7 @@ from tvm import te from tvm import relay -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm.relay import transform, analysis from tvm.relay.testing.temp_op_attr import TempOpAttr diff --git a/tests/python/relay/test_simplify_fc_transpose.py b/tests/python/relay/test_simplify_fc_transpose.py index ce93a68c73216..fa5f332e6cd5b 100644 --- a/tests/python/relay/test_simplify_fc_transpose.py +++ b/tests/python/relay/test_simplify_fc_transpose.py @@ -31,11 +31,11 @@ def run_func(func, params, x): with tvm.transform.PassContext(opt_level=3): lib = relay.build(func, "llvm", params=params) - from tvm.contrib import graph_runtime + from tvm.contrib import graph_executor dev = tvm.cpu(0) dtype = "float32" - m = graph_runtime.GraphModule(lib["default"](dev)) + m = graph_executor.GraphModule(lib["default"](dev)) # set inputs m.set_input("data", tvm.nd.array(x.astype(dtype))) # execute diff --git a/tests/python/relay/test_sparse_dense_convert.py b/tests/python/relay/test_sparse_dense_convert.py index 4eaaf769ff11a..1efa813ebfb0d 100644 --- a/tests/python/relay/test_sparse_dense_convert.py +++ b/tests/python/relay/test_sparse_dense_convert.py @@ -52,11 +52,11 @@ def run_func(func, params, x): with tvm.transform.PassContext(opt_level=3): graph, lib, new_params = relay.build(func, "llvm", params=params) - from tvm.contrib import graph_runtime + from tvm.contrib import graph_executor dev = tvm.cpu(0) dtype = "float32" - m = graph_runtime.create(graph, lib, dev) + m = graph_executor.create(graph, lib, dev) # set inputs m.set_input("data", tvm.nd.array(x.astype(dtype))) m.set_input(**new_params) diff --git a/tests/python/topi/python/test_topi_qnn.py b/tests/python/topi/python/test_topi_qnn.py 
index 1f49a68aa5194..995cfd2df6668 100644 --- a/tests/python/topi/python/test_topi_qnn.py +++ b/tests/python/topi/python/test_topi_qnn.py @@ -18,7 +18,7 @@ import numpy as np import tvm from tvm import topi, relay, te -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor import tvm.topi.testing @@ -59,7 +59,7 @@ def check_target(target, dev): lib = relay.build(tvm.IRModule.from_expr(real_q_op), target=target) # Get real qnn quantize output. - m = graph_runtime.GraphModule(lib["default"](dev)) + m = graph_executor.GraphModule(lib["default"](dev)) m.set_input("a", a_np) m.run() @@ -126,7 +126,7 @@ def check_target(target, dev): lib = relay.build(tvm.IRModule.from_expr(real_dq_op), target=target) # Get real qnn quantize output. - m = graph_runtime.GraphModule(lib["default"](dev)) + m = graph_executor.GraphModule(lib["default"](dev)) m.set_input("a", a_np) m.run() diff --git a/tests/python/unittest/test_crt.py b/tests/python/unittest/test_crt.py index c9dfef4654ed5..a0524353f3b31 100644 --- a/tests/python/unittest/test_crt.py +++ b/tests/python/unittest/test_crt.py @@ -138,8 +138,8 @@ def test_reset(): @tvm.testing.requires_micro -def test_graph_runtime(): - """Test use of the graph runtime with microTVM.""" +def test_graph_executor(): + """Test use of the graph executor with microTVM.""" import tvm.micro workspace = tvm.micro.Workspace(debug=True) @@ -156,7 +156,7 @@ def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), uint8]) { factory = tvm.relay.build(relay_mod, target=TARGET) with _make_session(workspace, factory.get_lib()) as sess: - graph_mod = tvm.micro.create_local_graph_runtime( + graph_mod = tvm.micro.create_local_graph_executor( factory.get_json(), sess.get_system_lib(), sess.device ) A_data = tvm.nd.array(np.array([2, 3], dtype="uint8"), device=sess.device) diff --git a/tests/python/unittest/test_link_params.py b/tests/python/unittest/test_link_params.py index 9fde832d08ce1..3ad515604d0b0 100644 --- a/tests/python/unittest/test_link_params.py +++ b/tests/python/unittest/test_link_params.py @@ -124,7 +124,7 @@ def _verify_linked_param(dtype, lib, mod, graph, name): """Directly read memory from the linked library to verify the linked parameter is correct.""" sid = _lookup_sid(graph, name) # NOTE: query_imports=True because when loading a module from disk (i.e. for C backend), - # a GraphRuntimeFactory module is created instead of the module itself. + # a GraphExecutorFactory module is created instead of the module itself. param_ptr = mod.get_function("_lookup_linked_param", True)(sid) gen_param = lib.params[name] arr_data = (_get_ctypes_dtype(dtype) * np.prod(gen_param.shape)).from_address(param_ptr.value) @@ -154,7 +154,7 @@ def _add_decl(name, dtype): f"def @main(%rand_input : Tensor[{INPUT_SHAPE}, {dtype}], { ', '.join(param_decls.values()) } ) {{", # This program ensures that GraphPlanMemory alternates between the same two storage IDs for a # while. In doing this, it ensures that param %{dtype}_b will be placed into the graph at an - # index unequal to its storage_id. This ensures that GraphRuntimeCodegen encodes the storage_id + # index unequal to its storage_id. This ensures that GraphExecutorCodegen encodes the storage_id # and not the parameter index into the graph. ( f' %0 = nn.conv2d(%rand_input, %{dtype}_a, data_layout="NCHW", kernel_layout="OIHW", ' @@ -206,7 +206,7 @@ def test_llvm_link_params(): # Wrap in function to explicitly deallocate the runtime. 
def _run_linked(lib, mod): graph_json, _, _ = lib - graph_rt = tvm.contrib.graph_runtime.create(graph_json, mod, tvm.cpu(0)) + graph_rt = tvm.contrib.graph_executor.create(graph_json, mod, tvm.cpu(0)) graph_rt.set_input("rand_input", rand_input) # NOTE: params not required. graph_rt.run() return graph_rt.get_output(0) @@ -218,7 +218,7 @@ def _run_linked(lib, mod): def _run_unlinked(lib): graph_json, mod, lowered_params = lib - graph_rt = tvm.contrib.graph_runtime.create(graph_json, mod, tvm.cpu(0)) + graph_rt = tvm.contrib.graph_executor.create(graph_json, mod, tvm.cpu(0)) graph_rt.set_input("rand_input", rand_input, **lowered_params) graph_rt.run() return graph_rt.get_output(0) @@ -316,7 +316,7 @@ def test_c_link_params(): # Wrap in function to explicitly deallocate the runtime. def _run_linked(lib_mod): - graph_rt = tvm.contrib.graph_runtime.GraphModule(lib_mod["default"](tvm.cpu(0))) + graph_rt = tvm.contrib.graph_executor.GraphModule(lib_mod["default"](tvm.cpu(0))) graph_rt.set_input("rand_input", rand_input) # NOTE: params not required. graph_rt.run() @@ -334,7 +334,7 @@ def _run_linked(lib_mod): lib_mod = tvm.runtime.load_module(lib_path) def _run_unlinked(lib_mod): - graph_rt = tvm.contrib.graph_runtime.GraphModule(lib_mod["default"](tvm.cpu(0))) + graph_rt = tvm.contrib.graph_executor.GraphModule(lib_mod["default"](tvm.cpu(0))) graph_rt.set_input("rand_input", rand_input, **params) graph_rt.run() return graph_rt.get_output(0) @@ -365,7 +365,7 @@ def test_crt_link_params(): opts = tvm.micro.default_options( os.path.join(tvm.micro.get_standalone_crt_dir(), "template", "host") ) - opts["bin_opts"]["ldflags"].append("-DTVM_HOST_USE_GRAPH_RUNTIME_MODULE") + opts["bin_opts"]["ldflags"].append("-DTVM_HOST_USE_GRAPH_EXECUTOR_MODULE") micro_binary = tvm.micro.build_static_runtime( workspace, @@ -374,7 +374,7 @@ def test_crt_link_params(): compiler_options=opts, extra_libs=[ tvm.micro.get_standalone_crt_lib(m) - for m in ("memory", "graph_runtime_module", "graph_runtime") + for m in ("memory", "graph_executor_module", "graph_executor") ], ) @@ -383,7 +383,7 @@ def test_crt_link_params(): } flasher = compiler.flasher(**flasher_kw) with tvm.micro.Session(binary=micro_binary, flasher=flasher) as sess: - graph_rt = tvm.micro.session.create_local_graph_runtime( + graph_rt = tvm.micro.session.create_local_graph_executor( graph_json, sess.get_system_lib(), sess.device ) @@ -397,7 +397,7 @@ def test_crt_link_params(): def _run_unlinked(lib): graph_json, mod, lowered_params = lib - graph_rt = tvm.contrib.graph_runtime.create(graph_json, mod, tvm.cpu(0)) + graph_rt = tvm.contrib.graph_executor.create(graph_json, mod, tvm.cpu(0)) graph_rt.set_input("rand_input", rand_input, **lowered_params) graph_rt.run() return graph_rt.get_output(0).asnumpy() diff --git a/tests/python/unittest/test_micro_model_library_format.py b/tests/python/unittest/test_micro_model_library_format.py index c999091cc3cce..db6c55bca12ae 100644 --- a/tests/python/unittest/test_micro_model_library_format.py +++ b/tests/python/unittest/test_micro_model_library_format.py @@ -26,7 +26,7 @@ import tvm import tvm.relay -from tvm.relay.backend import graph_runtime_factory +from tvm.relay.backend import graph_executor_factory import tvm.runtime.module import tvm.testing from tvm.contrib import utils @@ -170,7 +170,7 @@ def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), float32], %c : Tensor[ @tvm.testing.requires_micro def test_export_model(): module = tvm.support.FrontendTestModule() - factory = 
graph_runtime_factory.GraphRuntimeFactoryModule( + factory = graph_executor_factory.GraphExecutorFactoryModule( None, tvm.target.target.micro("host"), '"graph_json"', module, "test_module", {} ) diff --git a/tests/python/unittest/test_runtime_graph.py b/tests/python/unittest/test_runtime_graph.py index f37d4089a8ee3..5f0c7837d4f55 100644 --- a/tests/python/unittest/test_runtime_graph.py +++ b/tests/python/unittest/test_runtime_graph.py @@ -21,7 +21,7 @@ import json from tvm import rpc from tvm import relay -from tvm.contrib import utils, graph_runtime +from tvm.contrib import utils, graph_executor @tvm.testing.requires_llvm @@ -59,7 +59,7 @@ def test_graph_simple(): def check_verify(): mlib = tvm.build(s, [A, B], "llvm", name="myadd") - mod = graph_runtime.create(graph, mlib, tvm.cpu(0)) + mod = graph_executor.create(graph, mlib, tvm.cpu(0)) a = np.random.uniform(size=(n,)).astype(A.dtype) mod.run(x=a) out = mod.get_output(0, tvm.nd.empty((n,))) @@ -75,7 +75,7 @@ def check_remote(): mlib.export_library(path_dso) remote.upload(path_dso) mlib = remote.load_module("dev_lib.so") - mod = graph_runtime.create(graph, mlib, remote.cpu(0)) + mod = graph_executor.create(graph, mlib, remote.cpu(0)) a = np.random.uniform(size=(n,)).astype(A.dtype) mod.run(x=tvm.nd.array(a, dev)) out = tvm.nd.empty((n,), device=dev) @@ -92,10 +92,10 @@ def check_sharing(): params = {"x": x_in} graph, lib, params = relay.build(func, target="llvm", params=params) - mod_shared = graph_runtime.create(graph, lib, tvm.cpu(0)) + mod_shared = graph_executor.create(graph, lib, tvm.cpu(0)) mod_shared.load_params(runtime.save_param_dict(params)) num_mods = 10 - mods = [graph_runtime.create(graph, lib, tvm.cpu(0)) for _ in range(num_mods)] + mods = [graph_executor.create(graph, lib, tvm.cpu(0)) for _ in range(num_mods)] for mod in mods: mod.share_params(mod_shared, runtime.save_param_dict(params)) @@ -120,7 +120,7 @@ def check_sharing(): def test_load_unexpected_params(): - # Test whether graph_runtime.load_params works if parameters + # Test whether graph_executor.load_params works if parameters # are provided that are not an expected input. 
mod = tvm.IRModule() params = {} @@ -130,7 +130,7 @@ def test_load_unexpected_params(): mod["main"] = relay.Function([x, y], z) graph_module = relay.build(mod, target="llvm", params=params) - rt_mod = tvm.contrib.graph_runtime.create( + rt_mod = tvm.contrib.graph_executor.create( graph_module.get_json(), graph_module.get_lib(), tvm.cpu(0) ) diff --git a/tests/python/unittest/test_runtime_graph_cuda_graph.py b/tests/python/unittest/test_runtime_graph_cuda_graph.py index 41c782a91d9b7..ee7750e3e142e 100644 --- a/tests/python/unittest/test_runtime_graph_cuda_graph.py +++ b/tests/python/unittest/test_runtime_graph_cuda_graph.py @@ -27,8 +27,8 @@ from tvm import te import numpy as np -from tvm.contrib import utils, graph_runtime -from tvm.contrib.cuda_graph import cuda_graph_runtime +from tvm.contrib import utils, graph_executor +from tvm.contrib.cuda_graph import cuda_graph_executor bx = te.thread_axis("blockIdx.x") @@ -75,7 +75,7 @@ def check_verify(): mlib = tvm.build(s, [A, B], "cuda", name="myadd") dev = tvm.gpu(0) try: - mod = cuda_graph_runtime.create(graph, mlib, dev) + mod = cuda_graph_executor.create(graph, mlib, dev) except ValueError: return diff --git a/tests/python/unittest/test_runtime_graph_debug.py b/tests/python/unittest/test_runtime_graph_debug.py index b0173d1e7b7f2..6cab75d9b9fcf 100644 --- a/tests/python/unittest/test_runtime_graph_debug.py +++ b/tests/python/unittest/test_runtime_graph_debug.py @@ -28,7 +28,7 @@ import numpy as np from tvm import rpc from tvm.contrib import utils -from tvm.contrib.debugger import debug_runtime +from tvm.contrib.debugger import debug_executor @tvm.testing.requires_llvm @@ -75,7 +75,7 @@ def myadd(*args): mlib_proxy = tvm.support.FrontendTestModule() mlib_proxy["myadd"] = myadd try: - mod = debug_runtime.create(graph, mlib_proxy, tvm.cpu(0)) + mod = debug_executor.create(graph, mlib_proxy, tvm.cpu(0)) except ValueError: return @@ -171,7 +171,7 @@ def check_remote(): remote.upload(path_dso) mlib = remote.load_module("dev_lib.so") try: - mod = debug_runtime.create(graph, mlib, remote.cpu(0)) + mod = debug_executor.create(graph, mlib, remote.cpu(0)) except ValueError: print("Skip because debug runtime not enabled") return diff --git a/tests/python/unittest/test_runtime_heterogeneous.py b/tests/python/unittest/test_runtime_heterogeneous.py index b3fbc2fdf2483..e97b349af36ed 100644 --- a/tests/python/unittest/test_runtime_heterogeneous.py +++ b/tests/python/unittest/test_runtime_heterogeneous.py @@ -21,7 +21,7 @@ import tvm from tvm import te -from tvm.contrib import graph_runtime, utils +from tvm.contrib import graph_executor, utils from tvm import topi @@ -172,7 +172,7 @@ def check_device(device, target_device): target_flist = {target_device: lower_add, target_host: lower_sub} mhost = tvm.build(target_flist, target_host=target_host) dev = [host_dev, device_dev] - mod = graph_runtime.create(graph, mhost, dev) + mod = graph_executor.create(graph, mhost, dev) params = {} params["A"] = tensor_a = np.random.uniform(size=shape).astype(tensor_a.dtype) params["B"] = tensor_b = np.random.uniform(size=shape).astype(tensor_b.dtype) @@ -408,7 +408,7 @@ def check_device(device, target_device): params["D"] = tensor_d = np.random.uniform(size=shape).astype(tensor_d.dtype) def check_verify(): - mod = graph_runtime.create(graph, mhost, dev) + mod = graph_executor.create(graph, mhost, dev) mod.set_input(**params) mod.run() out = mod.get_output(0, tvm.nd.empty(shape)) @@ -422,7 +422,7 @@ def check_load_module(): out_file.write(graph) loaded_lib = 
tvm.runtime.load_module(path_lib) loaded_graph = open(temp.relpath("deploy.json")).read() - mod = graph_runtime.create(loaded_graph, loaded_lib, dev) + mod = graph_executor.create(loaded_graph, loaded_lib, dev) mod.set_input(**params) mod.run() out = mod.get_output(0, tvm.nd.empty(shape)) diff --git a/tests/python/unittest/test_runtime_module_based_interface.py b/tests/python/unittest/test_runtime_module_based_interface.py index a9f8f6352d359..766338de35580 100644 --- a/tests/python/unittest/test_runtime_module_based_interface.py +++ b/tests/python/unittest/test_runtime_module_based_interface.py @@ -18,9 +18,9 @@ from tvm import relay, runtime from tvm.relay import testing import tvm -from tvm.contrib import graph_runtime -from tvm.contrib.debugger import debug_runtime -from tvm.contrib.cuda_graph import cuda_graph_runtime +from tvm.contrib import graph_executor +from tvm.contrib.debugger import debug_executor +from tvm.contrib.cuda_graph import cuda_graph_executor import tvm.testing @@ -37,7 +37,7 @@ def verify(data): graph, lib, graph_params = relay.build_module.build(mod, "llvm", params=params) dev = tvm.cpu() - module = graph_runtime.create(graph, lib, dev) + module = graph_executor.create(graph, lib, dev) module.set_input("data", data) module.set_input(**graph_params) module.run() @@ -55,7 +55,7 @@ def test_legacy_compatibility(): graph, lib, graph_params = relay.build_module.build(mod, "llvm", params=params) data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32") dev = tvm.cpu() - module = graph_runtime.create(graph, lib, dev) + module = graph_executor.create(graph, lib, dev) module.set_input("data", data) module.set_input(**graph_params) module.run() @@ -82,8 +82,8 @@ def test_cpu(): out = get_output(0).asnumpy() tvm.testing.assert_allclose(out, verify(data), atol=1e-5) - # graph runtime wrapper - gmod = graph_runtime.GraphModule(complied_graph_lib["default"](dev)) + # graph executor wrapper + gmod = graph_executor.GraphModule(complied_graph_lib["default"](dev)) gmod.set_input("data", data) gmod.run() out = gmod.get_output(0).asnumpy() @@ -109,8 +109,8 @@ def test_gpu(): out = get_output(0).asnumpy() tvm.testing.assert_allclose(out, verify(data), atol=1e-5) - # graph runtime wrapper - gmod = graph_runtime.GraphModule(complied_graph_lib["default"](dev)) + # graph executor wrapper + gmod = graph_executor.GraphModule(complied_graph_lib["default"](dev)) gmod.set_input("data", data) gmod.run() out = gmod.get_output(0).asnumpy() @@ -151,8 +151,8 @@ def verify_cpu_export(obj_format): out = get_output(0).asnumpy() tvm.testing.assert_allclose(out, verify(data), atol=1e-5) - # graph runtime wrapper - gmod = graph_runtime.GraphModule(loaded_lib["default"](dev)) + # graph executor wrapper + gmod = graph_executor.GraphModule(loaded_lib["default"](dev)) gmod.set_input("data", data) gmod.run() out = gmod.get_output(0).asnumpy() @@ -190,8 +190,8 @@ def verify_gpu_export(obj_format): out = get_output(0).asnumpy() tvm.testing.assert_allclose(out, verify(data), atol=1e-5) - # graph runtime wrapper - gmod = graph_runtime.GraphModule(loaded_lib["default"](dev)) + # graph executor wrapper + gmod = graph_executor.GraphModule(loaded_lib["default"](dev)) gmod.set_input("data", data) gmod.run() out = gmod.get_output(0).asnumpy() @@ -234,8 +234,8 @@ def verify_rpc_cpu_export(obj_format): out = get_output(0).asnumpy() tvm.testing.assert_allclose(out, verify(data), atol=1e-5) - # graph runtime wrapper - gmod = graph_runtime.GraphModule(loaded_lib["default"](dev)) + # graph executor wrapper + 
gmod = graph_executor.GraphModule(loaded_lib["default"](dev)) gmod.set_input("data", data) gmod.run() out = gmod.get_output(0).asnumpy() @@ -279,8 +279,8 @@ def verify_rpc_gpu_export(obj_format): out = get_output(0).asnumpy() tvm.testing.assert_allclose(out, verify(data), atol=1e-5) - # graph runtime wrapper - gmod = graph_runtime.GraphModule(loaded_lib["default"](dev)) + # graph executor wrapper + gmod = graph_executor.GraphModule(loaded_lib["default"](dev)) gmod.set_input("data", data) gmod.run() out = gmod.get_output(0).asnumpy() @@ -333,8 +333,8 @@ def verify_cpu_remove_package_params(obj_format): out = get_output(0).asnumpy() tvm.testing.assert_allclose(out, verify(data), atol=1e-5) - # graph runtime wrapper - gmod = graph_runtime.GraphModule(loaded_lib["default"](dev)) + # graph executor wrapper + gmod = graph_executor.GraphModule(loaded_lib["default"](dev)) loaded_params = bytearray(open(temp.relpath("deploy_param.params"), "rb").read()) gmod.set_input("data", data) gmod.load_params(loaded_params) @@ -380,8 +380,8 @@ def verify_gpu_remove_package_params(obj_format): out = get_output(0).asnumpy() tvm.testing.assert_allclose(out, verify(data), atol=1e-5) - # graph runtime wrapper - gmod = graph_runtime.GraphModule(loaded_lib["default"](dev)) + # graph executor wrapper + gmod = graph_executor.GraphModule(loaded_lib["default"](dev)) loaded_params = bytearray(open(temp.relpath("deploy_param.params"), "rb").read()) gmod.set_input("data", data) gmod.load_params(loaded_params) @@ -433,8 +433,8 @@ def verify_rpc_cpu_remove_package_params(obj_format): out = get_output(0).asnumpy() tvm.testing.assert_allclose(out, verify(data), atol=1e-5) - # graph runtime wrapper - gmod = graph_runtime.GraphModule(loaded_lib["default"](dev)) + # graph executor wrapper + gmod = graph_executor.GraphModule(loaded_lib["default"](dev)) loaded_params = bytearray(open(path_params, "rb").read()) gmod.set_input("data", data) gmod.load_params(loaded_params) @@ -486,8 +486,8 @@ def verify_rpc_gpu_remove_package_params(obj_format): out = get_output(0).asnumpy() tvm.testing.assert_allclose(out, verify(data), atol=1e-5) - # graph runtime wrapper - gmod = graph_runtime.GraphModule(loaded_lib["default"](dev)) + # graph executor wrapper + gmod = graph_executor.GraphModule(loaded_lib["default"](dev)) loaded_params = bytearray(open(path_params, "rb").read()) gmod.set_input("data", data) gmod.load_params(loaded_params) @@ -502,7 +502,7 @@ def verify_rpc_gpu_remove_package_params(obj_format): verify_rpc_gpu_remove_package_params(obj_format) -def test_debug_graph_runtime(): +def test_debug_graph_executor(): if not tvm.testing.device_enabled("llvm"): print("Skip because llvm is not enabled") return @@ -516,7 +516,7 @@ def test_debug_graph_runtime(): try: gmod = complied_graph_lib["debug_create"]("default", dev) except: - print("Skip because debug graph_runtime not enabled") + print("Skip because debug graph_executor not enabled") return set_input = gmod["set_input"] run = gmod["run"] @@ -526,8 +526,8 @@ def test_debug_graph_runtime(): out = get_output(0).asnumpy() tvm.testing.assert_allclose(out, verify(data), atol=1e-5) - # debug graph runtime wrapper - debug_g_mod = debug_runtime.GraphModuleDebug( + # debug graph executor wrapper + debug_g_mod = debug_executor.GraphModuleDebug( complied_graph_lib["debug_create"]("default", dev), [dev], complied_graph_lib.get_json(), @@ -540,7 +540,7 @@ def test_debug_graph_runtime(): @tvm.testing.requires_cudagraph -def test_cuda_graph_runtime(): +def test_cuda_graph_executor(): mod, params = 
relay.testing.synthetic.get_workload() with tvm.transform.PassContext(opt_level=3): complied_graph_lib = relay.build_module.build(mod, "cuda", params=params) @@ -560,8 +560,8 @@ def test_cuda_graph_runtime(): out = get_output(0).asnumpy() tvm.testing.assert_allclose(out, verify(data), atol=1e-5) - # cuda graph runtime wrapper - cu_gmod = cuda_graph_runtime.GraphModuleCudaGraph(gmod) + # cuda graph executor wrapper + cu_gmod = cuda_graph_executor.GraphModuleCudaGraph(gmod) cu_gmod.set_input("data", data) cu_gmod.run() out = cu_gmod.get_output(0).asnumpy() @@ -602,5 +602,5 @@ def make_module(mod): test_gpu() test_mod_export() test_remove_package_params() - test_debug_graph_runtime() + test_debug_graph_executor() test_multiple_imported_modules() diff --git a/tests/python/unittest/test_target_codegen_blob.py b/tests/python/unittest/test_target_codegen_blob.py index 3429b39c6ac38..f1290ddd1e51a 100644 --- a/tests/python/unittest/test_target_codegen_blob.py +++ b/tests/python/unittest/test_target_codegen_blob.py @@ -18,7 +18,7 @@ import numpy as np from tvm import relay from tvm.relay import testing -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor import tvm from tvm import te import ctypes @@ -39,7 +39,7 @@ def verify(data): with tvm.transform.PassContext(opt_level=3): lib = relay.build_module.build(mod, "llvm", params=params) dev = tvm.cpu() - module = graph_runtime.GraphModule(lib["default"](dev)) + module = graph_executor.GraphModule(lib["default"](dev)) module.set_input("data", data) module.run() out = module.get_output(0).asnumpy() @@ -58,7 +58,7 @@ def verify(data): loaded_lib = tvm.runtime.load_module(path_lib) data = np.random.uniform(-1, 1, size=input_shape).astype("float32") dev = tvm.gpu() - module = graph_runtime.GraphModule(loaded_lib["default"](dev)) + module = graph_executor.GraphModule(loaded_lib["default"](dev)) module.set_input("data", data) module.run() out = module.get_output(0).asnumpy() diff --git a/tests/python/unittest/test_tir_transform_hoist_if.py b/tests/python/unittest/test_tir_transform_hoist_if.py index 748a33d977ee4..7d02e4f12c1d4 100644 --- a/tests/python/unittest/test_tir_transform_hoist_if.py +++ b/tests/python/unittest/test_tir_transform_hoist_if.py @@ -765,7 +765,7 @@ def test_hoisting_op_conv(): for target, dev in enabled_targets(): with tvm.transform.PassContext(opt_level=3): lib = relay.build_module.build(mod, target=target, params=params) - m = tvm.contrib.graph_runtime.GraphModule(lib["default"](dev)) + m = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) x = np.random.uniform(size=dshape) data_tvm = tvm.nd.array(data) m.set_input("x", data_tvm) @@ -779,7 +779,7 @@ def test_hoisting_op_conv(): opt_level=3, config={"tir.HoistIfThenElse": {"support_block_scope_hosting": True}} ): lib = relay.build_module.build(mod, target=target, params=params) - m = tvm.contrib.graph_runtime.GraphModule(lib["default"](dev)) + m = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) x = np.random.uniform(size=dshape) data_tvm = tvm.nd.array(data) m.set_input("x", data_tvm) diff --git a/tests/scripts/task_config_build_gpu.sh b/tests/scripts/task_config_build_gpu.sh index 7338555c4c942..609325c9962b5 100755 --- a/tests/scripts/task_config_build_gpu.sh +++ b/tests/scripts/task_config_build_gpu.sh @@ -34,7 +34,7 @@ echo set\(USE_NNPACK ON\) >> config.cmake echo set\(NNPACK_PATH /NNPACK/build/\) >> config.cmake echo set\(USE_RPC ON\) >> config.cmake echo set\(USE_SORT ON\) >> config.cmake -echo set\(USE_GRAPH_RUNTIME ON\) >> 
config.cmake +echo set\(USE_GRAPH_EXECUTOR ON\) >> config.cmake echo set\(USE_STACKVM_RUNTIME ON\) >> config.cmake echo set\(USE_PROFILER ON\) >> config.cmake echo set\(USE_ANTLR ON\) >> config.cmake diff --git a/tests/scripts/task_java_unittest.sh b/tests/scripts/task_java_unittest.sh index 7ab4afae3c2e5..7818d7d458d63 100755 --- a/tests/scripts/task_java_unittest.sh +++ b/tests/scripts/task_java_unittest.sh @@ -32,7 +32,7 @@ TEMP_DIR=$(mktemp -d) python3 $SCRIPT_DIR/test_add_cpu.py $TEMP_DIR python3 $SCRIPT_DIR/test_add_gpu.py $TEMP_DIR -python3 $SCRIPT_DIR/test_graph_runtime.py $TEMP_DIR +python3 $SCRIPT_DIR/test_graph_executor.py $TEMP_DIR # start rpc proxy server PORT=$(( ( RANDOM % 1000 ) + 9000 )) diff --git a/tests/scripts/task_rust.sh b/tests/scripts/task_rust.sh index 2c87cceec8bb6..c40585b62b47f 100755 --- a/tests/scripts/task_rust.sh +++ b/tests/scripts/task_rust.sh @@ -58,14 +58,14 @@ cd $RUST_DIR/tvm-rt cargo build cargo test --tests -# Next we test the graph runtime crate. +# Next we test the graph executor crate. cd $RUST_DIR/tvm-graph-rt # We first we compile a model using the Python bindings then run the tests. python3 tests/build_model.py cargo test --tests -# Run some more tests involving the graph runtime API. +# Run some more tests involving the graph executor API. cd tests/test_tvm_basic cargo run cd - diff --git a/tutorials/auto_scheduler/tune_network_arm.py b/tutorials/auto_scheduler/tune_network_arm.py index 30bbce1e6b7e0..153143dd4e94c 100644 --- a/tutorials/auto_scheduler/tune_network_arm.py +++ b/tutorials/auto_scheduler/tune_network_arm.py @@ -49,7 +49,7 @@ import tvm from tvm import relay, auto_scheduler import tvm.relay.testing -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm.contrib.utils import tempdir ################################################################# @@ -319,9 +319,9 @@ def tune_and_evaluate(): remote.upload(tmp.relpath(filename)) rlib = remote.load_module(filename) - # Create graph runtime + # Create graph executor dev = remote.cpu() - module = graph_runtime.GraphModule(rlib["default"](dev)) + module = graph_executor.GraphModule(rlib["default"](dev)) data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) module.set_input("data", data_tvm) diff --git a/tutorials/auto_scheduler/tune_network_cuda.py b/tutorials/auto_scheduler/tune_network_cuda.py index b5162fc85f148..7b5619c671be5 100644 --- a/tutorials/auto_scheduler/tune_network_cuda.py +++ b/tutorials/auto_scheduler/tune_network_cuda.py @@ -49,7 +49,7 @@ import tvm from tvm import relay, auto_scheduler import tvm.relay.testing -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor ################################################################# # Define a Network @@ -280,9 +280,9 @@ def run_tuning(): with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}): lib = relay.build(mod, target=target, params=params) -# Create graph runtime +# Create graph executor dev = tvm.device(str(target), 0) -module = graph_runtime.GraphModule(lib["default"](dev)) +module = graph_executor.GraphModule(lib["default"](dev)) data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) module.set_input("data", data_tvm) diff --git a/tutorials/auto_scheduler/tune_network_mali.py b/tutorials/auto_scheduler/tune_network_mali.py index 3d3861263e4b7..13d1e4793ffa3 100644 --- a/tutorials/auto_scheduler/tune_network_mali.py +++ b/tutorials/auto_scheduler/tune_network_mali.py @@ 
-49,7 +49,7 @@ import tvm from tvm import relay, auto_scheduler import tvm.relay.testing -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor import os ################################################################# @@ -242,7 +242,7 @@ def tune_and_evaluate(): ): lib = relay.build(mod, target=target, target_host=target_host, params=params) - # Create graph runtime + # Create graph executor print("=============== Request Remote ===============") from tvm.auto_scheduler.utils import request_remote @@ -256,7 +256,7 @@ def tune_and_evaluate(): lib.export_library(path_lib, ndk.create_shared) remote.upload(path_lib) loaded_lib = remote.load_module(filename) - module = graph_runtime.GraphModule(loaded_lib["default"](dev)) + module = graph_executor.GraphModule(loaded_lib["default"](dev)) data = (np.random.uniform(size=input_shape)).astype(dtype) data_tvm = tvm.nd.array(data) module.set_input("data", data_tvm) diff --git a/tutorials/auto_scheduler/tune_network_x86.py b/tutorials/auto_scheduler/tune_network_x86.py index ca2b8c1977788..91dc64eec20ec 100644 --- a/tutorials/auto_scheduler/tune_network_x86.py +++ b/tutorials/auto_scheduler/tune_network_x86.py @@ -49,7 +49,7 @@ import tvm from tvm import relay, auto_scheduler import tvm.relay.testing -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor ################################################################# # Define a Network @@ -279,9 +279,9 @@ def run_tuning(): with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}): lib = relay.build(mod, target=target, params=params) -# Create graph runtime +# Create graph executor dev = tvm.device(str(target), 0) -module = graph_runtime.GraphModule(lib["default"](dev)) +module = graph_executor.GraphModule(lib["default"](dev)) data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) module.set_input("data", data_tvm) diff --git a/tutorials/autotvm/tune_relay_arm.py b/tutorials/autotvm/tune_relay_arm.py index 9ddcf817c1683..9223eb30cd9d9 100644 --- a/tutorials/autotvm/tune_relay_arm.py +++ b/tutorials/autotvm/tune_relay_arm.py @@ -70,7 +70,7 @@ import tvm.relay.testing from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner from tvm.contrib.utils import tempdir -import tvm.contrib.graph_runtime as runtime +import tvm.contrib.graph_executor as runtime ################################################################# # Define network diff --git a/tutorials/autotvm/tune_relay_cuda.py b/tutorials/autotvm/tune_relay_cuda.py index 8ae5144eb31db..50485c4d7ff2f 100644 --- a/tutorials/autotvm/tune_relay_cuda.py +++ b/tutorials/autotvm/tune_relay_cuda.py @@ -67,7 +67,7 @@ from tvm import relay, autotvm import tvm.relay.testing from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner -import tvm.contrib.graph_runtime as runtime +import tvm.contrib.graph_executor as runtime ################################################################# # Define Network diff --git a/tutorials/autotvm/tune_relay_mobile_gpu.py b/tutorials/autotvm/tune_relay_mobile_gpu.py index 08fc87d4da1b3..aefa600e3c3f7 100644 --- a/tutorials/autotvm/tune_relay_mobile_gpu.py +++ b/tutorials/autotvm/tune_relay_mobile_gpu.py @@ -69,7 +69,7 @@ import tvm.relay.testing from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner from tvm.contrib.utils import tempdir -import tvm.contrib.graph_runtime as runtime +import tvm.contrib.graph_executor as runtime 
################################################################# # Define network diff --git a/tutorials/autotvm/tune_relay_x86.py b/tutorials/autotvm/tune_relay_x86.py index b072a3adfea33..dd5d4057c211e 100644 --- a/tutorials/autotvm/tune_relay_x86.py +++ b/tutorials/autotvm/tune_relay_x86.py @@ -36,7 +36,7 @@ from tvm.relay import testing from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner from tvm.autotvm.graph_tuner import DPTuner, PBQPTuner -import tvm.contrib.graph_runtime as runtime +import tvm.contrib.graph_executor as runtime ################################################################# # Define network diff --git a/tutorials/frontend/build_gcn.py b/tutorials/frontend/build_gcn.py index b21c09692a64d..e73dc2dca2875 100644 --- a/tutorials/frontend/build_gcn.py +++ b/tutorials/frontend/build_gcn.py @@ -175,7 +175,7 @@ def evaluate(data, logits): # = ((H * W)^t * A^t)^t # = ((W^t * H^t) * A^t)^t from tvm import relay -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor import tvm from tvm import te @@ -335,9 +335,9 @@ def prepare_params(g, data): with tvm.transform.PassContext(opt_level=0): # Currently only support opt_level=0 lib = relay.build(mod, target, params=params) -# Generate graph runtime +# Generate graph executor dev = tvm.device(target, 0) -m = graph_runtime.GraphModule(lib["default"](dev)) +m = graph_executor.GraphModule(lib["default"](dev)) ###################################################################### # Run the TVM model, test for accuracy and verify with DGL diff --git a/tutorials/frontend/deploy_model_on_android.py b/tutorials/frontend/deploy_model_on_android.py index c69ac8d3f0e3a..8efcb706b3800 100644 --- a/tutorials/frontend/deploy_model_on_android.py +++ b/tutorials/frontend/deploy_model_on_android.py @@ -34,7 +34,7 @@ from tvm import te import tvm.relay as relay from tvm import rpc -from tvm.contrib import utils, ndk, graph_runtime as runtime +from tvm.contrib import utils, ndk, graph_executor as runtime from tvm.contrib.download import download_testdata @@ -71,7 +71,7 @@ # -DUSE_RPC=ON \ # -DUSE_SORT=ON \ # -DUSE_VULKAN=ON \ -# -DUSE_GRAPH_RUNTIME=ON \ +# -DUSE_GRAPH_EXECUTOR=ON \ # .. 
# make -j10 # diff --git a/tutorials/frontend/deploy_model_on_rasp.py b/tutorials/frontend/deploy_model_on_rasp.py index 75e142243a387..a59665f62f1c7 100644 --- a/tutorials/frontend/deploy_model_on_rasp.py +++ b/tutorials/frontend/deploy_model_on_rasp.py @@ -30,7 +30,7 @@ from tvm import te import tvm.relay as relay from tvm import rpc -from tvm.contrib import utils, graph_runtime as runtime +from tvm.contrib import utils, graph_executor as runtime from tvm.contrib.download import download_testdata ###################################################################### diff --git a/tutorials/frontend/deploy_prequantized.py b/tutorials/frontend/deploy_prequantized.py index 681bc07413381..308027a4a1937 100644 --- a/tutorials/frontend/deploy_prequantized.py +++ b/tutorials/frontend/deploy_prequantized.py @@ -90,7 +90,7 @@ def run_tvm_model(mod, params, input_name, inp, target="llvm"): with tvm.transform.PassContext(opt_level=3): lib = relay.build(mod, target=target, params=params) - runtime = tvm.contrib.graph_runtime.GraphModule(lib["default"](tvm.device(target, 0))) + runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](tvm.device(target, 0))) runtime.set_input(input_name, inp) runtime.run() diff --git a/tutorials/frontend/deploy_prequantized_tflite.py b/tutorials/frontend/deploy_prequantized_tflite.py index 547704a72fc44..e0f9a6b2ebdea 100644 --- a/tutorials/frontend/deploy_prequantized_tflite.py +++ b/tutorials/frontend/deploy_prequantized_tflite.py @@ -168,9 +168,9 @@ def run_tflite_model(tflite_model_buf, input_data): ############################################################################### # Lets run TVM compiled pre-quantized model inference and get the TVM prediction. def run_tvm(lib): - from tvm.contrib import graph_runtime + from tvm.contrib import graph_executor - rt_mod = graph_runtime.GraphModule(lib["default"](tvm.cpu(0))) + rt_mod = graph_executor.GraphModule(lib["default"](tvm.cpu(0))) rt_mod.set_input("input", data) rt_mod.run() tvm_res = rt_mod.get_output(0).asnumpy() diff --git a/tutorials/frontend/deploy_sparse.py b/tutorials/frontend/deploy_sparse.py index 4d96f4d5de9f1..1fcb1b3246da4 100644 --- a/tutorials/frontend/deploy_sparse.py +++ b/tutorials/frontend/deploy_sparse.py @@ -82,7 +82,7 @@ import numpy as np import tensorflow as tf from tvm import relay, runtime -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm.relay import data_dep_optimization as ddo from tensorflow.python.framework.convert_to_constants import ( convert_variables_to_constants_v2, @@ -214,7 +214,7 @@ def run_relay_graph(mod, params, shape_dict, target, dev): input_shape = shape_dict["input_1"] dummy_data = np.random.uniform(size=input_shape, low=0, high=input_shape[1]).astype("int32") - m = graph_runtime.GraphModule(lib["default"](dev)) + m = graph_executor.GraphModule(lib["default"](dev)) m.set_input(0, dummy_data) m.run() tvm_output = m.get_output(0) diff --git a/tutorials/frontend/deploy_ssd_gluoncv.py b/tutorials/frontend/deploy_ssd_gluoncv.py index 9f31ab70731ac..40b40ce1f441a 100644 --- a/tutorials/frontend/deploy_ssd_gluoncv.py +++ b/tutorials/frontend/deploy_ssd_gluoncv.py @@ -28,7 +28,7 @@ from matplotlib import pyplot as plt from tvm import relay -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm.contrib.download import download_testdata from gluoncv import model_zoo, data, utils @@ -102,7 +102,7 @@ def build(target): def run(lib, dev): # Build TVM runtime - m = 
graph_runtime.GraphModule(lib["default"](dev)) + m = graph_executor.GraphModule(lib["default"](dev)) tvm_input = tvm.nd.array(x.asnumpy(), device=dev) m.set_input("data", tvm_input) # execute diff --git a/tutorials/frontend/from_caffe2.py b/tutorials/frontend/from_caffe2.py index 390fd2f320358..a3378de8b0e31 100644 --- a/tutorials/frontend/from_caffe2.py +++ b/tutorials/frontend/from_caffe2.py @@ -105,12 +105,12 @@ def transform_image(image): # The process is no different from other examples. import tvm from tvm import te -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor # context x86 CPU, use tvm.gpu(0) if you run on GPU dev = tvm.cpu(0) # create a runtime executor module -m = graph_runtime.GraphModule(lib["default"](dev)) +m = graph_executor.GraphModule(lib["default"](dev)) # set inputs m.set_input(input_name, tvm.nd.array(data.astype("float32"))) # execute diff --git a/tutorials/frontend/from_coreml.py b/tutorials/frontend/from_coreml.py index 5703f5742d3dd..ea8817d3a0a80 100644 --- a/tutorials/frontend/from_coreml.py +++ b/tutorials/frontend/from_coreml.py @@ -81,11 +81,11 @@ # Execute on TVM # ------------------- # The process is no different from other example -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor dev = tvm.cpu(0) dtype = "float32" -m = graph_runtime.GraphModule(lib["default"](dev)) +m = graph_executor.GraphModule(lib["default"](dev)) # set inputs m.set_input("image", tvm.nd.array(x.astype(dtype))) # execute diff --git a/tutorials/frontend/from_darknet.py b/tutorials/frontend/from_darknet.py index b76c32e831378..356dc16bedf0b 100644 --- a/tutorials/frontend/from_darknet.py +++ b/tutorials/frontend/from_darknet.py @@ -117,9 +117,9 @@ # Execute on TVM Runtime # ---------------------- # The process is no different from other examples. -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor -m = graph_runtime.GraphModule(lib["default"](dev)) +m = graph_executor.GraphModule(lib["default"](dev)) # set inputs m.set_input("data", tvm.nd.array(data.astype(dtype))) diff --git a/tutorials/frontend/from_mxnet.py b/tutorials/frontend/from_mxnet.py index 696af6b4a3a53..bfaac2c6c98e2 100644 --- a/tutorials/frontend/from_mxnet.py +++ b/tutorials/frontend/from_mxnet.py @@ -104,11 +104,11 @@ def transform_image(image): # Execute the portable graph on TVM # --------------------------------- # Now, we would like to reproduce the same forward computation using TVM. -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor dev = tvm.gpu(0) dtype = "float32" -m = graph_runtime.GraphModule(lib["default"](dev)) +m = graph_executor.GraphModule(lib["default"](dev)) # set inputs m.set_input("data", tvm.nd.array(x.astype(dtype))) # execute diff --git a/tutorials/frontend/from_pytorch.py b/tutorials/frontend/from_pytorch.py index ecc3e9aef40c7..a0db518025e30 100644 --- a/tutorials/frontend/from_pytorch.py +++ b/tutorials/frontend/from_pytorch.py @@ -110,10 +110,10 @@ # Execute the portable graph on TVM # --------------------------------- # Now we can try deploying the compiled model on target. 
-from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor dtype = "float32" -m = graph_runtime.GraphModule(lib["default"](dev)) +m = graph_executor.GraphModule(lib["default"](dev)) # Set inputs m.set_input(input_name, tvm.nd.array(img.astype(dtype))) # Execute diff --git a/tutorials/frontend/from_tensorflow.py b/tutorials/frontend/from_tensorflow.py index 5a7c98105715f..96c001e4fd416 100644 --- a/tutorials/frontend/from_tensorflow.py +++ b/tutorials/frontend/from_tensorflow.py @@ -152,10 +152,10 @@ # --------------------------------- # Now we can try deploying the compiled model on target. -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor dtype = "uint8" -m = graph_runtime.GraphModule(lib["default"](dev)) +m = graph_executor.GraphModule(lib["default"](dev)) # set inputs m.set_input("DecodeJpeg/contents", tvm.nd.array(x.astype(dtype))) # execute diff --git a/tutorials/frontend/from_tflite.py b/tutorials/frontend/from_tflite.py index f7e8422c37b63..a85cfcea913cc 100644 --- a/tutorials/frontend/from_tflite.py +++ b/tutorials/frontend/from_tflite.py @@ -148,7 +148,7 @@ def extract(path): # -------------- import tvm from tvm import te -from tvm.contrib import graph_runtime as runtime +from tvm.contrib import graph_executor as runtime # Create a runtime executor module module = runtime.GraphModule(lib["default"](tvm.cpu())) diff --git a/tutorials/frontend/using_external_lib.py b/tutorials/frontend/using_external_lib.py index 667dc2de1b9b2..232f618bb28a1 100644 --- a/tutorials/frontend/using_external_lib.py +++ b/tutorials/frontend/using_external_lib.py @@ -34,7 +34,7 @@ import tvm from tvm import te import numpy as np -from tvm.contrib import graph_runtime as runtime +from tvm.contrib import graph_executor as runtime from tvm import relay from tvm.relay import testing import tvm.testing diff --git a/tutorials/get_started/relay_quick_start.py b/tutorials/get_started/relay_quick_start.py index 86c4ac4399ed6..fa9207604bac6 100644 --- a/tutorials/get_started/relay_quick_start.py +++ b/tutorials/get_started/relay_quick_start.py @@ -43,7 +43,7 @@ from tvm.relay import testing import tvm from tvm import te -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor import tvm.testing ###################################################################### @@ -104,13 +104,13 @@ ##################################################################### # Run the generate library # ------------------------ -# Now we can create graph runtime and run the module on Nvidia GPU. +# Now we can create graph executor and run the module on Nvidia GPU. 
# create random input dev = tvm.gpu() data = np.random.uniform(-1, 1, size=data_shape).astype("float32") # create module -module = graph_runtime.GraphModule(lib["default"](dev)) +module = graph_executor.GraphModule(lib["default"](dev)) # set input and parameters module.set_input("data", data) # run @@ -143,7 +143,7 @@ loaded_lib = tvm.runtime.load_module(path_lib) input_data = tvm.nd.array(np.random.uniform(size=data_shape).astype("float32")) -module = graph_runtime.GraphModule(loaded_lib["default"](dev)) +module = graph_executor.GraphModule(loaded_lib["default"](dev)) module.run(data=input_data) out_deploy = module.get_output(0).asnumpy() diff --git a/tutorials/micro/micro_tflite.py b/tutorials/micro/micro_tflite.py index 53271b29e20d0..f59b1c3723a8c 100644 --- a/tutorials/micro/micro_tflite.py +++ b/tutorials/micro/micro_tflite.py @@ -127,7 +127,7 @@ import tvm import tvm.micro as micro from tvm.contrib.download import download_testdata -from tvm.contrib import graph_runtime, utils +from tvm.contrib import graph_executor, utils from tvm import relay model_url = "https://people.linaro.org/~tom.gall/sine_model.tflite" @@ -257,7 +257,7 @@ flasher = compiler.flasher() with tvm.micro.Session(binary=micro_binary, flasher=flasher) as session: - graph_mod = tvm.micro.create_local_graph_runtime( + graph_mod = tvm.micro.create_local_graph_executor( graph, session.get_system_lib(), session.device ) diff --git a/vta/scripts/tune_resnet.py b/vta/scripts/tune_resnet.py index a10d1de8c46bc..dfb74b1297180 100644 --- a/vta/scripts/tune_resnet.py +++ b/vta/scripts/tune_resnet.py @@ -28,8 +28,8 @@ from tvm import rpc, autotvm, relay from tvm.autotvm.measure.measure_methods import request_remote from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner -from tvm.contrib import graph_runtime, utils, download -from tvm.contrib.debugger import debug_runtime +from tvm.contrib import graph_executor, utils, download +from tvm.contrib.debugger import debug_executor import vta from vta.testing import simulator from vta.top import graph_pack @@ -325,9 +325,9 @@ def tune_tasks( # If detailed runtime info is needed build with debug runtime if opt.debug_profile: - m = debug_runtime.create(graph, lib, ctx) + m = debug_executor.create(graph, lib, ctx) else: - m = graph_runtime.create(graph, lib, ctx) + m = graph_executor.create(graph, lib, ctx) # Set the network parameters and synthetic input image = tvm.nd.array((np.random.uniform(size=(1, 3, 224, 224))).astype("float32")) diff --git a/vta/tutorials/autotvm/tune_relay_vta.py b/vta/tutorials/autotvm/tune_relay_vta.py index ed2671c75ae85..7deb7408479a0 100644 --- a/vta/tutorials/autotvm/tune_relay_vta.py +++ b/vta/tutorials/autotvm/tune_relay_vta.py @@ -62,7 +62,7 @@ import tvm from tvm import te from tvm import rpc, autotvm, relay -from tvm.contrib import graph_runtime, utils, download +from tvm.contrib import graph_executor, utils, download from tvm.autotvm.measure.measure_methods import request_remote from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner @@ -431,9 +431,9 @@ def tune_and_evaluate(tuning_opt): remote.upload(temp.relpath("graphlib.tar")) lib = remote.load_module("graphlib.tar") - # Generate the graph runtime + # Generate the graph executor ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) - m = graph_runtime.GraphModule(lib["default"](ctx)) + m = graph_executor.GraphModule(lib["default"](ctx)) # upload parameters to device image = tvm.nd.array((np.random.uniform(size=(1, 3, 224, 
224))).astype("float32")) diff --git a/vta/tutorials/frontend/deploy_classification.py b/vta/tutorials/frontend/deploy_classification.py index 1bf4161a3340a..f9db824eafa3d 100644 --- a/vta/tutorials/frontend/deploy_classification.py +++ b/vta/tutorials/frontend/deploy_classification.py @@ -52,8 +52,8 @@ import tvm from tvm import te from tvm import rpc, autotvm, relay -from tvm.contrib import graph_runtime, utils, download -from tvm.contrib.debugger import debug_runtime +from tvm.contrib import graph_executor, utils, download +from tvm.contrib.debugger import debug_executor from tvm.relay import transform import vta @@ -135,7 +135,7 @@ ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) ###################################################################### -# Build the inference graph runtime +# Build the inference graph executor # --------------------------------- # Grab vision model from Gluon model zoo and compile with Relay. # The compilation steps are: @@ -147,7 +147,7 @@ # 4. Perform constant folding to reduce number of operators (e.g. eliminate batch norm multiply). # 5. Perform relay build to object file. # 6. Load the object file onto remote (FPGA device). -# 7. Generate graph runtime, `m`. +# 7. Generate graph executor, `m`. # # Load pre-configured AutoTVM schedules @@ -209,8 +209,8 @@ remote.upload(temp.relpath("graphlib.tar")) lib = remote.load_module("graphlib.tar") - # Graph runtime - m = graph_runtime.GraphModule(lib["default"](ctx)) + # Graph executor + m = graph_executor.GraphModule(lib["default"](ctx)) ###################################################################### # Perform image classification inference diff --git a/vta/tutorials/frontend/legacy/deploy_detection.py b/vta/tutorials/frontend/legacy/deploy_detection.py index cbb320e98f132..696d0508b9563 100644 --- a/vta/tutorials/frontend/legacy/deploy_detection.py +++ b/vta/tutorials/frontend/legacy/deploy_detection.py @@ -58,7 +58,7 @@ from tvm import rpc, autotvm, relay from tvm.relay.testing import yolo_detection, darknet from tvm.relay.testing.darknet import __darknetffi__ -from tvm.contrib import graph_runtime, utils +from tvm.contrib import graph_executor, utils from tvm.contrib.download import download_testdata from vta.testing import simulator from vta.top import graph_pack @@ -178,7 +178,7 @@ ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) #################################### -# Build the inference graph runtime. +# Build the inference graph executor. # ---------------------------------- # Using Darknet library load downloaded vision model and compile with Relay. # The compilation steps are: @@ -190,7 +190,7 @@ # 4. Perform constant folding to reduce number of operators (e.g. eliminate batch norm multiply). # 5. Perform relay build to object file. # 6. Load the object file onto remote (FPGA device). -# 7. Generate graph runtime, `m`. +# 7. Generate graph executor, `m`. # # Load pre-configured AutoTVM schedules @@ -246,8 +246,8 @@ remote.upload(temp.relpath("graphlib.tar")) lib = remote.load_module("graphlib.tar") - # Graph runtime - m = graph_runtime.GraphModule(lib["default"](ctx)) + # Graph executor + m = graph_executor.GraphModule(lib["default"](ctx)) #################################### # Perform image detection inference. 
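Taken together, the hunks above are one mechanical rename: the import moves from tvm.contrib.graph_runtime to tvm.contrib.graph_executor, and the create()/GraphModule call sites change module name only, with identical signatures. A minimal sketch of the post-rename flow, assuming a toy add-one Relay function (the input name "x" and the workload are illustrative, not taken from this patch):

import numpy as np
import tvm
from tvm import relay
from tvm.contrib import graph_executor  # was: tvm.contrib.graph_runtime

# Build a trivial Relay module computing y = x + 1.
x = relay.var("x", shape=(2, 2), dtype="float32")
mod = tvm.IRModule.from_expr(relay.Function([x], relay.add(x, relay.const(1.0))))
with tvm.transform.PassContext(opt_level=3):
    lib = relay.build(mod, target="llvm")

# Factory-style creation, the pattern used throughout the updated tutorials.
dev = tvm.cpu(0)
m = graph_executor.GraphModule(lib["default"](dev))
m.set_input("x", tvm.nd.array(np.ones((2, 2), dtype="float32")))
m.run()
print(m.get_output(0).asnumpy())
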
diff --git a/web/emcc/wasm_runtime.cc b/web/emcc/wasm_runtime.cc index ba880e7b5c0ae..bcad656678e51 100644 --- a/web/emcc/wasm_runtime.cc +++ b/web/emcc/wasm_runtime.cc @@ -34,7 +34,7 @@ #include "src/runtime/c_runtime_api.cc" #include "src/runtime/cpu_device_api.cc" #include "src/runtime/file_utils.cc" -#include "src/runtime/graph/graph_runtime.cc" +#include "src/runtime/graph_executor/graph_executor.cc" #include "src/runtime/library_module.cc" #include "src/runtime/logging.cc" #include "src/runtime/module.cc" diff --git a/web/src/runtime.ts b/web/src/runtime.ts index 968dd9acbb5be..a76096ebba4df 100644 --- a/web/src/runtime.ts +++ b/web/src/runtime.ts @@ -570,13 +570,13 @@ export class Module implements Disposable { } /** - * Graph runtime. + * Graph executor. * * This is a thin wrapper of the underlying TVM module. * you can also directly call set_input, run, and get_output * of underlying module functions */ -class GraphRuntime implements Disposable { +class GraphExecutor implements Disposable { module: Module; private packedSetInput: PackedFunc; private packedRun: PackedFunc; @@ -986,24 +986,20 @@ export class Instance implements Disposable { } /** - * Create a new graph runtime. + * Create a new graph executor. * - * @param graphJson The graph runtime json file. + * @param graphJson The graph executor json file. * @param lib The underlying library. * @param dev The execution device of the graph. */ - createGraphRuntime( - graphJson: string, - lib: Module, - dev: DLDevice - ): GraphRuntime { - const fcreate = this.getGlobalFunc("tvm.graph_runtime.create"); + createGraphExecutor(graphJson: string, lib: Module, dev: DLDevice): GraphExecutor { + const fcreate = this.getGlobalFunc('tvm.graph_executor.create'); const module = fcreate( graphJson, lib, this.scalar(dev.deviceType, "int32"), this.scalar(dev.deviceId, "int32")) as Module; - return new GraphRuntime(module); + return new GraphExecutor(module); }
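
The TypeScript wrapper above mirrors the same rename on the web runtime side: GraphRuntime becomes GraphExecutor, and the factory global it looks up becomes tvm.graph_executor.create. For Python code that must run against both pre- and post-rename TVM builds, a guarded import keeps call sites stable; this is a hedged migration idiom, not part of the patch itself:

import tvm

# Hypothetical compatibility shim; older TVM builds predate the rename.
try:
    from tvm.contrib import graph_executor
except ImportError:
    from tvm.contrib import graph_runtime as graph_executor

def make_runtime(graph_json, lib, params):
    """Create a runtime module the way the updated tests do, via the
    legacy three-argument create() entry point."""
    rt_mod = graph_executor.create(graph_json, lib, tvm.cpu(0))
    rt_mod.set_input(**params)
    return rt_mod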