From 40207302610ab858dd5eaac37fd83a8f032c1f2e Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Fri, 1 Jan 2021 22:20:39 +0900 Subject: [PATCH 01/16] improve scatter 4d init --- python/tvm/topi/cuda/scatter.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/python/tvm/topi/cuda/scatter.py b/python/tvm/topi/cuda/scatter.py index be602c8ab7a3..54c8220f5f91 100644 --- a/python/tvm/topi/cuda/scatter.py +++ b/python/tvm/topi/cuda/scatter.py @@ -312,19 +312,17 @@ def gen_ir_4d(data, indices, updates, axis, out, update_func): out_ptr = ib.buffer_ptr(out) data_ptr = ib.buffer_ptr(data) with ib.new_scope(): - i = te.thread_axis("blockIdx.x") - ib.scope_attr(i, "thread_extent", n) - j = te.thread_axis("blockIdx.y") - ib.scope_attr(j, "thread_extent", c) - k = te.thread_axis("blockIdx.z") - ib.scope_attr(k, "thread_extent", h) + fused = n * c * h * w + num_thread = 1024 + num_blocks = ceil_div(fused, num_thread) + bx = te.thread_axis("blockIdx.x") + ib.scope_attr(bx, "thread_extent", num_blocks) tx = te.thread_axis("threadIdx.x") - ib.scope_attr(tx, "thread_extent", warp_size) - with ib.for_range(0, ceil_div(w, warp_size), name="l") as l_: - l = l_ * warp_size + tx - with ib.if_scope(l < w): - idx = ((i * c + j) * h + k) * w + l - out_ptr[idx] = data_ptr[idx] + ib.scope_attr(tx, "thread_extent", num_thread) + tid = bx * num_thread + tx + + with ib.if_scope(tid < fused): + out_ptr[tid] = data_ptr[tid] indices_ptr = ib.buffer_ptr(indices) updates_ptr = ib.buffer_ptr(updates) From cb0e21e98c4af604402b47868dd8aabb4b2c1d56 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Fri, 1 Jan 2021 22:20:58 +0900 Subject: [PATCH 02/16] do not launch sorting based scatter for small input --- python/tvm/topi/cuda/scatter.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/tvm/topi/cuda/scatter.py b/python/tvm/topi/cuda/scatter.py index 54c8220f5f91..721314aa4fad 100644 --- a/python/tvm/topi/cuda/scatter.py +++ b/python/tvm/topi/cuda/scatter.py @@ -21,6 +21,7 @@ from ..scatter import _verify_scatter_nd_inputs from .nms import atomic_add from .sort import stable_sort_by_key_thrust, is_thrust_available +from ..utils import get_const_int def ceil_div(a, b): @@ -547,7 +548,12 @@ def update_func(dst_ptr, dst_index, update): in_bufs = [data] - if rank == 1 and is_thrust_available(): + def is_small_scatter(dim): + if isinstance(data.shape[0], tvm.tir.expr.Any): + return False + return get_const_int(dim) < 50 + + if rank == 1 and is_thrust_available() and not is_small_scatter(indices.shape[0]): ir_funcs[1] = gen_scatter_1d_thrust indices_sorted, updates_sorted = stable_sort_by_key_thrust( indices, updates, for_scatter=True From 1b7610932cc25d1aa00c7d40033911229221d4b4 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Sat, 9 Jan 2021 06:08:33 +0900 Subject: [PATCH 03/16] do not use hard coded num threads --- python/tvm/topi/cuda/scatter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/tvm/topi/cuda/scatter.py b/python/tvm/topi/cuda/scatter.py index 721314aa4fad..9cd0eeab7f92 100644 --- a/python/tvm/topi/cuda/scatter.py +++ b/python/tvm/topi/cuda/scatter.py @@ -314,8 +314,9 @@ def gen_ir_4d(data, indices, updates, axis, out, update_func): data_ptr = ib.buffer_ptr(data) with ib.new_scope(): fused = n * c * h * w - num_thread = 1024 + num_thread = int(tvm.target.Target.current(allow_none=False).max_num_threads) num_blocks = ceil_div(fused, num_thread) + bx = te.thread_axis("blockIdx.x") ib.scope_attr(bx, "thread_extent", 
num_blocks) tx = te.thread_axis("threadIdx.x") From d60acbea09c74609a974bcbe430c5cc0538155c7 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 11 Jan 2021 06:55:05 +0900 Subject: [PATCH 04/16] separate sort based implementation --- python/tvm/relay/op/strategy/cuda.py | 13 ++++ python/tvm/topi/cuda/scatter.py | 100 +++++++++++++++++---------- 2 files changed, 75 insertions(+), 38 deletions(-) diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 04c16ddd344c..781d6b5cb8e7 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -787,6 +787,19 @@ def scatter_cuda(attrs, inputs, out_type, target): name="scatter.cuda", plevel=10, ) + + rank = len(inputs[0].shape) + + with SpecializedCondition(rank == 1): + if target.kind.name == "cuda" and get_global_func( + "tvm.contrib.thrust.stable_sort_by_key", allow_missing=True + ): + strategy.add_implementation( + wrap_compute_sort(topi.cuda.scatter1d_via_sort), + wrap_topi_schedule(topi.cuda.schedule_extern), + name="scatter_thrust.cuda", + plevel=9, # use the sequential version by default + ) return strategy diff --git a/python/tvm/topi/cuda/scatter.py b/python/tvm/topi/cuda/scatter.py index 9cd0eeab7f92..4481ab51323b 100644 --- a/python/tvm/topi/cuda/scatter.py +++ b/python/tvm/topi/cuda/scatter.py @@ -417,7 +417,63 @@ def gen_ir_4d(data, indices, updates, axis, out, update_func): return ib.get() -def gen_scatter_1d_thrust(data, indices_sorted, updates_sorted, axis, out, _): +def scatter(data, indices, updates, axis=0): + """Update data at positions defined by indices with values in updates + + Parameters + ---------- + data : relay.Expr + The input data to the operator. + + indices : relay.Expr + The index locations to update. + + updates : relay.Expr + The values to update. + + axis : int + The axis to scatter on + + Returns + ------- + ret : relay.Expr + The computed result. + """ + if axis < 0: + axis += len(data.shape) + assert axis >= 0 + assert axis < len(data.shape) + + rank = len(data.shape) + assert 1 <= rank <= 4, "scatter only supports 1-4 dimensions" + + ir_funcs = { + 1: gen_ir_1d, + 2: gen_ir_2d, + 3: gen_ir_3d, + 4: gen_ir_4d, + } + + def update_func(dst_ptr, dst_index, update): + dst_ptr[dst_index] = update + + out_shape = data.shape + out_buf = tvm.tir.decl_buffer(out_shape, data.dtype, "out_buf") + + out = te.extern( + [out_shape], + [data, indices, updates], + lambda ins, outs: ir_funcs[rank](ins[0], ins[1], ins[2], axis, outs[0], update_func), + dtype=data.dtype, + out_buffers=[out_buf], + name="scatter_gpu", + tag="scatter_gpu", + ) + + return out + + +def gen_scatter_1d_thrust(data, indices_sorted, updates_sorted, out): """Generate scatter ir for 1d inputs, using a sorting based approach. By sorting indices and comparing neighboring two indices, we can tell which of elements in the indices tensor can scatter its update value into the output. @@ -438,9 +494,6 @@ def gen_scatter_1d_thrust(data, indices_sorted, updates_sorted, axis, out, _): updates : tir.Tensor The values to update, sorted by indices. - axis : int - The axis to scatter on. It must be 0 for this function. - out : tir.Tensor The output tensor. @@ -449,7 +502,6 @@ def gen_scatter_1d_thrust(data, indices_sorted, updates_sorted, axis, out, _): ret : tir The computational ir. 
""" - assert axis == 0 n = data.shape[0] ib = tvm.tir.ir_builder.create() @@ -504,7 +556,7 @@ def gen_scatter_1d_thrust(data, indices_sorted, updates_sorted, axis, out, _): return ib.get() -def scatter(data, indices, updates, axis=0): +def scatter1d_via_sort(data, indices, updates, axis=0): """Update data at positions defined by indices with values in updates Parameters @@ -528,45 +580,17 @@ def scatter(data, indices, updates, axis=0): """ if axis < 0: axis += len(data.shape) - assert axis >= 0 - assert axis < len(data.shape) - - rank = len(data.shape) - assert 1 <= rank <= 4, "scatter only supports 1-4 dimensions" - - ir_funcs = { - 1: gen_ir_1d, - 2: gen_ir_2d, - 3: gen_ir_3d, - 4: gen_ir_4d, - } - - def update_func(dst_ptr, dst_index, update): - dst_ptr[dst_index] = update + assert axis == 0 and len(data.shape) == 1, "sorting based scatter only supported for 1d input" out_shape = data.shape out_buf = tvm.tir.decl_buffer(out_shape, data.dtype, "out_buf") - in_bufs = [data] - - def is_small_scatter(dim): - if isinstance(data.shape[0], tvm.tir.expr.Any): - return False - return get_const_int(dim) < 50 - - if rank == 1 and is_thrust_available() and not is_small_scatter(indices.shape[0]): - ir_funcs[1] = gen_scatter_1d_thrust - indices_sorted, updates_sorted = stable_sort_by_key_thrust( - indices, updates, for_scatter=True - ) - in_bufs += [indices_sorted, updates_sorted] - else: - in_bufs += [indices, updates] + indices_sorted, updates_sorted = stable_sort_by_key_thrust(indices, updates, for_scatter=True) out = te.extern( [out_shape], - in_bufs, - lambda ins, outs: ir_funcs[rank](ins[0], ins[1], ins[2], axis, outs[0], update_func), + [data, indices_sorted, updates_sorted], + lambda ins, outs: gen_scatter_1d_thrust(ins[0], ins[1], ins[2], outs[0]), dtype=data.dtype, out_buffers=[out_buf], name="scatter_gpu", From b6d65410e9d2852cbe9f3cfe1091c4742664716d Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 11 Jan 2021 10:23:37 +0900 Subject: [PATCH 05/16] register scatter as autotvm task --- python/tvm/relay/op/strategy/cuda.py | 2 +- python/tvm/topi/cuda/scatter.py | 12 +++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 781d6b5cb8e7..3e1f43c1f0b0 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -797,7 +797,7 @@ def scatter_cuda(attrs, inputs, out_type, target): strategy.add_implementation( wrap_compute_sort(topi.cuda.scatter1d_via_sort), wrap_topi_schedule(topi.cuda.schedule_extern), - name="scatter_thrust.cuda", + name="scatter_via_sort.cuda", plevel=9, # use the sequential version by default ) return strategy diff --git a/python/tvm/topi/cuda/scatter.py b/python/tvm/topi/cuda/scatter.py index 4481ab51323b..50a089255bb6 100644 --- a/python/tvm/topi/cuda/scatter.py +++ b/python/tvm/topi/cuda/scatter.py @@ -21,7 +21,6 @@ from ..scatter import _verify_scatter_nd_inputs from .nms import atomic_add from .sort import stable_sort_by_key_thrust, is_thrust_available -from ..utils import get_const_int def ceil_div(a, b): @@ -417,7 +416,8 @@ def gen_ir_4d(data, indices, updates, axis, out, update_func): return ib.get() -def scatter(data, indices, updates, axis=0): +@autotvm.register_topi_compute("scatter.cuda") +def scatter(_, data, indices, updates, axis=0): """Update data at positions defined by indices with values in updates Parameters @@ -556,7 +556,8 @@ def gen_scatter_1d_thrust(data, indices_sorted, updates_sorted, out): return ib.get() 
-def scatter1d_via_sort(data, indices, updates, axis=0): +@autotvm.register_topi_compute("scatter_via_sort.cuda") +def scatter1d_via_sort(_, data, indices, updates, axis=0): """Update data at positions defined by indices with values in updates Parameters @@ -581,6 +582,7 @@ def scatter1d_via_sort(data, indices, updates, axis=0): if axis < 0: axis += len(data.shape) assert axis == 0 and len(data.shape) == 1, "sorting based scatter only supported for 1d input" + assert is_thrust_available(), "Thrust is required for this op" out_shape = data.shape out_buf = tvm.tir.decl_buffer(out_shape, data.dtype, "out_buf") @@ -593,8 +595,8 @@ def scatter1d_via_sort(data, indices, updates, axis=0): lambda ins, outs: gen_scatter_1d_thrust(ins[0], ins[1], ins[2], outs[0]), dtype=data.dtype, out_buffers=[out_buf], - name="scatter_gpu", - tag="scatter_gpu", + name="scatter_via_sort_gpu", + tag="scatter_via_sort_gpu", ) return out From 9c87d8d20cd23284fd8a4464cd2c85233ad1e23f Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 11 Jan 2021 10:55:05 +0900 Subject: [PATCH 06/16] add missing import --- python/tvm/topi/cuda/scatter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/topi/cuda/scatter.py b/python/tvm/topi/cuda/scatter.py index 50a089255bb6..205ddf5f0dce 100644 --- a/python/tvm/topi/cuda/scatter.py +++ b/python/tvm/topi/cuda/scatter.py @@ -17,7 +17,7 @@ # pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, too-many-statements, singleton-comparison, unused-argument """Scatter operator """ import tvm -from tvm import te +from tvm import te, autotvm from ..scatter import _verify_scatter_nd_inputs from .nms import atomic_add from .sort import stable_sort_by_key_thrust, is_thrust_available From 9b077db64b6af263e51221c95cf69de988c7badd Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 11 Jan 2021 11:41:13 +0900 Subject: [PATCH 07/16] fix strategy --- python/tvm/relay/op/strategy/cuda.py | 4 ++-- python/tvm/relay/op/strategy/generic.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 3e1f43c1f0b0..7e2b29dc2286 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -795,8 +795,8 @@ def scatter_cuda(attrs, inputs, out_type, target): "tvm.contrib.thrust.stable_sort_by_key", allow_missing=True ): strategy.add_implementation( - wrap_compute_sort(topi.cuda.scatter1d_via_sort), - wrap_topi_schedule(topi.cuda.schedule_extern), + wrap_compute_scatter(topi.cuda.scatter1d_via_sort), + wrap_topi_schedule(topi.generic.schedule_extern), name="scatter_via_sort.cuda", plevel=9, # use the sequential version by default ) diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index 363832ef8b2f..8dd9dc5844dd 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -1123,7 +1123,7 @@ def wrap_compute_scatter(topi_compute): """Wrap scatter topi compute""" def _compute_scatter(attrs, inputs, _): - return [topi_compute(inputs[0], inputs[1], inputs[2], axis=attrs.axis)] + return [topi_compute(inputs[0], inputs[1], inputs[2], attrs.axis)] return _compute_scatter From 3090af8e87d73442e93b15c21c3e4937996d7b4b Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 11 Jan 2021 12:06:53 +0900 Subject: [PATCH 08/16] add dedicated schedule and dummy flop --- python/tvm/relay/op/strategy/cuda.py | 6 +++--- python/tvm/topi/cuda/scatter.py | 19 
+++++++++++++++++-- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 7e2b29dc2286..3863df0fd831 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -783,7 +783,7 @@ def scatter_cuda(attrs, inputs, out_type, target): strategy = _op.OpStrategy() strategy.add_implementation( wrap_compute_scatter(topi.cuda.scatter), - wrap_topi_schedule(topi.generic.schedule_extern), + wrap_topi_schedule(topi.cuda.schedule_scatter), name="scatter.cuda", plevel=10, ) @@ -795,8 +795,8 @@ def scatter_cuda(attrs, inputs, out_type, target): "tvm.contrib.thrust.stable_sort_by_key", allow_missing=True ): strategy.add_implementation( - wrap_compute_scatter(topi.cuda.scatter1d_via_sort), - wrap_topi_schedule(topi.generic.schedule_extern), + wrap_compute_scatter(topi.cuda.scatter_via_sort), + wrap_topi_schedule(topi.cuda.schedule_scatter_via_sort), name="scatter_via_sort.cuda", plevel=9, # use the sequential version by default ) diff --git a/python/tvm/topi/cuda/scatter.py b/python/tvm/topi/cuda/scatter.py index 205ddf5f0dce..b9bf1442533d 100644 --- a/python/tvm/topi/cuda/scatter.py +++ b/python/tvm/topi/cuda/scatter.py @@ -19,6 +19,7 @@ import tvm from tvm import te, autotvm from ..scatter import _verify_scatter_nd_inputs +from ..generic import schedule_extern from .nms import atomic_add from .sort import stable_sort_by_key_thrust, is_thrust_available @@ -417,7 +418,7 @@ def gen_ir_4d(data, indices, updates, axis, out, update_func): @autotvm.register_topi_compute("scatter.cuda") -def scatter(_, data, indices, updates, axis=0): +def scatter(cfg, data, indices, updates, axis=0): """Update data at positions defined by indices with values in updates Parameters @@ -460,6 +461,8 @@ def update_func(dst_ptr, dst_index, update): out_shape = data.shape out_buf = tvm.tir.decl_buffer(out_shape, data.dtype, "out_buf") + cfg.add_flop(1) + out = te.extern( [out_shape], [data, indices, updates], @@ -473,6 +476,11 @@ def update_func(dst_ptr, dst_index, update): return out +@autotvm.register_topi_schedule("scatter.cuda") +def schedule_scatter(_, outs): + return schedule_extern(outs) + + def gen_scatter_1d_thrust(data, indices_sorted, updates_sorted, out): """Generate scatter ir for 1d inputs, using a sorting based approach. 
By sorting indices and comparing neighboring two indices, we can tell which @@ -557,7 +565,7 @@ def gen_scatter_1d_thrust(data, indices_sorted, updates_sorted, out): @autotvm.register_topi_compute("scatter_via_sort.cuda") -def scatter1d_via_sort(_, data, indices, updates, axis=0): +def scatter_via_sort(cfg, data, indices, updates, axis=0): """Update data at positions defined by indices with values in updates Parameters @@ -584,6 +592,8 @@ def scatter1d_via_sort(_, data, indices, updates, axis=0): assert axis == 0 and len(data.shape) == 1, "sorting based scatter only supported for 1d input" assert is_thrust_available(), "Thrust is required for this op" + cfg.add_flop(1) + out_shape = data.shape out_buf = tvm.tir.decl_buffer(out_shape, data.dtype, "out_buf") @@ -602,6 +612,11 @@ def scatter1d_via_sort(_, data, indices, updates, axis=0): return out +@autotvm.register_topi_schedule("scatter_via_sort.cuda") +def schedule_scatter_via_sort(_, outs): + return schedule_extern(outs) + + def gen_scatter_add_1d_atomic(data, indices, updates, axis, out, _): """Generate scatter add ir for 1d inputs, using atomic_add instruction From 28c08dcdd04220546c0ed4332574821e33bcf842 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 11 Jan 2021 12:13:55 +0900 Subject: [PATCH 09/16] add test tuning script --- tutorials/autotvm/test_scatter.py | 135 ++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 tutorials/autotvm/test_scatter.py diff --git a/tutorials/autotvm/test_scatter.py b/tutorials/autotvm/test_scatter.py new file mode 100644 index 000000000000..c8899a517692 --- /dev/null +++ b/tutorials/autotvm/test_scatter.py @@ -0,0 +1,135 @@ +import os + +import numpy as np +import logging +import tvm +from tvm import relay, autotvm +import tvm.relay.testing +from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner +import tvm.contrib.graph_runtime as runtime + + +def simple_mod(dshape, ishape, axis=0): + d = relay.var("d", relay.TensorType(dshape, "float32")) + i = relay.var("i", relay.TensorType(ishape, "int64")) + u = relay.var("u", relay.TensorType(ishape, "float32")) + z = relay.op.scatter(d, i, u, axis) + func = relay.Function([d, i, u], z) + mod = tvm.IRModule() + mod["main"] = func + return mod, {} + + +target = "cuda" + +#### TUNING OPTION #### +network = "scatter" +log_file = "%s.log" % network + +tuning_option = { + "log_filename": log_file, + "tuner": "xgb", + "n_trial": 10, + "early_stopping": 10, + "measure_option": autotvm.measure_option( + builder=autotvm.LocalBuilder(timeout=10), + runner=autotvm.LocalRunner(number=20, repeat=3, timeout=4, min_repeat_ms=150), + ), +} + +def tune_tasks( + tasks, + measure_option, + tuner="xgb", + n_trial=1000, + early_stopping=None, + log_filename="tuning.log", + use_transfer_learning=True, +): + # create tmp log file + tmp_log_file = log_filename + ".tmp" + if os.path.exists(tmp_log_file): + os.remove(tmp_log_file) + + for i, tsk in enumerate(reversed(tasks)): + prefix = "[Task %2d/%2d] " % (i + 1, len(tasks)) + + # create tuner + if tuner == "xgb" or tuner == "xgb-rank": + tuner_obj = XGBTuner(tsk, loss_type="rank") + elif tuner == "ga": + tuner_obj = GATuner(tsk, pop_size=100) + elif tuner == "random": + tuner_obj = RandomTuner(tsk) + elif tuner == "gridsearch": + tuner_obj = GridSearchTuner(tsk) + else: + raise ValueError("Invalid tuner: " + tuner) + + if use_transfer_learning: + if os.path.isfile(tmp_log_file): + tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file)) + + # do tuning + 
tsk_trial = min(n_trial, len(tsk.config_space)) + tuner_obj.tune( + n_trial=tsk_trial, + early_stopping=early_stopping, + measure_option=measure_option, + callbacks=[ + autotvm.callback.progress_bar(tsk_trial, prefix=prefix), + autotvm.callback.log_to_file(tmp_log_file), + ], + ) + + # pick best records to a cache file + autotvm.record.pick_best(tmp_log_file, log_filename) + os.remove(tmp_log_file) + +def tune_and_evaluate(tuning_opt): + # extract workloads from relay program + print("Extract tasks...") + size = (5000, ) + dshape = ishape = size + axis = 0 + mod, params = simple_mod(size, size, axis) + tasks = autotvm.task.extract_from_program( + mod["main"], target=target, params=params, ops=(relay.op.get("scatter"),) + ) + + # run tuning tasks + print("Tuning...") + tune_tasks(tasks, **tuning_opt) + + # compile kernels with history best records + with autotvm.apply_history_best(log_file): + print("Compile...") + with tvm.transform.PassContext(opt_level=3): + lib = relay.build_module.build(mod, target=target, params=params) + + # load parameters + ctx = tvm.context(str(target), 0) + module = runtime.GraphModule(lib["default"](ctx)) + + data_np = np.random.uniform(size=dshape).astype("float32") + updates_np = np.random.uniform(size=ishape).astype("float32") + indices_np = np.random.randint(-dshape[axis], dshape[axis] - 1, ishape).astype("int64") + + module.set_input("d", data_np) + module.set_input("i", indices_np) + module.set_input("u", updates_np) + + # evaluate + print("Evaluate inference time cost...") + ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=600) + prof_res = np.array(ftimer().results) * 1000 # convert to millisecond + print( + "Mean inference time (std dev): %.2f ms (%.2f ms)" + % (np.mean(prof_res), np.std(prof_res)) + ) + + +# We do not run the tuning in our webpage server since it takes too long. +# Uncomment the following line to run it by yourself. +logging.basicConfig(level=logging.DEBUG) +tune_and_evaluate(tuning_option) From bf27ba8cacd8ffef05a339e0ece2e1f7e391f6f6 Mon Sep 17 00:00:00 2001 From: masa Date: Tue, 12 Jan 2021 06:55:20 +0900 Subject: [PATCH 10/16] try adding dummy knob --- python/tvm/topi/cuda/scatter.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/tvm/topi/cuda/scatter.py b/python/tvm/topi/cuda/scatter.py index b9bf1442533d..aaafb5cb0db3 100644 --- a/python/tvm/topi/cuda/scatter.py +++ b/python/tvm/topi/cuda/scatter.py @@ -461,7 +461,8 @@ def update_func(dst_ptr, dst_index, update): out_shape = data.shape out_buf = tvm.tir.decl_buffer(out_shape, data.dtype, "out_buf") - cfg.add_flop(1) + cfg.define_knob("dummy", [1]) + cfg.add_flop(1) # dummy value to satisfy AutoTVM out = te.extern( [out_shape], @@ -592,6 +593,7 @@ def scatter_via_sort(cfg, data, indices, updates, axis=0): assert axis == 0 and len(data.shape) == 1, "sorting based scatter only supported for 1d input" assert is_thrust_available(), "Thrust is required for this op" + cfg.define_knob("dummy", [1]) cfg.add_flop(1) out_shape = data.shape From a876443e47387e7e3bf4b3168b6d26c419a398f7 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 18 Jan 2021 21:52:28 +0900 Subject: [PATCH 11/16] skip random_fill when a tuning workload is from scatter This reverts commit 1fed88321e640b509fc46fac7da3b3cb79719552. 
--- python/tvm/autotvm/measure/measure_methods.py | 6 +++--- python/tvm/topi/cuda/scatter.py | 6 ++---- tutorials/autotvm/test_scatter.py | 2 +- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index cb801ba72872..8e7fb3c6bed1 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -561,9 +561,9 @@ def run_through_rpc( "Please make sure USE_RANDOM is ON in the config.cmake " "on the remote devices" ) args = [nd.empty(x[0], dtype=x[1], ctx=ctx) for x in build_result.arg_info] - for arg in args: - random_fill(arg) - ctx.sync() + if "scatter" not in measure_input.task.name: + for arg in args: + random_fill(arg) costs = time_f(*args).results diff --git a/python/tvm/topi/cuda/scatter.py b/python/tvm/topi/cuda/scatter.py index aaafb5cb0db3..b111ddb6b2ed 100644 --- a/python/tvm/topi/cuda/scatter.py +++ b/python/tvm/topi/cuda/scatter.py @@ -461,8 +461,7 @@ def update_func(dst_ptr, dst_index, update): out_shape = data.shape out_buf = tvm.tir.decl_buffer(out_shape, data.dtype, "out_buf") - cfg.define_knob("dummy", [1]) - cfg.add_flop(1) # dummy value to satisfy AutoTVM + cfg.add_flop(1) # A dummy value to satisfy AutoTVM out = te.extern( [out_shape], @@ -593,8 +592,7 @@ def scatter_via_sort(cfg, data, indices, updates, axis=0): assert axis == 0 and len(data.shape) == 1, "sorting based scatter only supported for 1d input" assert is_thrust_available(), "Thrust is required for this op" - cfg.define_knob("dummy", [1]) - cfg.add_flop(1) + cfg.add_flop(1) # A dummy value to satisfy AutoTVM out_shape = data.shape out_buf = tvm.tir.decl_buffer(out_shape, data.dtype, "out_buf") diff --git a/tutorials/autotvm/test_scatter.py b/tutorials/autotvm/test_scatter.py index c8899a517692..108921dfed86 100644 --- a/tutorials/autotvm/test_scatter.py +++ b/tutorials/autotvm/test_scatter.py @@ -89,7 +89,7 @@ def tune_tasks( def tune_and_evaluate(tuning_opt): # extract workloads from relay program print("Extract tasks...") - size = (5000, ) + size = (10000, ) dshape = ishape = size axis = 0 mod, params = simple_mod(size, size, axis) From 5cfd32e5c21f9fd6f513675bc8144bf87539eea5 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 18 Jan 2021 22:26:27 +0900 Subject: [PATCH 12/16] cleanup memcpy ir --- python/tvm/topi/cuda/scatter.py | 61 +++++++++++---------------------- 1 file changed, 20 insertions(+), 41 deletions(-) diff --git a/python/tvm/topi/cuda/scatter.py b/python/tvm/topi/cuda/scatter.py index b111ddb6b2ed..b34bd1df14e4 100644 --- a/python/tvm/topi/cuda/scatter.py +++ b/python/tvm/topi/cuda/scatter.py @@ -22,12 +22,28 @@ from ..generic import schedule_extern from .nms import atomic_add from .sort import stable_sort_by_key_thrust, is_thrust_available +from ..utils import prod def ceil_div(a, b): return (a + b - 1) // b +def _memcpy_ir(ib, out_ptr, data_ptr, shape): + fused = prod(shape) + with ib.new_scope(): + num_thread = int(tvm.target.Target.current(allow_none=False).max_num_threads) + num_blocks = ceil_div(fused, num_thread) + bx = te.thread_axis("blockIdx.x") + ib.scope_attr(bx, "thread_extent", num_blocks) + tx = te.thread_axis("threadIdx.x") + ib.scope_attr(tx, "thread_extent", num_thread) + tid = bx * num_thread + tx + + with ib.if_scope(tid < fused): + out_ptr[tid] = data_ptr[tid] + + def gen_ir_1d(data, indices, updates, axis, out, update_func): """Generate scatter ir for 1d inputs @@ -64,10 +80,7 @@ def gen_ir_1d(data, indices, updates, 
axis, out, update_func): out_ptr = ib.buffer_ptr(out) data_ptr = ib.buffer_ptr(data) - with ib.new_scope(): - bx = te.thread_axis("blockIdx.x") - ib.scope_attr(bx, "thread_extent", n) - out_ptr[bx] = data_ptr[bx] + _memcpy_ir(ib, out_ptr, data_ptr, data.shape) indices_ptr = ib.buffer_ptr(indices) updates_ptr = ib.buffer_ptr(updates) @@ -115,8 +128,6 @@ def gen_ir_2d(data, indices, updates, axis, out, update_func): ret : tir The computational ir. """ - warp_size = tvm.target.Target.current(False).thread_warp_size - n = data.shape[0] c = data.shape[1] @@ -125,16 +136,7 @@ def gen_ir_2d(data, indices, updates, axis, out, update_func): out_ptr = ib.buffer_ptr(out) data_ptr = ib.buffer_ptr(data) - with ib.new_scope(): - bx = te.thread_axis("blockIdx.x") - ib.scope_attr(bx, "thread_extent", n) - tx = te.thread_axis("threadIdx.x") - ib.scope_attr(tx, "thread_extent", warp_size) - with ib.for_range(0, ceil_div(c, warp_size), name="j") as j_: - j = j_ * warp_size + tx - with ib.if_scope(j < c): - idx = bx * c + j - out_ptr[idx] = data_ptr[idx] + _memcpy_ir(ib, out_ptr, data_ptr, data.shape) indices_ptr = ib.buffer_ptr(indices) updates_ptr = ib.buffer_ptr(updates) @@ -206,18 +208,7 @@ def gen_ir_3d(data, indices, updates, axis, out, update_func): out_ptr = ib.buffer_ptr(out) data_ptr = ib.buffer_ptr(data) - with ib.new_scope(): - bx = te.thread_axis("blockIdx.x") - ib.scope_attr(bx, "thread_extent", n) - by = te.thread_axis("blockIdx.y") - ib.scope_attr(by, "thread_extent", c) - tx = te.thread_axis("threadIdx.x") - ib.scope_attr(tx, "thread_extent", warp_size) - with ib.for_range(0, ceil_div(h, warp_size), name="k") as k_: - k = k_ * warp_size + tx - with ib.if_scope(k < h): - idx = (bx * c + by) * h + k - out_ptr[idx] = data_ptr[idx] + _memcpy_ir(ib, out_ptr, data_ptr, data.shape) indices_ptr = ib.buffer_ptr(indices) updates_ptr = ib.buffer_ptr(updates) @@ -312,19 +303,7 @@ def gen_ir_4d(data, indices, updates, axis, out, update_func): out_ptr = ib.buffer_ptr(out) data_ptr = ib.buffer_ptr(data) - with ib.new_scope(): - fused = n * c * h * w - num_thread = int(tvm.target.Target.current(allow_none=False).max_num_threads) - num_blocks = ceil_div(fused, num_thread) - - bx = te.thread_axis("blockIdx.x") - ib.scope_attr(bx, "thread_extent", num_blocks) - tx = te.thread_axis("threadIdx.x") - ib.scope_attr(tx, "thread_extent", num_thread) - tid = bx * num_thread + tx - - with ib.if_scope(tid < fused): - out_ptr[tid] = data_ptr[tid] + _memcpy_ir(ib, out_ptr, data_ptr, data.shape) indices_ptr = ib.buffer_ptr(indices) updates_ptr = ib.buffer_ptr(updates) From 528f755ae4d1a35a1c4b913ab90ea3eddd60b1db Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 18 Jan 2021 22:29:16 +0900 Subject: [PATCH 13/16] remove scatter tuning script --- tutorials/autotvm/test_scatter.py | 135 ------------------------------ 1 file changed, 135 deletions(-) delete mode 100644 tutorials/autotvm/test_scatter.py diff --git a/tutorials/autotvm/test_scatter.py b/tutorials/autotvm/test_scatter.py deleted file mode 100644 index 108921dfed86..000000000000 --- a/tutorials/autotvm/test_scatter.py +++ /dev/null @@ -1,135 +0,0 @@ -import os - -import numpy as np -import logging -import tvm -from tvm import relay, autotvm -import tvm.relay.testing -from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner -import tvm.contrib.graph_runtime as runtime - - -def simple_mod(dshape, ishape, axis=0): - d = relay.var("d", relay.TensorType(dshape, "float32")) - i = relay.var("i", relay.TensorType(ishape, "int64")) - u = 
relay.var("u", relay.TensorType(ishape, "float32")) - z = relay.op.scatter(d, i, u, axis) - func = relay.Function([d, i, u], z) - mod = tvm.IRModule() - mod["main"] = func - return mod, {} - - -target = "cuda" - -#### TUNING OPTION #### -network = "scatter" -log_file = "%s.log" % network - -tuning_option = { - "log_filename": log_file, - "tuner": "xgb", - "n_trial": 10, - "early_stopping": 10, - "measure_option": autotvm.measure_option( - builder=autotvm.LocalBuilder(timeout=10), - runner=autotvm.LocalRunner(number=20, repeat=3, timeout=4, min_repeat_ms=150), - ), -} - -def tune_tasks( - tasks, - measure_option, - tuner="xgb", - n_trial=1000, - early_stopping=None, - log_filename="tuning.log", - use_transfer_learning=True, -): - # create tmp log file - tmp_log_file = log_filename + ".tmp" - if os.path.exists(tmp_log_file): - os.remove(tmp_log_file) - - for i, tsk in enumerate(reversed(tasks)): - prefix = "[Task %2d/%2d] " % (i + 1, len(tasks)) - - # create tuner - if tuner == "xgb" or tuner == "xgb-rank": - tuner_obj = XGBTuner(tsk, loss_type="rank") - elif tuner == "ga": - tuner_obj = GATuner(tsk, pop_size=100) - elif tuner == "random": - tuner_obj = RandomTuner(tsk) - elif tuner == "gridsearch": - tuner_obj = GridSearchTuner(tsk) - else: - raise ValueError("Invalid tuner: " + tuner) - - if use_transfer_learning: - if os.path.isfile(tmp_log_file): - tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file)) - - # do tuning - tsk_trial = min(n_trial, len(tsk.config_space)) - tuner_obj.tune( - n_trial=tsk_trial, - early_stopping=early_stopping, - measure_option=measure_option, - callbacks=[ - autotvm.callback.progress_bar(tsk_trial, prefix=prefix), - autotvm.callback.log_to_file(tmp_log_file), - ], - ) - - # pick best records to a cache file - autotvm.record.pick_best(tmp_log_file, log_filename) - os.remove(tmp_log_file) - -def tune_and_evaluate(tuning_opt): - # extract workloads from relay program - print("Extract tasks...") - size = (10000, ) - dshape = ishape = size - axis = 0 - mod, params = simple_mod(size, size, axis) - tasks = autotvm.task.extract_from_program( - mod["main"], target=target, params=params, ops=(relay.op.get("scatter"),) - ) - - # run tuning tasks - print("Tuning...") - tune_tasks(tasks, **tuning_opt) - - # compile kernels with history best records - with autotvm.apply_history_best(log_file): - print("Compile...") - with tvm.transform.PassContext(opt_level=3): - lib = relay.build_module.build(mod, target=target, params=params) - - # load parameters - ctx = tvm.context(str(target), 0) - module = runtime.GraphModule(lib["default"](ctx)) - - data_np = np.random.uniform(size=dshape).astype("float32") - updates_np = np.random.uniform(size=ishape).astype("float32") - indices_np = np.random.randint(-dshape[axis], dshape[axis] - 1, ishape).astype("int64") - - module.set_input("d", data_np) - module.set_input("i", indices_np) - module.set_input("u", updates_np) - - # evaluate - print("Evaluate inference time cost...") - ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=600) - prof_res = np.array(ftimer().results) * 1000 # convert to millisecond - print( - "Mean inference time (std dev): %.2f ms (%.2f ms)" - % (np.mean(prof_res), np.std(prof_res)) - ) - - -# We do not run the tuning in our webpage server since it takes too long. -# Uncomment the following line to run it by yourself. 
-logging.basicConfig(level=logging.DEBUG) -tune_and_evaluate(tuning_option) From 4f844f50369609129d0f8196ac23b743d8e142b1 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Tue, 19 Jan 2021 07:59:28 +0900 Subject: [PATCH 14/16] make sure zero init arguments --- python/tvm/autotvm/measure/measure_methods.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 8e7fb3c6bed1..38c241a969d4 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -30,6 +30,7 @@ from random import getrandbits from collections import namedtuple import tempfile +import numpy as np import tvm._ffi import tvm.ir.transform @@ -560,7 +561,7 @@ def run_through_rpc( raise AttributeError( "Please make sure USE_RANDOM is ON in the config.cmake " "on the remote devices" ) - args = [nd.empty(x[0], dtype=x[1], ctx=ctx) for x in build_result.arg_info] + args = [nd.array(np.zeros(x[0], dtype=x[1]), ctx=ctx) for x in build_result.arg_info] if "scatter" not in measure_input.task.name: for arg in args: random_fill(arg) From 6e3acff9f3f97daa5d204423eaf1373c34b0ab45 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Tue, 19 Jan 2021 08:01:43 +0900 Subject: [PATCH 15/16] add comment on why skip random init for scatter --- python/tvm/autotvm/measure/measure_methods.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 38c241a969d4..76436153df09 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -563,6 +563,7 @@ def run_through_rpc( ) args = [nd.array(np.zeros(x[0], dtype=x[1]), ctx=ctx) for x in build_result.arg_info] if "scatter" not in measure_input.task.name: + # the index tensor of scatter op cannot be randomly initialized for arg in args: random_fill(arg) From 7b1a6ea704c4aa153b8234a92720edd637500def Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Tue, 19 Jan 2021 11:07:19 +0900 Subject: [PATCH 16/16] restore ctx sync --- python/tvm/autotvm/measure/measure_methods.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 76436153df09..ffe4b97e33db 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -566,6 +566,7 @@ def run_through_rpc( # the index tensor of scatter op cannot be randomly initialized for arg in args: random_fill(arg) + ctx.sync() costs = time_f(*args).results
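Note on patches 14-16: taken together, they change the argument setup in run_through_rpc (python/tvm/autotvm/measure/measure_methods.py). The sketch below is a reconstruction from the hunks above with the surrounding function elided, not a verbatim excerpt; names such as nd, random_fill, build_result, measure_input, ctx, and time_f all come from the existing function, and numpy is imported as np by patch 14. Arguments are zero-initialized so that scatter workloads, whose index tensors must stay within bounds, can still be measured; random_fill is applied only to non-scatter tasks, and the context sync restored by patch 16 runs unconditionally before timing.

    # Zero-initialize all measurement arguments (patch 14).
    args = [nd.array(np.zeros(x[0], dtype=x[1]), ctx=ctx) for x in build_result.arg_info]
    if "scatter" not in measure_input.task.name:
        # the index tensor of scatter op cannot be randomly initialized (patches 11, 15)
        for arg in args:
            random_fill(arg)
    ctx.sync()  # restored in patch 16

    costs = time_f(*args).results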