From 40207302610ab858dd5eaac37fd83a8f032c1f2e Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Fri, 1 Jan 2021 22:20:39 +0900 Subject: [PATCH 01/16] improve scatter 4d init --- python/tvm/topi/cuda/scatter.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/python/tvm/topi/cuda/scatter.py b/python/tvm/topi/cuda/scatter.py index be602c8ab7a3..54c8220f5f91 100644 --- a/python/tvm/topi/cuda/scatter.py +++ b/python/tvm/topi/cuda/scatter.py @@ -312,19 +312,17 @@ def gen_ir_4d(data, indices, updates, axis, out, update_func): out_ptr = ib.buffer_ptr(out) data_ptr = ib.buffer_ptr(data) with ib.new_scope(): - i = te.thread_axis("blockIdx.x") - ib.scope_attr(i, "thread_extent", n) - j = te.thread_axis("blockIdx.y") - ib.scope_attr(j, "thread_extent", c) - k = te.thread_axis("blockIdx.z") - ib.scope_attr(k, "thread_extent", h) + fused = n * c * h * w + num_thread = 1024 + num_blocks = ceil_div(fused, num_thread) + bx = te.thread_axis("blockIdx.x") + ib.scope_attr(bx, "thread_extent", num_blocks) tx = te.thread_axis("threadIdx.x") - ib.scope_attr(tx, "thread_extent", warp_size) - with ib.for_range(0, ceil_div(w, warp_size), name="l") as l_: - l = l_ * warp_size + tx - with ib.if_scope(l < w): - idx = ((i * c + j) * h + k) * w + l - out_ptr[idx] = data_ptr[idx] + ib.scope_attr(tx, "thread_extent", num_thread) + tid = bx * num_thread + tx + + with ib.if_scope(tid < fused): + out_ptr[tid] = data_ptr[tid] indices_ptr = ib.buffer_ptr(indices) updates_ptr = ib.buffer_ptr(updates) From cb0e21e98c4af604402b47868dd8aabb4b2c1d56 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Fri, 1 Jan 2021 22:20:58 +0900 Subject: [PATCH 02/16] do not launch sorting based scatter for small input --- python/tvm/topi/cuda/scatter.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/tvm/topi/cuda/scatter.py b/python/tvm/topi/cuda/scatter.py index 54c8220f5f91..721314aa4fad 100644 --- a/python/tvm/topi/cuda/scatter.py +++ b/python/tvm/topi/cuda/scatter.py @@ -21,6 +21,7 @@ from ..scatter import _verify_scatter_nd_inputs from .nms import atomic_add from .sort import stable_sort_by_key_thrust, is_thrust_available +from ..utils import get_const_int def ceil_div(a, b): @@ -547,7 +548,12 @@ def update_func(dst_ptr, dst_index, update): in_bufs = [data] - if rank == 1 and is_thrust_available(): + def is_small_scatter(dim): + if isinstance(data.shape[0], tvm.tir.expr.Any): + return False + return get_const_int(dim) < 50 + + if rank == 1 and is_thrust_available() and not is_small_scatter(indices.shape[0]): ir_funcs[1] = gen_scatter_1d_thrust indices_sorted, updates_sorted = stable_sort_by_key_thrust( indices, updates, for_scatter=True From 1b7610932cc25d1aa00c7d40033911229221d4b4 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Sat, 9 Jan 2021 06:08:33 +0900 Subject: [PATCH 03/16] do not use hard coded num threads --- python/tvm/topi/cuda/scatter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/tvm/topi/cuda/scatter.py b/python/tvm/topi/cuda/scatter.py index 721314aa4fad..9cd0eeab7f92 100644 --- a/python/tvm/topi/cuda/scatter.py +++ b/python/tvm/topi/cuda/scatter.py @@ -314,8 +314,9 @@ def gen_ir_4d(data, indices, updates, axis, out, update_func): data_ptr = ib.buffer_ptr(data) with ib.new_scope(): fused = n * c * h * w - num_thread = 1024 + num_thread = int(tvm.target.Target.current(allow_none=False).max_num_threads) num_blocks = ceil_div(fused, num_thread) + bx = te.thread_axis("blockIdx.x") ib.scope_attr(bx, "thread_extent", 
num_blocks) tx = te.thread_axis("threadIdx.x") From d60acbea09c74609a974bcbe430c5cc0538155c7 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 11 Jan 2021 06:55:05 +0900 Subject: [PATCH 04/16] separate sort based implementation --- python/tvm/relay/op/strategy/cuda.py | 13 ++++ python/tvm/topi/cuda/scatter.py | 100 +++++++++++++++++---------- 2 files changed, 75 insertions(+), 38 deletions(-) diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 04c16ddd344c..781d6b5cb8e7 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -787,6 +787,19 @@ def scatter_cuda(attrs, inputs, out_type, target): name="scatter.cuda", plevel=10, ) + + rank = len(inputs[0].shape) + + with SpecializedCondition(rank == 1): + if target.kind.name == "cuda" and get_global_func( + "tvm.contrib.thrust.stable_sort_by_key", allow_missing=True + ): + strategy.add_implementation( + wrap_compute_sort(topi.cuda.scatter1d_via_sort), + wrap_topi_schedule(topi.cuda.schedule_extern), + name="scatter_thrust.cuda", + plevel=9, # use the sequential version by default + ) return strategy diff --git a/python/tvm/topi/cuda/scatter.py b/python/tvm/topi/cuda/scatter.py index 9cd0eeab7f92..4481ab51323b 100644 --- a/python/tvm/topi/cuda/scatter.py +++ b/python/tvm/topi/cuda/scatter.py @@ -417,7 +417,63 @@ def gen_ir_4d(data, indices, updates, axis, out, update_func): return ib.get() -def gen_scatter_1d_thrust(data, indices_sorted, updates_sorted, axis, out, _): +def scatter(data, indices, updates, axis=0): + """Update data at positions defined by indices with values in updates + + Parameters + ---------- + data : relay.Expr + The input data to the operator. + + indices : relay.Expr + The index locations to update. + + updates : relay.Expr + The values to update. + + axis : int + The axis to scatter on + + Returns + ------- + ret : relay.Expr + The computed result. + """ + if axis < 0: + axis += len(data.shape) + assert axis >= 0 + assert axis < len(data.shape) + + rank = len(data.shape) + assert 1 <= rank <= 4, "scatter only supports 1-4 dimensions" + + ir_funcs = { + 1: gen_ir_1d, + 2: gen_ir_2d, + 3: gen_ir_3d, + 4: gen_ir_4d, + } + + def update_func(dst_ptr, dst_index, update): + dst_ptr[dst_index] = update + + out_shape = data.shape + out_buf = tvm.tir.decl_buffer(out_shape, data.dtype, "out_buf") + + out = te.extern( + [out_shape], + [data, indices, updates], + lambda ins, outs: ir_funcs[rank](ins[0], ins[1], ins[2], axis, outs[0], update_func), + dtype=data.dtype, + out_buffers=[out_buf], + name="scatter_gpu", + tag="scatter_gpu", + ) + + return out + + +def gen_scatter_1d_thrust(data, indices_sorted, updates_sorted, out): """Generate scatter ir for 1d inputs, using a sorting based approach. By sorting indices and comparing neighboring two indices, we can tell which of elements in the indices tensor can scatter its update value into the output. @@ -438,9 +494,6 @@ def gen_scatter_1d_thrust(data, indices_sorted, updates_sorted, axis, out, _): updates : tir.Tensor The values to update, sorted by indices. - axis : int - The axis to scatter on. It must be 0 for this function. - out : tir.Tensor The output tensor. @@ -449,7 +502,6 @@ def gen_scatter_1d_thrust(data, indices_sorted, updates_sorted, axis, out, _): ret : tir The computational ir. 
""" - assert axis == 0 n = data.shape[0] ib = tvm.tir.ir_builder.create() @@ -504,7 +556,7 @@ def gen_scatter_1d_thrust(data, indices_sorted, updates_sorted, axis, out, _): return ib.get() -def scatter(data, indices, updates, axis=0): +def scatter1d_via_sort(data, indices, updates, axis=0): """Update data at positions defined by indices with values in updates Parameters @@ -528,45 +580,17 @@ def scatter(data, indices, updates, axis=0): """ if axis < 0: axis += len(data.shape) - assert axis >= 0 - assert axis < len(data.shape) - - rank = len(data.shape) - assert 1 <= rank <= 4, "scatter only supports 1-4 dimensions" - - ir_funcs = { - 1: gen_ir_1d, - 2: gen_ir_2d, - 3: gen_ir_3d, - 4: gen_ir_4d, - } - - def update_func(dst_ptr, dst_index, update): - dst_ptr[dst_index] = update + assert axis == 0 and len(data.shape) == 1, "sorting based scatter only supported for 1d input" out_shape = data.shape out_buf = tvm.tir.decl_buffer(out_shape, data.dtype, "out_buf") - in_bufs = [data] - - def is_small_scatter(dim): - if isinstance(data.shape[0], tvm.tir.expr.Any): - return False - return get_const_int(dim) < 50 - - if rank == 1 and is_thrust_available() and not is_small_scatter(indices.shape[0]): - ir_funcs[1] = gen_scatter_1d_thrust - indices_sorted, updates_sorted = stable_sort_by_key_thrust( - indices, updates, for_scatter=True - ) - in_bufs += [indices_sorted, updates_sorted] - else: - in_bufs += [indices, updates] + indices_sorted, updates_sorted = stable_sort_by_key_thrust(indices, updates, for_scatter=True) out = te.extern( [out_shape], - in_bufs, - lambda ins, outs: ir_funcs[rank](ins[0], ins[1], ins[2], axis, outs[0], update_func), + [data, indices_sorted, updates_sorted], + lambda ins, outs: gen_scatter_1d_thrust(ins[0], ins[1], ins[2], outs[0]), dtype=data.dtype, out_buffers=[out_buf], name="scatter_gpu", From b6d65410e9d2852cbe9f3cfe1091c4742664716d Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 11 Jan 2021 10:23:37 +0900 Subject: [PATCH 05/16] register scatter as autotvm task --- python/tvm/relay/op/strategy/cuda.py | 2 +- python/tvm/topi/cuda/scatter.py | 12 +++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 781d6b5cb8e7..3e1f43c1f0b0 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -797,7 +797,7 @@ def scatter_cuda(attrs, inputs, out_type, target): strategy.add_implementation( wrap_compute_sort(topi.cuda.scatter1d_via_sort), wrap_topi_schedule(topi.cuda.schedule_extern), - name="scatter_thrust.cuda", + name="scatter_via_sort.cuda", plevel=9, # use the sequential version by default ) return strategy diff --git a/python/tvm/topi/cuda/scatter.py b/python/tvm/topi/cuda/scatter.py index 4481ab51323b..50a089255bb6 100644 --- a/python/tvm/topi/cuda/scatter.py +++ b/python/tvm/topi/cuda/scatter.py @@ -21,7 +21,6 @@ from ..scatter import _verify_scatter_nd_inputs from .nms import atomic_add from .sort import stable_sort_by_key_thrust, is_thrust_available -from ..utils import get_const_int def ceil_div(a, b): @@ -417,7 +416,8 @@ def gen_ir_4d(data, indices, updates, axis, out, update_func): return ib.get() -def scatter(data, indices, updates, axis=0): +@autotvm.register_topi_compute("scatter.cuda") +def scatter(_, data, indices, updates, axis=0): """Update data at positions defined by indices with values in updates Parameters @@ -556,7 +556,8 @@ def gen_scatter_1d_thrust(data, indices_sorted, updates_sorted, out): return ib.get() 
-def scatter1d_via_sort(data, indices, updates, axis=0): +@autotvm.register_topi_compute("scatter_via_sort.cuda") +def scatter1d_via_sort(_, data, indices, updates, axis=0): """Update data at positions defined by indices with values in updates Parameters @@ -581,6 +582,7 @@ def scatter1d_via_sort(data, indices, updates, axis=0): if axis < 0: axis += len(data.shape) assert axis == 0 and len(data.shape) == 1, "sorting based scatter only supported for 1d input" + assert is_thrust_available(), "Thrust is required for this op" out_shape = data.shape out_buf = tvm.tir.decl_buffer(out_shape, data.dtype, "out_buf") @@ -593,8 +595,8 @@ def scatter1d_via_sort(data, indices, updates, axis=0): lambda ins, outs: gen_scatter_1d_thrust(ins[0], ins[1], ins[2], outs[0]), dtype=data.dtype, out_buffers=[out_buf], - name="scatter_gpu", - tag="scatter_gpu", + name="scatter_via_sort_gpu", + tag="scatter_via_sort_gpu", ) return out From 9c87d8d20cd23284fd8a4464cd2c85233ad1e23f Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 11 Jan 2021 10:55:05 +0900 Subject: [PATCH 06/16] add missing import --- python/tvm/topi/cuda/scatter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/topi/cuda/scatter.py b/python/tvm/topi/cuda/scatter.py index 50a089255bb6..205ddf5f0dce 100644 --- a/python/tvm/topi/cuda/scatter.py +++ b/python/tvm/topi/cuda/scatter.py @@ -17,7 +17,7 @@ # pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, too-many-statements, singleton-comparison, unused-argument """Scatter operator """ import tvm -from tvm import te +from tvm import te, autotvm from ..scatter import _verify_scatter_nd_inputs from .nms import atomic_add from .sort import stable_sort_by_key_thrust, is_thrust_available From 9b077db64b6af263e51221c95cf69de988c7badd Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 11 Jan 2021 11:41:13 +0900 Subject: [PATCH 07/16] fix strategy --- python/tvm/relay/op/strategy/cuda.py | 4 ++-- python/tvm/relay/op/strategy/generic.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 3e1f43c1f0b0..7e2b29dc2286 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -795,8 +795,8 @@ def scatter_cuda(attrs, inputs, out_type, target): "tvm.contrib.thrust.stable_sort_by_key", allow_missing=True ): strategy.add_implementation( - wrap_compute_sort(topi.cuda.scatter1d_via_sort), - wrap_topi_schedule(topi.cuda.schedule_extern), + wrap_compute_scatter(topi.cuda.scatter1d_via_sort), + wrap_topi_schedule(topi.generic.schedule_extern), name="scatter_via_sort.cuda", plevel=9, # use the sequential version by default ) diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index 363832ef8b2f..8dd9dc5844dd 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -1123,7 +1123,7 @@ def wrap_compute_scatter(topi_compute): """Wrap scatter topi compute""" def _compute_scatter(attrs, inputs, _): - return [topi_compute(inputs[0], inputs[1], inputs[2], axis=attrs.axis)] + return [topi_compute(inputs[0], inputs[1], inputs[2], attrs.axis)] return _compute_scatter From 3090af8e87d73442e93b15c21c3e4937996d7b4b Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 11 Jan 2021 12:06:53 +0900 Subject: [PATCH 08/16] add dedicated schedule and dummy flop --- python/tvm/relay/op/strategy/cuda.py | 6 +++--- python/tvm/topi/cuda/scatter.py | 19 
+++++++++++++++++-- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 7e2b29dc2286..3863df0fd831 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -783,7 +783,7 @@ def scatter_cuda(attrs, inputs, out_type, target): strategy = _op.OpStrategy() strategy.add_implementation( wrap_compute_scatter(topi.cuda.scatter), - wrap_topi_schedule(topi.generic.schedule_extern), + wrap_topi_schedule(topi.cuda.schedule_scatter), name="scatter.cuda", plevel=10, ) @@ -795,8 +795,8 @@ def scatter_cuda(attrs, inputs, out_type, target): "tvm.contrib.thrust.stable_sort_by_key", allow_missing=True ): strategy.add_implementation( - wrap_compute_scatter(topi.cuda.scatter1d_via_sort), - wrap_topi_schedule(topi.generic.schedule_extern), + wrap_compute_scatter(topi.cuda.scatter_via_sort), + wrap_topi_schedule(topi.cuda.schedule_scatter_via_sort), name="scatter_via_sort.cuda", plevel=9, # use the sequential version by default ) diff --git a/python/tvm/topi/cuda/scatter.py b/python/tvm/topi/cuda/scatter.py index 205ddf5f0dce..b9bf1442533d 100644 --- a/python/tvm/topi/cuda/scatter.py +++ b/python/tvm/topi/cuda/scatter.py @@ -19,6 +19,7 @@ import tvm from tvm import te, autotvm from ..scatter import _verify_scatter_nd_inputs +from ..generic import schedule_extern from .nms import atomic_add from .sort import stable_sort_by_key_thrust, is_thrust_available @@ -417,7 +418,7 @@ def gen_ir_4d(data, indices, updates, axis, out, update_func): @autotvm.register_topi_compute("scatter.cuda") -def scatter(_, data, indices, updates, axis=0): +def scatter(cfg, data, indices, updates, axis=0): """Update data at positions defined by indices with values in updates Parameters @@ -460,6 +461,8 @@ def update_func(dst_ptr, dst_index, update): out_shape = data.shape out_buf = tvm.tir.decl_buffer(out_shape, data.dtype, "out_buf") + cfg.add_flop(1) + out = te.extern( [out_shape], [data, indices, updates], @@ -473,6 +476,11 @@ def update_func(dst_ptr, dst_index, update): return out +@autotvm.register_topi_schedule("scatter.cuda") +def schedule_scatter(_, outs): + return schedule_extern(outs) + + def gen_scatter_1d_thrust(data, indices_sorted, updates_sorted, out): """Generate scatter ir for 1d inputs, using a sorting based approach. 
By sorting indices and comparing neighboring two indices, we can tell which @@ -557,7 +565,7 @@ def gen_scatter_1d_thrust(data, indices_sorted, updates_sorted, out): @autotvm.register_topi_compute("scatter_via_sort.cuda") -def scatter1d_via_sort(_, data, indices, updates, axis=0): +def scatter_via_sort(cfg, data, indices, updates, axis=0): """Update data at positions defined by indices with values in updates Parameters @@ -584,6 +592,8 @@ def scatter1d_via_sort(_, data, indices, updates, axis=0): assert axis == 0 and len(data.shape) == 1, "sorting based scatter only supported for 1d input" assert is_thrust_available(), "Thrust is required for this op" + cfg.add_flop(1) + out_shape = data.shape out_buf = tvm.tir.decl_buffer(out_shape, data.dtype, "out_buf") @@ -602,6 +612,11 @@ def scatter1d_via_sort(_, data, indices, updates, axis=0): return out +@autotvm.register_topi_schedule("scatter_via_sort.cuda") +def schedule_scatter_via_sort(_, outs): + return schedule_extern(outs) + + def gen_scatter_add_1d_atomic(data, indices, updates, axis, out, _): """Generate scatter add ir for 1d inputs, using atomic_add instruction From 28c08dcdd04220546c0ed4332574821e33bcf842 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 11 Jan 2021 12:13:55 +0900 Subject: [PATCH 09/16] add test tuning script --- tutorials/autotvm/test_scatter.py | 135 ++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 tutorials/autotvm/test_scatter.py diff --git a/tutorials/autotvm/test_scatter.py b/tutorials/autotvm/test_scatter.py new file mode 100644 index 000000000000..c8899a517692 --- /dev/null +++ b/tutorials/autotvm/test_scatter.py @@ -0,0 +1,135 @@ +import os + +import numpy as np +import logging +import tvm +from tvm import relay, autotvm +import tvm.relay.testing +from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner +import tvm.contrib.graph_runtime as runtime + + +def simple_mod(dshape, ishape, axis=0): + d = relay.var("d", relay.TensorType(dshape, "float32")) + i = relay.var("i", relay.TensorType(ishape, "int64")) + u = relay.var("u", relay.TensorType(ishape, "float32")) + z = relay.op.scatter(d, i, u, axis) + func = relay.Function([d, i, u], z) + mod = tvm.IRModule() + mod["main"] = func + return mod, {} + + +target = "cuda" + +#### TUNING OPTION #### +network = "scatter" +log_file = "%s.log" % network + +tuning_option = { + "log_filename": log_file, + "tuner": "xgb", + "n_trial": 10, + "early_stopping": 10, + "measure_option": autotvm.measure_option( + builder=autotvm.LocalBuilder(timeout=10), + runner=autotvm.LocalRunner(number=20, repeat=3, timeout=4, min_repeat_ms=150), + ), +} + +def tune_tasks( + tasks, + measure_option, + tuner="xgb", + n_trial=1000, + early_stopping=None, + log_filename="tuning.log", + use_transfer_learning=True, +): + # create tmp log file + tmp_log_file = log_filename + ".tmp" + if os.path.exists(tmp_log_file): + os.remove(tmp_log_file) + + for i, tsk in enumerate(reversed(tasks)): + prefix = "[Task %2d/%2d] " % (i + 1, len(tasks)) + + # create tuner + if tuner == "xgb" or tuner == "xgb-rank": + tuner_obj = XGBTuner(tsk, loss_type="rank") + elif tuner == "ga": + tuner_obj = GATuner(tsk, pop_size=100) + elif tuner == "random": + tuner_obj = RandomTuner(tsk) + elif tuner == "gridsearch": + tuner_obj = GridSearchTuner(tsk) + else: + raise ValueError("Invalid tuner: " + tuner) + + if use_transfer_learning: + if os.path.isfile(tmp_log_file): + tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file)) + + # do tuning + 
tsk_trial = min(n_trial, len(tsk.config_space)) + tuner_obj.tune( + n_trial=tsk_trial, + early_stopping=early_stopping, + measure_option=measure_option, + callbacks=[ + autotvm.callback.progress_bar(tsk_trial, prefix=prefix), + autotvm.callback.log_to_file(tmp_log_file), + ], + ) + + # pick best records to a cache file + autotvm.record.pick_best(tmp_log_file, log_filename) + os.remove(tmp_log_file) + +def tune_and_evaluate(tuning_opt): + # extract workloads from relay program + print("Extract tasks...") + size = (5000, ) + dshape = ishape = size + axis = 0 + mod, params = simple_mod(size, size, axis) + tasks = autotvm.task.extract_from_program( + mod["main"], target=target, params=params, ops=(relay.op.get("scatter"),) + ) + + # run tuning tasks + print("Tuning...") + tune_tasks(tasks, **tuning_opt) + + # compile kernels with history best records + with autotvm.apply_history_best(log_file): + print("Compile...") + with tvm.transform.PassContext(opt_level=3): + lib = relay.build_module.build(mod, target=target, params=params) + + # load parameters + ctx = tvm.context(str(target), 0) + module = runtime.GraphModule(lib["default"](ctx)) + + data_np = np.random.uniform(size=dshape).astype("float32") + updates_np = np.random.uniform(size=ishape).astype("float32") + indices_np = np.random.randint(-dshape[axis], dshape[axis] - 1, ishape).astype("int64") + + module.set_input("d", data_np) + module.set_input("i", indices_np) + module.set_input("u", updates_np) + + # evaluate + print("Evaluate inference time cost...") + ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=600) + prof_res = np.array(ftimer().results) * 1000 # convert to millisecond + print( + "Mean inference time (std dev): %.2f ms (%.2f ms)" + % (np.mean(prof_res), np.std(prof_res)) + ) + + +# We do not run the tuning in our webpage server since it takes too long. +# Uncomment the following line to run it by yourself. +logging.basicConfig(level=logging.DEBUG) +tune_and_evaluate(tuning_option) From bf27ba8cacd8ffef05a339e0ece2e1f7e391f6f6 Mon Sep 17 00:00:00 2001 From: masa Date: Tue, 12 Jan 2021 06:55:20 +0900 Subject: [PATCH 10/16] try adding dummy knob --- python/tvm/topi/cuda/scatter.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/tvm/topi/cuda/scatter.py b/python/tvm/topi/cuda/scatter.py index b9bf1442533d..aaafb5cb0db3 100644 --- a/python/tvm/topi/cuda/scatter.py +++ b/python/tvm/topi/cuda/scatter.py @@ -461,7 +461,8 @@ def update_func(dst_ptr, dst_index, update): out_shape = data.shape out_buf = tvm.tir.decl_buffer(out_shape, data.dtype, "out_buf") - cfg.add_flop(1) + cfg.define_knob("dummy", [1]) + cfg.add_flop(1) # dummy value to satisfy AutoTVM out = te.extern( [out_shape], @@ -592,6 +593,7 @@ def scatter_via_sort(cfg, data, indices, updates, axis=0): assert axis == 0 and len(data.shape) == 1, "sorting based scatter only supported for 1d input" assert is_thrust_available(), "Thrust is required for this op" + cfg.define_knob("dummy", [1]) cfg.add_flop(1) out_shape = data.shape From a876443e47387e7e3bf4b3168b6d26c419a398f7 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 18 Jan 2021 21:52:28 +0900 Subject: [PATCH 11/16] skip random_fill when a tuning workload is from scatter This reverts commit 1fed88321e640b509fc46fac7da3b3cb79719552. 
--- python/tvm/autotvm/measure/measure_methods.py | 6 +++--- python/tvm/topi/cuda/scatter.py | 6 ++---- tutorials/autotvm/test_scatter.py | 2 +- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index cb801ba72872..8e7fb3c6bed1 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -561,9 +561,9 @@ def run_through_rpc( "Please make sure USE_RANDOM is ON in the config.cmake " "on the remote devices" ) args = [nd.empty(x[0], dtype=x[1], ctx=ctx) for x in build_result.arg_info] - for arg in args: - random_fill(arg) - ctx.sync() + if "scatter" not in measure_input.task.name: + for arg in args: + random_fill(arg) costs = time_f(*args).results diff --git a/python/tvm/topi/cuda/scatter.py b/python/tvm/topi/cuda/scatter.py index aaafb5cb0db3..b111ddb6b2ed 100644 --- a/python/tvm/topi/cuda/scatter.py +++ b/python/tvm/topi/cuda/scatter.py @@ -461,8 +461,7 @@ def update_func(dst_ptr, dst_index, update): out_shape = data.shape out_buf = tvm.tir.decl_buffer(out_shape, data.dtype, "out_buf") - cfg.define_knob("dummy", [1]) - cfg.add_flop(1) # dummy value to satisfy AutoTVM + cfg.add_flop(1) # A dummy value to satisfy AutoTVM out = te.extern( [out_shape], @@ -593,8 +592,7 @@ def scatter_via_sort(cfg, data, indices, updates, axis=0): assert axis == 0 and len(data.shape) == 1, "sorting based scatter only supported for 1d input" assert is_thrust_available(), "Thrust is required for this op" - cfg.define_knob("dummy", [1]) - cfg.add_flop(1) + cfg.add_flop(1) # A dummy value to satisfy AutoTVM out_shape = data.shape out_buf = tvm.tir.decl_buffer(out_shape, data.dtype, "out_buf") diff --git a/tutorials/autotvm/test_scatter.py b/tutorials/autotvm/test_scatter.py index c8899a517692..108921dfed86 100644 --- a/tutorials/autotvm/test_scatter.py +++ b/tutorials/autotvm/test_scatter.py @@ -89,7 +89,7 @@ def tune_tasks( def tune_and_evaluate(tuning_opt): # extract workloads from relay program print("Extract tasks...") - size = (5000, ) + size = (10000, ) dshape = ishape = size axis = 0 mod, params = simple_mod(size, size, axis) From 5cfd32e5c21f9fd6f513675bc8144bf87539eea5 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 18 Jan 2021 22:26:27 +0900 Subject: [PATCH 12/16] cleanup memcpy ir --- python/tvm/topi/cuda/scatter.py | 61 +++++++++++---------------------- 1 file changed, 20 insertions(+), 41 deletions(-) diff --git a/python/tvm/topi/cuda/scatter.py b/python/tvm/topi/cuda/scatter.py index b111ddb6b2ed..b34bd1df14e4 100644 --- a/python/tvm/topi/cuda/scatter.py +++ b/python/tvm/topi/cuda/scatter.py @@ -22,12 +22,28 @@ from ..generic import schedule_extern from .nms import atomic_add from .sort import stable_sort_by_key_thrust, is_thrust_available +from ..utils import prod def ceil_div(a, b): return (a + b - 1) // b +def _memcpy_ir(ib, out_ptr, data_ptr, shape): + fused = prod(shape) + with ib.new_scope(): + num_thread = int(tvm.target.Target.current(allow_none=False).max_num_threads) + num_blocks = ceil_div(fused, num_thread) + bx = te.thread_axis("blockIdx.x") + ib.scope_attr(bx, "thread_extent", num_blocks) + tx = te.thread_axis("threadIdx.x") + ib.scope_attr(tx, "thread_extent", num_thread) + tid = bx * num_thread + tx + + with ib.if_scope(tid < fused): + out_ptr[tid] = data_ptr[tid] + + def gen_ir_1d(data, indices, updates, axis, out, update_func): """Generate scatter ir for 1d inputs @@ -64,10 +80,7 @@ def gen_ir_1d(data, indices, updates, 
axis, out, update_func): out_ptr = ib.buffer_ptr(out) data_ptr = ib.buffer_ptr(data) - with ib.new_scope(): - bx = te.thread_axis("blockIdx.x") - ib.scope_attr(bx, "thread_extent", n) - out_ptr[bx] = data_ptr[bx] + _memcpy_ir(ib, out_ptr, data_ptr, data.shape) indices_ptr = ib.buffer_ptr(indices) updates_ptr = ib.buffer_ptr(updates) @@ -115,8 +128,6 @@ def gen_ir_2d(data, indices, updates, axis, out, update_func): ret : tir The computational ir. """ - warp_size = tvm.target.Target.current(False).thread_warp_size - n = data.shape[0] c = data.shape[1] @@ -125,16 +136,7 @@ def gen_ir_2d(data, indices, updates, axis, out, update_func): out_ptr = ib.buffer_ptr(out) data_ptr = ib.buffer_ptr(data) - with ib.new_scope(): - bx = te.thread_axis("blockIdx.x") - ib.scope_attr(bx, "thread_extent", n) - tx = te.thread_axis("threadIdx.x") - ib.scope_attr(tx, "thread_extent", warp_size) - with ib.for_range(0, ceil_div(c, warp_size), name="j") as j_: - j = j_ * warp_size + tx - with ib.if_scope(j < c): - idx = bx * c + j - out_ptr[idx] = data_ptr[idx] + _memcpy_ir(ib, out_ptr, data_ptr, data.shape) indices_ptr = ib.buffer_ptr(indices) updates_ptr = ib.buffer_ptr(updates) @@ -206,18 +208,7 @@ def gen_ir_3d(data, indices, updates, axis, out, update_func): out_ptr = ib.buffer_ptr(out) data_ptr = ib.buffer_ptr(data) - with ib.new_scope(): - bx = te.thread_axis("blockIdx.x") - ib.scope_attr(bx, "thread_extent", n) - by = te.thread_axis("blockIdx.y") - ib.scope_attr(by, "thread_extent", c) - tx = te.thread_axis("threadIdx.x") - ib.scope_attr(tx, "thread_extent", warp_size) - with ib.for_range(0, ceil_div(h, warp_size), name="k") as k_: - k = k_ * warp_size + tx - with ib.if_scope(k < h): - idx = (bx * c + by) * h + k - out_ptr[idx] = data_ptr[idx] + _memcpy_ir(ib, out_ptr, data_ptr, data.shape) indices_ptr = ib.buffer_ptr(indices) updates_ptr = ib.buffer_ptr(updates) @@ -312,19 +303,7 @@ def gen_ir_4d(data, indices, updates, axis, out, update_func): out_ptr = ib.buffer_ptr(out) data_ptr = ib.buffer_ptr(data) - with ib.new_scope(): - fused = n * c * h * w - num_thread = int(tvm.target.Target.current(allow_none=False).max_num_threads) - num_blocks = ceil_div(fused, num_thread) - - bx = te.thread_axis("blockIdx.x") - ib.scope_attr(bx, "thread_extent", num_blocks) - tx = te.thread_axis("threadIdx.x") - ib.scope_attr(tx, "thread_extent", num_thread) - tid = bx * num_thread + tx - - with ib.if_scope(tid < fused): - out_ptr[tid] = data_ptr[tid] + _memcpy_ir(ib, out_ptr, data_ptr, data.shape) indices_ptr = ib.buffer_ptr(indices) updates_ptr = ib.buffer_ptr(updates) From 528f755ae4d1a35a1c4b913ab90ea3eddd60b1db Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 18 Jan 2021 22:29:16 +0900 Subject: [PATCH 13/16] remove scatter tuning script --- tutorials/autotvm/test_scatter.py | 135 ------------------------------ 1 file changed, 135 deletions(-) delete mode 100644 tutorials/autotvm/test_scatter.py diff --git a/tutorials/autotvm/test_scatter.py b/tutorials/autotvm/test_scatter.py deleted file mode 100644 index 108921dfed86..000000000000 --- a/tutorials/autotvm/test_scatter.py +++ /dev/null @@ -1,135 +0,0 @@ -import os - -import numpy as np -import logging -import tvm -from tvm import relay, autotvm -import tvm.relay.testing -from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner -import tvm.contrib.graph_runtime as runtime - - -def simple_mod(dshape, ishape, axis=0): - d = relay.var("d", relay.TensorType(dshape, "float32")) - i = relay.var("i", relay.TensorType(ishape, "int64")) - u = 
relay.var("u", relay.TensorType(ishape, "float32")) - z = relay.op.scatter(d, i, u, axis) - func = relay.Function([d, i, u], z) - mod = tvm.IRModule() - mod["main"] = func - return mod, {} - - -target = "cuda" - -#### TUNING OPTION #### -network = "scatter" -log_file = "%s.log" % network - -tuning_option = { - "log_filename": log_file, - "tuner": "xgb", - "n_trial": 10, - "early_stopping": 10, - "measure_option": autotvm.measure_option( - builder=autotvm.LocalBuilder(timeout=10), - runner=autotvm.LocalRunner(number=20, repeat=3, timeout=4, min_repeat_ms=150), - ), -} - -def tune_tasks( - tasks, - measure_option, - tuner="xgb", - n_trial=1000, - early_stopping=None, - log_filename="tuning.log", - use_transfer_learning=True, -): - # create tmp log file - tmp_log_file = log_filename + ".tmp" - if os.path.exists(tmp_log_file): - os.remove(tmp_log_file) - - for i, tsk in enumerate(reversed(tasks)): - prefix = "[Task %2d/%2d] " % (i + 1, len(tasks)) - - # create tuner - if tuner == "xgb" or tuner == "xgb-rank": - tuner_obj = XGBTuner(tsk, loss_type="rank") - elif tuner == "ga": - tuner_obj = GATuner(tsk, pop_size=100) - elif tuner == "random": - tuner_obj = RandomTuner(tsk) - elif tuner == "gridsearch": - tuner_obj = GridSearchTuner(tsk) - else: - raise ValueError("Invalid tuner: " + tuner) - - if use_transfer_learning: - if os.path.isfile(tmp_log_file): - tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file)) - - # do tuning - tsk_trial = min(n_trial, len(tsk.config_space)) - tuner_obj.tune( - n_trial=tsk_trial, - early_stopping=early_stopping, - measure_option=measure_option, - callbacks=[ - autotvm.callback.progress_bar(tsk_trial, prefix=prefix), - autotvm.callback.log_to_file(tmp_log_file), - ], - ) - - # pick best records to a cache file - autotvm.record.pick_best(tmp_log_file, log_filename) - os.remove(tmp_log_file) - -def tune_and_evaluate(tuning_opt): - # extract workloads from relay program - print("Extract tasks...") - size = (10000, ) - dshape = ishape = size - axis = 0 - mod, params = simple_mod(size, size, axis) - tasks = autotvm.task.extract_from_program( - mod["main"], target=target, params=params, ops=(relay.op.get("scatter"),) - ) - - # run tuning tasks - print("Tuning...") - tune_tasks(tasks, **tuning_opt) - - # compile kernels with history best records - with autotvm.apply_history_best(log_file): - print("Compile...") - with tvm.transform.PassContext(opt_level=3): - lib = relay.build_module.build(mod, target=target, params=params) - - # load parameters - ctx = tvm.context(str(target), 0) - module = runtime.GraphModule(lib["default"](ctx)) - - data_np = np.random.uniform(size=dshape).astype("float32") - updates_np = np.random.uniform(size=ishape).astype("float32") - indices_np = np.random.randint(-dshape[axis], dshape[axis] - 1, ishape).astype("int64") - - module.set_input("d", data_np) - module.set_input("i", indices_np) - module.set_input("u", updates_np) - - # evaluate - print("Evaluate inference time cost...") - ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=600) - prof_res = np.array(ftimer().results) * 1000 # convert to millisecond - print( - "Mean inference time (std dev): %.2f ms (%.2f ms)" - % (np.mean(prof_res), np.std(prof_res)) - ) - - -# We do not run the tuning in our webpage server since it takes too long. -# Uncomment the following line to run it by yourself. 
-logging.basicConfig(level=logging.DEBUG) -tune_and_evaluate(tuning_option) From 4f844f50369609129d0f8196ac23b743d8e142b1 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Tue, 19 Jan 2021 07:59:28 +0900 Subject: [PATCH 14/16] make sure zero init arguments --- python/tvm/autotvm/measure/measure_methods.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 8e7fb3c6bed1..38c241a969d4 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -30,6 +30,7 @@ from random import getrandbits from collections import namedtuple import tempfile +import numpy as np import tvm._ffi import tvm.ir.transform @@ -560,7 +561,7 @@ def run_through_rpc( raise AttributeError( "Please make sure USE_RANDOM is ON in the config.cmake " "on the remote devices" ) - args = [nd.empty(x[0], dtype=x[1], ctx=ctx) for x in build_result.arg_info] + args = [nd.array(np.zeros(x[0], dtype=x[1]), ctx=ctx) for x in build_result.arg_info] if "scatter" not in measure_input.task.name: for arg in args: random_fill(arg) From 6e3acff9f3f97daa5d204423eaf1373c34b0ab45 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Tue, 19 Jan 2021 08:01:43 +0900 Subject: [PATCH 15/16] add comment on why skip random init for scatter --- python/tvm/autotvm/measure/measure_methods.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 38c241a969d4..76436153df09 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -563,6 +563,7 @@ def run_through_rpc( ) args = [nd.array(np.zeros(x[0], dtype=x[1]), ctx=ctx) for x in build_result.arg_info] if "scatter" not in measure_input.task.name: + # the index tensor of scatter op cannot be randomly initialized for arg in args: random_fill(arg) From 7b1a6ea704c4aa153b8234a92720edd637500def Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Tue, 19 Jan 2021 11:07:19 +0900 Subject: [PATCH 16/16] restore ctx sync --- python/tvm/autotvm/measure/measure_methods.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 76436153df09..ffe4b97e33db 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -566,6 +566,7 @@ def run_through_rpc( # the index tensor of scatter op cannot be randomly initialized for arg in args: random_fill(arg) + ctx.sync() costs = time_f(*args).results
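Note on patches 14-16: taken together, they change the argument setup in run_through_rpc (python/tvm/autotvm/measure/measure_methods.py). The sketch below is a reconstruction from the hunks above with the surrounding function elided, not a verbatim excerpt; names such as nd, random_fill, build_result, measure_input, ctx, and time_f all come from the existing function, and numpy is imported as np by patch 14. Arguments are zero-initialized so that scatter workloads, whose index tensors must stay within bounds, can still be measured; random_fill is applied only to non-scatter tasks, and the context sync restored by patch 16 runs unconditionally before timing.

    # Zero-initialize all measurement arguments (patch 14).
    args = [nd.array(np.zeros(x[0], dtype=x[1]), ctx=ctx) for x in build_result.arg_info]
    if "scatter" not in measure_input.task.name:
        # the index tensor of scatter op cannot be randomly initialized (patches 11, 15)
        for arg in args:
            random_fill(arg)
    ctx.sync()  # restored in patch 16

    costs = time_f(*args).results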