From 73e49c11bcfd3b647eec0988d34d6ce0fec2e787 Mon Sep 17 00:00:00 2001
From: Matthew Brookhart
Date: Wed, 25 Nov 2020 11:21:30 -0700
Subject: [PATCH 1/6] Add sort op to relay
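
Adds a relay "sort" operator that returns the input data sorted along a
given axis, complementing "argsort", which returns the indices that would
sort the data. On CPU it lowers to a new "tvm.contrib.sort.sort" packed
function; on CUDA it reuses the sort IR and, when available, the thrust
backend. A minimal usage sketch, mirroring the unit tests added below (the
executor kind, target, and tolerance are illustrative, not part of the op
contract):

    import numpy as np
    import tvm
    from tvm import relay

    # Build a small Relay function that sorts its input along axis 1.
    x = relay.var("x", relay.TensorType((3, 4), "float32"))
    func = relay.Function([x], relay.sort(x, axis=1, is_ascend=True))
    mod = tvm.ir.IRModule.from_expr(func)

    # Run on CPU and compare against numpy's reference sort.
    x_data = np.random.uniform(size=(3, 4)).astype("float32")
    intrp = relay.create_executor("graph", mod=mod, target="llvm")
    op_res = intrp.evaluate()(x_data)
    np.testing.assert_allclose(op_res.asnumpy(), np.sort(x_data, axis=1), rtol=1e-5)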
---
 python/tvm/relay/op/_algorithm.py          |   4 +
 python/tvm/relay/op/algorithm.py           |  22 ++++
 python/tvm/relay/op/dyn/_algorithm.py      |   1 +
 python/tvm/relay/op/strategy/cuda.py       |  20 ++++
 python/tvm/relay/op/strategy/generic.py    |  24 +++++
 python/tvm/topi/cuda/sort.py               | 100 ++++++++++++++++++
 python/tvm/topi/generic/sort.py            |  17 +++
 python/tvm/topi/sort.py                    |  44 +++++++-
 src/relay/op/algorithm/sort.cc             |  65 ++++++++++++
 src/runtime/contrib/sort/sort.cc           |  63 ++++++++++-
 .../relay/dyn/test_dynamic_op_level6.py    |   4 +-
 tests/python/relay/test_op_level6.py       |  24 +++++
 tests/python/topi/python/test_topi_sort.py |  53 ++++++++++
 13 files changed, 436 insertions(+), 5 deletions(-)
 create mode 100644 src/relay/op/algorithm/sort.cc

diff --git a/python/tvm/relay/op/_algorithm.py b/python/tvm/relay/op/_algorithm.py
index 732d5016755a..3b15f0112e4e 100644
--- a/python/tvm/relay/op/_algorithm.py
+++ b/python/tvm/relay/op/_algorithm.py
@@ -26,6 +26,10 @@
 from .op import OpPattern, register_pattern
 from .op import register_strategy
 
+# sort
+register_strategy("sort", strategy.sort_strategy)
+register_pattern("sort", OpPattern.OPAQUE)
+
 # argsort
 register_strategy("argsort", strategy.argsort_strategy)
 register_pattern("argsort", OpPattern.OPAQUE)
diff --git a/python/tvm/relay/op/algorithm.py b/python/tvm/relay/op/algorithm.py
index e0550543f4b8..99140fcb3e11 100644
--- a/python/tvm/relay/op/algorithm.py
+++ b/python/tvm/relay/op/algorithm.py
@@ -22,6 +22,28 @@
 from ..expr import TupleWrapper, Expr, Constant
 
 
+def sort(data, axis=-1, is_ascend=1):
+    """Performs sorting along the given axis and returns data in sorted order.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data tensor.
+
+    axis : int, optional
+        Axis along which to sort the input tensor.
+
+    is_ascend : boolean, optional
+        Whether to sort in ascending or descending order.
+
+    Returns
+    -------
+    out : relay.Expr
+        Tensor of the same shape as data, sorted along the given axis.
+    """
+    return _make.sort(data, axis, is_ascend)
+
+
 def argsort(data, axis=-1, is_ascend=1, dtype="int32"):
     """Performs sorting along the given axis and returns an array of indicies
     having same shape as an input array that index data in sorted order.
diff --git a/python/tvm/relay/op/dyn/_algorithm.py b/python/tvm/relay/op/dyn/_algorithm.py
index ba903e680bbd..fa2678a459af 100644
--- a/python/tvm/relay/op/dyn/_algorithm.py
+++ b/python/tvm/relay/op/dyn/_algorithm.py
@@ -20,6 +20,7 @@
 from tvm.te.hybrid import script
 from tvm.runtime import convert
 
+from tvm import topi
 from .. import strategy
 from .. import op as _reg
diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py
index 326a184579e0..c0fc43d9f558 100644
--- a/python/tvm/relay/op/strategy/cuda.py
+++ b/python/tvm/relay/op/strategy/cuda.py
@@ -770,6 +770,26 @@ def scatter_nd_cuda(attrs, inputs, out_type, target):
         name="scatter_nd.cuda",
         plevel=10,
     )
+
+
+@sort_strategy.register(["cuda", "gpu"])
+def sort_strategy_cuda(attrs, inputs, out_type, target):
+    """sort cuda strategy"""
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_sort(topi.cuda.sort),
+        wrap_topi_schedule(topi.cuda.schedule_sort),
+        name="sort.cuda",
+    )
+    if target.kind.name == "cuda" and get_global_func(
+        "tvm.contrib.thrust.sort", allow_missing=True
+    ):
+        strategy.add_implementation(
+            wrap_compute_sort(topi.cuda.sort_thrust),
+            wrap_topi_schedule(topi.cuda.schedule_sort),
+            name="sort_thrust.cuda",
+            plevel=15,
+        )
     return strategy
diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py
index 18b3dd7e41cc..24a81e4dfa1f 100644
--- a/python/tvm/relay/op/strategy/generic.py
+++ b/python/tvm/relay/op/strategy/generic.py
@@ -769,6 +769,30 @@ def schedule_sparse_transpose(attrs, outs, target):
         return topi.generic.schedule_sparse_transpose(outs)
 
 
+# sort
+def wrap_compute_sort(topi_compute):
+    """Wrap sort topi compute"""
+
+    def _compute_sort(attrs, inputs, _):
+        axis = get_const_int(attrs.axis)
+        is_ascend = bool(get_const_int(attrs.is_ascend))
+        return [topi_compute(inputs[0], axis=axis, is_ascend=is_ascend)]
+
+    return _compute_sort
+
+
+@override_native_generic_func("sort_strategy")
+def sort_strategy(attrs, inputs, out_type, target):
+    """sort generic strategy"""
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_sort(topi.sort),
+        wrap_topi_schedule(topi.generic.schedule_sort),
+        name="sort.generic",
+    )
+    return strategy
+
+
 # argsort
 def wrap_compute_argsort(topi_compute):
     """Wrap argsort topi compute"""
diff --git a/python/tvm/topi/cuda/sort.py b/python/tvm/topi/cuda/sort.py
index 329f0fb897e5..47684b8f778e 100644
--- a/python/tvm/topi/cuda/sort.py
+++ b/python/tvm/topi/cuda/sort.py
@@ -316,6 +316,89 @@ def argsort_nms_thrust(data, valid_count, axis=-1, is_ascend=1, dtype="float32")
     return out[1]
 
 
+def sort(data, axis=-1, is_ascend=1):
+    """Performs sorting along the given axis and returns an array of indicies
+    having same shape as an input array that index data in sorted order.
+
+    Parameters
+    ----------
+    data: tvm.te.Tensor
+        The input array.
+
+    axis : int, optional
+        Axis along which to sort the input tensor.
+
+    is_ascend : boolean, optional
+        Whether to sort in ascending or descending order.
+
+    Returns
+    -------
+    out : tvm.te.Tensor
+        The output of this function.
+    """
+    dtype = "float32"
+    value_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "value_buf", data_alignment=8)
+    indices_buf = tvm.tir.decl_buffer(data.shape, dtype, "out_buf", data_alignment=8)
+    out = te.extern(
+        [data.shape, data.shape],
+        [data],
+        lambda ins, outs: sort_ir(ins[0], outs[0], axis, is_ascend, indices_out=outs[1]),
+        out_buffers=[value_buf, indices_buf],
+        name="sort_gpu",
+        tag="sort_gpu",
+    )[0]
+    return out
+
+
+def sort_thrust(data, axis=-1, is_ascend=1):
+    """Performs sorting along the given axis and returns an array of indicies
+    having same shape as an input array that index data in sorted order.
+
+    Parameters
+    ----------
+    data: tvm.te.Tensor
+        The input array.
+
+    axis : int, optional
+        Axis along which to sort the input tensor.
+
+    is_ascend : boolean, optional
+        Whether to sort in ascending or descending order.
+
+    Returns
+    -------
+    out : tvm.te.Tensor
+        The output of this function.
+    """
+    dtype = "float32"
+
+    ndim = len(data.shape)
+    axis = ndim + axis if axis < 0 else axis
+
+    if axis != ndim - 1:
+        # Prepare for sorting along axis -1.
+        axes = swap(list(range(ndim)), axis)
+        data = transpose(data, axes)
+
+    value_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "value_buf", data_alignment=8)
+    indices_buf = tvm.tir.decl_buffer(data.shape, dtype, "out_buf", data_alignment=8)
+    out = te.extern(
+        [data.shape, data.shape],
+        [data],
+        lambda ins, outs: tvm.tir.call_packed(
+            "tvm.contrib.thrust.sort", ins[0], outs[0], outs[1], is_ascend
+        ),
+        out_buffers=[value_buf, indices_buf],
+        name="sort_gpu",
+        tag="sort_gpu",
+    )[0]
+
+    if axis != ndim - 1:
+        axes = swap(list(range(ndim)), axis)
+        out = transpose(out, axes)
+    return out
+
+
 def argsort(data, valid_count=None, axis=-1, is_ascend=1, dtype="float32"):
     """Performs sorting along the given axis and returns an array of indicies
     having same shape as an input array that index data in sorted order.
@@ -408,6 +491,23 @@ def argsort_thrust(data, valid_count=None, axis=-1, is_ascend=1, dtype="float32"
     return out
 
 
+def schedule_sort(outs):
+    """Schedule for sort operator.
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+        The computation graph description of sort
+        in the format of an array of tensors.
+
+    Returns
+    -------
+    s: Schedule
+        The computation schedule for the op.
+    """
+    return _schedule_sort(outs)
+
+
 def schedule_argsort(outs):
     """Schedule for argsort operator.
diff --git a/python/tvm/topi/generic/sort.py b/python/tvm/topi/generic/sort.py
index 16e6a5d70881..65df7a1a2569 100644
--- a/python/tvm/topi/generic/sort.py
+++ b/python/tvm/topi/generic/sort.py
@@ -20,6 +20,23 @@
 from .default import default_schedule as _default_schedule
 
 
+def schedule_sort(outs):
+    """Schedule for sort operator.
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+        The computation graph description of sort
+        in the format of an array of tensors.
+
+    Returns
+    -------
+    s: Schedule
+        The computation schedule for the op.
+    """
+    return _default_schedule(outs, False)
+
+
 def schedule_argsort(outs):
     """Schedule for argsort operator.
diff --git a/python/tvm/topi/sort.py b/python/tvm/topi/sort.py
index 98a1080660fb..576d0424634f 100644
--- a/python/tvm/topi/sort.py
+++ b/python/tvm/topi/sort.py
@@ -14,13 +14,55 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-# pylint: disable=too-many-arguments
+# pylint: disable=too-many-arguments, unused-argument
 """Argsort operator"""
 import tvm
 from tvm import te
 
 from .utils import get_const_tuple
 
+def sort(data, axis=-1, is_ascend=1):
+    """Performs sorting along the given axis and returns an array
+    in sorted order.
+
+    Parameters
+    ----------
+    data : tvm.te.Tensor
+        The input tensor.
+
+    axis : int, optional
+        Axis along which to sort the input tensor.
+        The default is the last axis.
+
+    is_ascend : boolean, optional
+        Whether to sort in ascending or descending order.
+
+    dtype : string, optional
+        DType of the output indices.
+
+    Returns
+    -------
+    out : tvm.te.Tensor
+        The sorted tensor.
+
+    """
+    data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8)
+    out_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "out_buf", data_alignment=8)
+    out = te.extern(
+        data.shape,
+        [data],
+        lambda ins, outs: tvm.tir.call_packed(
+            "tvm.contrib.sort.sort", ins[0], outs[0], axis, is_ascend
+        ),
+        dtype=data.dtype,
+        in_buffers=[data_buf],
+        out_buffers=out_buf,
+        name="sort_cpu",
+        tag="sort_cpu",
+    )
+    return out
+
+
 def argsort(data, valid_count=None, axis=-1, is_ascend=1, dtype="float32"):
     """Performs sorting along the given axis and returns an array
     of indices having the same shape as an input array that index
diff --git a/src/relay/op/algorithm/sort.cc b/src/relay/op/algorithm/sort.cc
new file mode 100644
index 000000000000..69a6ae55c71d
--- /dev/null
+++ b/src/relay/op/algorithm/sort.cc
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file sort.cc
+ * \brief Sort operators
+ */
+#include <tvm/relay/attrs/algorithm.h>
+#include <tvm/relay/op.h>
+
+namespace tvm {
+namespace relay {
+
+bool SortRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+             const TypeReporter& reporter) {
+  // `types` contains: [data, result]
+  ICHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) {
+    ICHECK(types[0].as<IncompleteTypeNode>())
+        << "Sort: expect input type to be TensorType but get " << types[0];
+    return false;
+  }
+  reporter->Assign(types[1], TensorType(data->shape, data->dtype));
+  return true;
+}
+
+Expr MakeSort(Expr data, int axis, bool is_ascend) {
+  auto attrs = make_object<ArgsortAttrs>();
+  attrs->axis = axis;
+  attrs->is_ascend = is_ascend;
+  static const Op& op = Op::Get("sort");
+  return Call(op, {data}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_GLOBAL("relay.op._make.sort").set_body_typed(MakeSort);
+
+RELAY_REGISTER_OP("sort")
+    .describe(R"doc(Returns the input data sorted along
+the given axis.
+)doc" TVM_ADD_FILELINE)
+    .set_num_inputs(1)
+    .set_attrs_type<ArgsortAttrs>()
+    .add_argument("data", "Tensor", "Input data.")
+    .set_support_level(6)
+    .add_type_rel("Sort", SortRel);
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/runtime/contrib/sort/sort.cc b/src/runtime/contrib/sort/sort.cc
index 31cf38d7d7a5..fba57d923b38 100644
--- a/src/runtime/contrib/sort/sort.cc
+++ b/src/runtime/contrib/sort/sort.cc
@@ -125,7 +125,7 @@ TVM_REGISTER_GLOBAL("tvm.contrib.sort.argsort_nms").set_body([](TVMArgs args, TV
 });
 
 template <typename DataType, typename OutType>
-void argsort(DLTensor* input, DLTensor* output, int32_t axis, bool is_ascend) {
+void sort_impl(DLTensor* input, DLTensor* output, int32_t axis, bool is_ascend, bool is_argsort) {
   auto data_ptr = static_cast<DataType*>(input->data);
   auto out_ptr = static_cast<OutType*>(output->data);
   std::vector<std::pair<int64_t, DataType>> sorter;
@@ -153,13 +153,29 @@ void sort_impl(DLTensor* input, DLTensor* output, int32_t axis, bool is_ascend,
       } else {
         std::stable_sort(sorter.begin(), sorter.end(), CompareDescend<DataType>);
       }
-      for (int64_t k = 0; k < input->shape[axis]; ++k) {
-        out_ptr[base_idx + k * axis_mul_after] = static_cast<OutType>(sorter[k].first);
+      if (is_argsort) {
+        for (int64_t k = 0; k < input->shape[axis]; ++k) {
+          out_ptr[base_idx + k * axis_mul_after] = static_cast<OutType>(sorter[k].first);
+        }
+      } else {
+        for (int64_t k = 0; k < input->shape[axis]; ++k) {
+          out_ptr[base_idx + k * axis_mul_after] = static_cast<OutType>(sorter[k].second);
+        }
       }
     }
   }
 }
+
+template <typename DataType, typename OutType>
+void argsort(DLTensor* input, DLTensor* output, int32_t axis, bool is_ascend) {
+  return sort_impl<DataType, OutType>(input, output, axis, is_ascend, true);
+}
+
+template <typename DataType, typename OutType>
+void sort(DLTensor* input, DLTensor* output, int32_t axis, bool is_ascend) {
+  return sort_impl<DataType, OutType>(input, output, axis, is_ascend, false);
+}
+
 // Argsort implemented C library sort.
 // Return indices of sorted tensor.
 // By default, the last axis will be used to sort.
@@ -243,6 +259,47 @@ TVM_REGISTER_GLOBAL("tvm.contrib.sort.argsort").set_body([](TVMArgs args, TVMRet
   }
 });
 
+// Sort implemented C library sort.
+// Return sorted tensor.
+// By default, the last axis will be used to sort.
+// If the input tensor has dimension
+// (d0, d1, ..., d(k-1), dk, d(k+1), ..., d(n-1)) and the sort axis is dk,
+// the output tensor has the same shape and dtype as the input,
+// with the values along dk in sorted order.
+TVM_REGISTER_GLOBAL("tvm.contrib.sort.sort").set_body([](TVMArgs args, TVMRetValue* ret) {
+  DLTensor* input = args[0];
+  DLTensor* output = args[1];
+  int32_t axis = args[2];
+  bool is_ascend = args[3];
+  if (axis < 0) {
+    axis = input->ndim + axis;
+  }
+  ICHECK_LT(axis, input->ndim) << "Axis out of boundary for "
+                                  "input ndim "
+                               << input->ndim;
+
+  auto data_dtype = DLDataType2String(input->dtype);
+  auto out_dtype = DLDataType2String(output->dtype);
+
+  ICHECK_EQ(data_dtype, out_dtype);
+
+  if (data_dtype == "float32") {
+    sort<float, float>(input, output, axis, is_ascend);
+  } else if (data_dtype == "float64") {
+    sort<double, double>(input, output, axis, is_ascend);
+#if (__ARM_FEATURE_FP16_SCALAR_ARITHMETIC == 1)
+  } else if (data_dtype == "float16") {
+    sort<__fp16, __fp16>(input, output, axis, is_ascend);
+#endif
+  } else if (data_dtype == "int32") {
+    sort<int32_t, int32_t>(input, output, axis, is_ascend);
+  } else if (data_dtype == "int64") {
+    sort<int64_t, int64_t>(input, output, axis, is_ascend);
+  } else {
+    LOG(FATAL) << "Unsupported input dtype: " << data_dtype;
+  }
+});
+
 template <typename DataType, typename IndicesType>
 void topk(DLTensor* input, DLTensor* out_values, DLTensor* out_indices, int k, int axis,
           bool is_ascend) {
diff --git a/tests/python/relay/dyn/test_dynamic_op_level6.py b/tests/python/relay/dyn/test_dynamic_op_level6.py
index 52abbe2a15b6..ec6bd64d33e2 100644
--- a/tests/python/relay/dyn/test_dynamic_op_level6.py
+++ b/tests/python/relay/dyn/test_dynamic_op_level6.py
@@ -53,7 +53,9 @@ def verify_topk(k, axis, ret_type, is_ascend, dtype):
     np_indices = np_indices.astype(dtype)
 
     for target, ctx in tvm.testing.enabled_targets():
-        for kind in ["vm", "debug"]:
+        # only test with vm, graph runtime doesn't support dynamic shapes
+        # and debug runtime doesn't support legalization for cuda kernel
+        for kind in ["vm"]:
             mod = tvm.ir.IRModule.from_expr(func)
             intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target)
             op_res = intrp.evaluate()(np_data, np.array([k]).astype("float32"))
diff --git a/tests/python/relay/test_op_level6.py b/tests/python/relay/test_op_level6.py
index de51c1502603..6e687658c60c 100644
--- a/tests/python/relay/test_op_level6.py
+++ b/tests/python/relay/test_op_level6.py
@@ -23,6 +23,29 @@
 import tvm.testing
 
 
+@tvm.testing.uses_gpu
+def test_sort():
+    def verify_sort(shape, axis, is_ascend):
+        x = relay.var("x", relay.TensorType(shape, "float32"))
+        z = relay.sort(x, axis=axis, is_ascend=is_ascend)
+        func = relay.Function([x], z)
+        x_data = np.random.uniform(size=shape).astype("float32")
+        if is_ascend:
+            ref_res = np.sort(x_data, axis=axis)
+        else:
+            ref_res = -np.sort(-x_data, axis=axis)
+
+        for target, ctx in tvm.testing.enabled_targets():
+            for kind in ["graph", "debug"]:
+                intrp = relay.create_executor(kind, ctx=ctx, target=target)
+                op_res = intrp.evaluate(func)(x_data)
+                tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
+
+    verify_sort((2, 3, 4), axis=0, is_ascend=False)
+    verify_sort((1, 4, 6), axis=1, is_ascend=True)
+    verify_sort((3, 5, 6), axis=-1, is_ascend=False)
+
+
 @tvm.testing.uses_gpu
 def test_argsort():
     def verify_argsort(shape, axis, is_ascend, dtype):
@@ -95,5 +118,6 @@ def verify_topk(k, axis, ret_type, is_ascend, dtype):
 
 
 if __name__ == "__main__":
+    test_sort()
     test_argsort()
     test_topk()
diff --git a/tests/python/topi/python/test_topi_sort.py b/tests/python/topi/python/test_topi_sort.py
index 7e0c982e0c42..626218f30144 100644
--- a/tests/python/topi/python/test_topi_sort.py
+++ b/tests/python/topi/python/test_topi_sort.py
@@ -23,6 +23,11 @@
 import tvm.topi.testing
 import tvm.testing
 
+_sort_implement = {
+    "generic": (topi.sort, topi.generic.schedule_sort),
+    "gpu": (topi.cuda.sort, topi.cuda.schedule_sort),
+}
+
 _argsort_implement = {
     "generic": (topi.argsort, topi.generic.schedule_argsort),
     "gpu": (topi.cuda.argsort, topi.cuda.schedule_argsort),
@@ -34,6 +39,46 @@
 }
 
 
+def verify_sort(axis, is_ascend):
+    dshape = (20, 100)
+    data_dtype = "float32"
+    data = te.placeholder(dshape, name="data", dtype=data_dtype)
+
+    perm = np.arange(dshape[0] * dshape[1], dtype=data_dtype)
+    np.random.shuffle(perm)
+    np_data = perm.reshape(dshape)
+
+    if is_ascend:
+        np_sort = np.sort(np_data, axis=axis)
+    else:
+        np_sort = -np.sort(-np_data, axis=axis)
+
+    if axis == 0:
+        np_sort = np_sort[: dshape[axis], :]
+    else:
+        np_sort = np_sort[:, : dshape[axis]]
+
+    def check_device(device):
+        if not tvm.testing.device_enabled(device):
+            print("Skip because %s is not enabled" % device)
+            return
+        ctx = tvm.context(device, 0)
+        print("Running on target: %s" % device)
+        with tvm.target.Target(device):
+            fcompute, fschedule = tvm.topi.testing.dispatch(device, _sort_implement)
+            out = fcompute(data, axis=axis, is_ascend=is_ascend)
+            s = fschedule(out)
+
+        tvm_data = tvm.nd.array(np_data, ctx)
+        tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data_dtype), ctx)
+        f = tvm.build(s, [data, out], device)
+        f(tvm_data, tvm_out)
+        tvm.testing.assert_allclose(tvm_out.asnumpy(), np_sort, rtol=1e0)
+
+    for device in ["llvm", "cuda", "opencl"]:
+        check_device(device)
+
+
 def verify_argsort(axis, is_ascend):
     dshape = (20, 100)
     data_dtype = "float32"
@@ -126,6 +171,14 @@ def check_device(device):
         check_device(device)
 
 
+@tvm.testing.uses_gpu
+def test_sort():
+    np.random.seed(0)
+    for axis in [0, -1, 1]:
+        verify_sort(axis, True)
+        verify_sort(axis, False)
+
+
 @tvm.testing.uses_gpu
 def test_argsort():
     np.random.seed(0)

From b20e67162e451e6bfb51c8ccd062d75c2a319a5e Mon Sep 17 00:00:00 2001
From: mbrookhart
Date: Tue, 8 Dec 2020 14:12:28 -0700
Subject: [PATCH 2/6] fix lint

---
 python/tvm/relay/op/dyn/_algorithm.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/tvm/relay/op/dyn/_algorithm.py b/python/tvm/relay/op/dyn/_algorithm.py
index fa2678a459af..ba903e680bbd 100644
--- a/python/tvm/relay/op/dyn/_algorithm.py
+++ b/python/tvm/relay/op/dyn/_algorithm.py
@@ -20,7 +20,6 @@
 from tvm.te.hybrid import script
 from tvm.runtime import convert
 
-from tvm import topi
 from .. import strategy
 from .. import op as _reg

From d67cfdd931491c333422b7368c117702cb5774d2 Mon Sep 17 00:00:00 2001
From: mbrookhart
Date: Wed, 16 Dec 2020 12:12:58 -0700
Subject: [PATCH 3/6] fix sort docstring

---
 python/tvm/topi/cuda/sort.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/tvm/topi/cuda/sort.py b/python/tvm/topi/cuda/sort.py
index 47684b8f778e..b8873e0b546b 100644
--- a/python/tvm/topi/cuda/sort.py
+++ b/python/tvm/topi/cuda/sort.py
@@ -317,8 +317,8 @@ def argsort_nms_thrust(data, valid_count, axis=-1, is_ascend=1, dtype="float32")
 
 
 def sort(data, axis=-1, is_ascend=1):
-    """Performs sorting along the given axis and returns an array of indicies
-    having same shape as an input array that index data in sorted order.
+    """Performs sorting along the given axis and returns an array of
+    sorted values with teh same shape as the input data.
 
     Parameters
     ----------

From ea16f08de9e05d314028b32b23a437a20c166f40 Mon Sep 17 00:00:00 2001
From: mbrookhart
Date: Wed, 16 Dec 2020 15:39:59 -0700
Subject: [PATCH 4/6] fix docs

---
 python/tvm/topi/cuda/sort.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/tvm/topi/cuda/sort.py b/python/tvm/topi/cuda/sort.py
index b8873e0b546b..79f069d4cc5c 100644
--- a/python/tvm/topi/cuda/sort.py
+++ b/python/tvm/topi/cuda/sort.py
@@ -318,7 +318,7 @@ def argsort_nms_thrust(data, valid_count, axis=-1, is_ascend=1, dtype="float32")
 
 def sort(data, axis=-1, is_ascend=1):
     """Performs sorting along the given axis and returns an array of
-    sorted values with teh same shape as the input data.
+    sorted values with the same shape as the input data.
 
     Parameters
     ----------
@@ -351,8 +351,8 @@ def sort(data, axis=-1, is_ascend=1):
 
 
 def sort_thrust(data, axis=-1, is_ascend=1):
-    """Performs sorting along the given axis and returns an array of indicies
-    having same shape as an input array that index data in sorted order.
+    """Performs sorting along the given axis and returns an array of
+    sorted values with the same shape as the input data.
 
     Parameters
     ----------

From 79cee17b36355cd76346b6584edfe072e636bed2 Mon Sep 17 00:00:00 2001
From: mbrookhart
Date: Thu, 17 Dec 2020 09:10:32 -0700
Subject: [PATCH 5/6] add TODO, shape_func, cleanup

---
 python/tvm/relay/op/_algorithm.py                | 3 +++
 python/tvm/topi/cuda/sort.py                     | 3 +++
 python/tvm/topi/sort.py                          | 2 +-
 tests/python/relay/dyn/test_dynamic_op_level6.py | 4 +---
 4 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/python/tvm/relay/op/_algorithm.py b/python/tvm/relay/op/_algorithm.py
index 3b15f0112e4e..0f9a6e64f1ed 100644
--- a/python/tvm/relay/op/_algorithm.py
+++ b/python/tvm/relay/op/_algorithm.py
@@ -23,16 +23,19 @@
 
 from . import strategy
 from . import op as _reg
+from _tensor import elemwise_shape_func
 from .op import OpPattern, register_pattern
 from .op import register_strategy
 
 # sort
 register_strategy("sort", strategy.sort_strategy)
 register_pattern("sort", OpPattern.OPAQUE)
+register_shape_func("sort", False, elemwise_shape_func)
 
 # argsort
 register_strategy("argsort", strategy.argsort_strategy)
 register_pattern("argsort", OpPattern.OPAQUE)
+register_shape_func("argsort", False, elemwise_shape_func)
 
 # topk
 register_strategy("topk", strategy.topk_strategy)
diff --git a/python/tvm/topi/cuda/sort.py b/python/tvm/topi/cuda/sort.py
index 79f069d4cc5c..4162f53206a5 100644
--- a/python/tvm/topi/cuda/sort.py
+++ b/python/tvm/topi/cuda/sort.py
@@ -385,6 +385,9 @@ def sort_thrust(data, axis=-1, is_ascend=1):
     out = te.extern(
         [data.shape, data.shape],
         [data],
+        ## TODO(mbrookhart): This thrust function is actually doing argsort, not sort
+        ## For performance, we should probably rename the contrib function and add
+        ## a pure sort
         lambda ins, outs: tvm.tir.call_packed(
             "tvm.contrib.thrust.sort", ins[0], outs[0], outs[1], is_ascend
         ),
         out_buffers=[value_buf, indices_buf],
         name="sort_gpu",
         tag="sort_gpu",
     )[0]
diff --git a/python/tvm/topi/sort.py b/python/tvm/topi/sort.py
index 576d0424634f..8964e363b06f 100644
--- a/python/tvm/topi/sort.py
+++ b/python/tvm/topi/sort.py
@@ -14,7 +14,7 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-# pylint: disable=too-many-arguments, unused-argument
+# pylint: disable=too-many-arguments
 """Argsort operator"""
 import tvm
 from tvm import te
diff --git a/tests/python/relay/dyn/test_dynamic_op_level6.py b/tests/python/relay/dyn/test_dynamic_op_level6.py
index ec6bd64d33e2..52abbe2a15b6 100644
--- a/tests/python/relay/dyn/test_dynamic_op_level6.py
+++ b/tests/python/relay/dyn/test_dynamic_op_level6.py
@@ -53,9 +53,7 @@ def verify_topk(k, axis, ret_type, is_ascend, dtype):
     np_indices = np_indices.astype(dtype)
 
     for target, ctx in tvm.testing.enabled_targets():
-        # only test with vm, graph runtime doesn't support dynamic shapes
-        # and debug runtime doesn't support legalization for cuda kernel
-        for kind in ["vm"]:
+        for kind in ["vm", "debug"]:
             mod = tvm.ir.IRModule.from_expr(func)
             intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target)
             op_res = intrp.evaluate()(np_data, np.array([k]).astype("float32"))

From 2202b8477a4186622b76b45932cffac19c1ad32c Mon Sep 17 00:00:00 2001
From: mbrookhart
Date: Thu, 17 Dec 2020 09:26:50 -0700
Subject: [PATCH 6/6] add dynamic tests for sort and argsort

---
 python/tvm/relay/op/_algorithm.py    |  4 +--
 tests/python/relay/test_op_level6.py | 52 +++++++++++++++++++---------
 2 files changed, 37 insertions(+), 19 deletions(-)

diff --git a/python/tvm/relay/op/_algorithm.py b/python/tvm/relay/op/_algorithm.py
index 0f9a6e64f1ed..817f96b696df 100644
--- a/python/tvm/relay/op/_algorithm.py
+++ b/python/tvm/relay/op/_algorithm.py
@@ -23,9 +23,9 @@
 
 from . import strategy
 from . import op as _reg
-from _tensor import elemwise_shape_func
 from .op import OpPattern, register_pattern
-from .op import register_strategy
+from .op import register_strategy, register_shape_func
+from ._tensor import elemwise_shape_func
 
 # sort
 register_strategy("sort", strategy.sort_strategy)
diff --git a/tests/python/relay/test_op_level6.py b/tests/python/relay/test_op_level6.py
index 6e687658c60c..a5ce1fdcf589 100644
--- a/tests/python/relay/test_op_level6.py
+++ b/tests/python/relay/test_op_level6.py
@@ -25,8 +25,11 @@
 
 @tvm.testing.uses_gpu
 def test_sort():
-    def verify_sort(shape, axis, is_ascend):
-        x = relay.var("x", relay.TensorType(shape, "float32"))
+    def verify_sort(shape, axis, is_ascend, is_dyn=False):
+        if is_dyn:
+            x = relay.var("x", relay.TensorType([relay.Any()] * len(shape), "float32"))
+        else:
+            x = relay.var("x", relay.TensorType(shape, "float32"))
         z = relay.sort(x, axis=axis, is_ascend=is_ascend)
         func = relay.Function([x], z)
         x_data = np.random.uniform(size=shape).astype("float32")
@@ -35,21 +38,30 @@ def verify_sort(shape, axis, is_ascend):
         else:
             ref_res = -np.sort(-x_data, axis=axis)
 
+        if is_dyn:
+            backends = ["vm", "debug"]
+        else:
+            backends = ["graph", "debug"]
         for target, ctx in tvm.testing.enabled_targets():
-            for kind in ["graph", "debug"]:
-                intrp = relay.create_executor(kind, ctx=ctx, target=target)
-                op_res = intrp.evaluate(func)(x_data)
+            for kind in backends:
+                mod = tvm.ir.IRModule.from_expr(func)
+                intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target)
+                op_res = intrp.evaluate()(x_data)
                 tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
 
-    verify_sort((2, 3, 4), axis=0, is_ascend=False)
-    verify_sort((1, 4, 6), axis=1, is_ascend=True)
-    verify_sort((3, 5, 6), axis=-1, is_ascend=False)
+    for is_dyn in [False, True]:
+        verify_sort((2, 3, 4), axis=0, is_ascend=False, is_dyn=is_dyn)
+        verify_sort((1, 4, 6), axis=1, is_ascend=True, is_dyn=is_dyn)
+        verify_sort((3, 5, 6), axis=-1, is_ascend=False, is_dyn=is_dyn)
 
 
 @tvm.testing.uses_gpu
 def test_argsort():
-    def verify_argsort(shape, axis, is_ascend, dtype):
-        x = relay.var("x", relay.TensorType(shape, "float32"))
+    def verify_argsort(shape, axis, is_ascend, dtype, is_dyn=False):
+        if is_dyn:
+            x = relay.var("x", relay.TensorType([relay.Any()] * len(shape), "float32"))
+        else:
+            x = relay.var("x", relay.TensorType(shape, "float32"))
         z = relay.argsort(x, axis=axis, is_ascend=is_ascend, dtype=dtype)
         func = relay.Function([x], z)
         x_data = np.random.uniform(size=shape).astype("float32")
@@ -58,16 +70,22 @@ def verify_argsort(shape, axis, is_ascend, dtype):
         else:
             ref_res = np.argsort(-x_data, axis=axis)
 
+        if is_dyn:
+            backends = ["vm", "debug"]
+        else:
+            backends = ["graph", "debug"]
         for target, ctx in tvm.testing.enabled_targets():
-            for kind in ["graph", "debug"]:
-                intrp = relay.create_executor(kind, ctx=ctx, target=target)
-                op_res = intrp.evaluate(func)(x_data)
+            for kind in backends:
+                mod = tvm.ir.IRModule.from_expr(func)
+                intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target)
+                op_res = intrp.evaluate()(x_data)
                 tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.astype(dtype), rtol=1e-5)
 
-    for dtype in ["int32", "int64", "float32", "float64"]:
-        verify_argsort((2, 3, 4), axis=0, is_ascend=False, dtype=dtype)
-        verify_argsort((1, 4, 6), axis=1, is_ascend=True, dtype=dtype)
-        verify_argsort((3, 5, 6), axis=-1, is_ascend=False, dtype=dtype)
+    for is_dyn in [False, True]:
+        for dtype in ["int32", "int64", "float32", "float64"]:
+            verify_argsort((2, 3, 4), axis=0, is_ascend=False, dtype=dtype, is_dyn=is_dyn)
+            verify_argsort((1, 4, 6), axis=1, is_ascend=True, dtype=dtype, is_dyn=is_dyn)
+            verify_argsort((3, 5, 6), axis=-1, is_ascend=False, dtype=dtype, is_dyn=is_dyn)
 
 
 @tvm.testing.uses_gpu