Dense strategy (apache#5)
* dense

* add biforst; remove comments

* address comment
comaniac authored and icemelon committed Jan 9, 2020
1 parent eadab0d commit efbeb19
Showing 11 changed files with 159 additions and 314 deletions.
32 changes: 32 additions & 0 deletions python/tvm/relay/op/strategy/bifrost.py
@@ -0,0 +1,32 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Definition of biforst operator strategy."""
# pylint: disable=invalid-name,unused-argument

from __future__ import absolute_import

import topi
from .generic import *
from .. import op as _op


@dense_strategy.register(["bifrost"])
def dense_strategy_bifrost(attrs, inputs, out_type, target):
    """dense bifrost strategy"""
    strategy = _op.OpStrategy()
    strategy.add_implement(wrap_compute_dense(topi.bifrost.dense),
                           wrap_topi_schedule(topi.bifrost.schedule_dense))
    return strategy
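
For orientation: the wrap_compute_dense and wrap_topi_schedule helpers that these strategy files pull in from .generic adapt plain TOPI functions to the strategy compute/schedule interfaces. The sketch below is an approximation of what such adapters look like, not the actual code in generic.py; treat every name and signature in it as an assumption.

def wrap_compute_dense(topi_compute):
    """Adapt a TOPI dense compute to the strategy compute signature (sketch only)."""
    def _compute_dense(attrs, inputs, out_type):
        # Fall back to the input dtype when no explicit out_dtype is set.
        out_dtype = attrs.out_dtype or inputs[0].dtype
        return [topi_compute(inputs[0], inputs[1], None, out_dtype)]
    return _compute_dense

def wrap_topi_schedule(topi_schedule):
    """Adapt a TOPI schedule to the strategy schedule signature (sketch only)."""
    def _schedule(attrs, outs, target):
        with target:
            return topi_schedule(outs)
    return _schedule
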
9 changes: 4 additions & 5 deletions python/tvm/relay/op/strategy/cuda.py
@@ -122,21 +122,20 @@ def conv1d_transpose_strategy_cuda(attrs, inputs, out_type, target):
@dense_strategy.register(["cuda", "gpu"])
def dense_strategy_cuda(attrs, inputs, out_type, target):
"""dense cuda strategy"""
# Todo(@icemelon9): update dense strategy
strategy = _op.OpStrategy()
if out_type.dtype == "int8":
strategy.add_implement(wrap_compute_dense(topi.cuda.dense_int8),
wrap_topi_schedule(topi.cuda.schedule_dense_int8))
else:
strategy.add_implement(wrap_compute_dense(topi.nn.dense),
strategy.add_implement(wrap_compute_dense(topi.cuda.dense_small_batch),
wrap_topi_schedule(topi.cuda.schedule_dense_small_batch))
b = inputs[0].shape[0]
with SpecializedCondition(b >= 32):
strategy.add_implement(wrap_compute_dense(topi.nn.dense),
strategy.add_implement(wrap_compute_dense(topi.cuda.dense_large_batch),
wrap_topi_schedule(topi.cuda.schedule_dense_large_batch))
if target.target_name == "cuda" and "cublas" in target.libs:
strategy.add_implement(wrap_compute_dense(topi.cuda.dense_cblas),
wrap_topi_schedule(topi.generic.schedule_extern), 5)
strategy.add_implement(wrap_compute_dense(topi.cuda.dense_cublas),
wrap_topi_schedule(topi.cuda.schedule_dense_cublas), 5)
return strategy

@batch_matmul_strategy.register(["cuda", "gpu"])
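
Putting the added lines above together, the CUDA dense strategy now dispatches on dtype and batch size, and prefers cuBLAS when the library is enabled in the target. The following is a reconstructed view of the resulting function for readability; the block nesting and indentation are inferred from the diff, so treat it as a reading aid rather than the exact committed text.

@dense_strategy.register(["cuda", "gpu"])
def dense_strategy_cuda(attrs, inputs, out_type, target):
    """dense cuda strategy"""
    strategy = _op.OpStrategy()
    if out_type.dtype == "int8":
        strategy.add_implement(wrap_compute_dense(topi.cuda.dense_int8),
                               wrap_topi_schedule(topi.cuda.schedule_dense_int8))
    else:
        strategy.add_implement(wrap_compute_dense(topi.cuda.dense_small_batch),
                               wrap_topi_schedule(topi.cuda.schedule_dense_small_batch))
        b = inputs[0].shape[0]
        with SpecializedCondition(b >= 32):
            strategy.add_implement(wrap_compute_dense(topi.cuda.dense_large_batch),
                                   wrap_topi_schedule(topi.cuda.schedule_dense_large_batch))
    if target.target_name == "cuda" and "cublas" in target.libs:
        strategy.add_implement(wrap_compute_dense(topi.cuda.dense_cublas),
                               wrap_topi_schedule(topi.cuda.schedule_dense_cublas), 5)
    return strategy
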
14 changes: 14 additions & 0 deletions python/tvm/relay/op/strategy/mali.py
@@ -16,3 +16,17 @@
# under the License.
"""Definition of mali operator strategy."""
# pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import

from __future__ import absolute_import

import topi
from .generic import *
from .. import op as _op

@dense_strategy.register(["mali"])
def dense_strategy_mali(attrs, inputs, out_type, target):
    """dense mali strategy"""
    strategy = _op.OpStrategy()
    strategy.add_implement(wrap_compute_dense(topi.mali.dense),
                           wrap_topi_schedule(topi.mali.schedule_dense))
    return strategy
15 changes: 15 additions & 0 deletions python/tvm/relay/op/strategy/rocm.py
@@ -32,3 +32,18 @@ def schedule_l2_normalize_rocm(attrs, outs, target):
    """schedule L2 normalize for rocm"""
    with target:
        return topi.rocm.schedule_l2_normalize(outs)

@dense_strategy.register(["rocm"])
def dense_strategy_rocm(attrs, inputs, out_type, target):
    """Dense strategy for ROCm"""
    strategy = _op.OpStrategy()
    assert len(inputs[0].shape) == 2 and len(inputs[1].shape) == 2, "Only support 2-dim dense"

    strategy.add_implement(wrap_compute_dense(topi.rocm.dense),
                           wrap_topi_schedule(topi.rocm.schedule_dense))
    if target.target_name == "rocm" and "rocblas" in target.libs:
        assert out_type.dtype == inputs[0].dtype, "Mixed precision not supported."
        strategy.add_implement(
            wrap_compute_dense(topi.rocm.dense_cblas),
            wrap_topi_schedule(topi.generic.schedule_extern), 5)
    return strategy
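
The trailing 5 passed to add_implement is a priority: among applicable implementations, the highest-priority one is chosen, so the rocBLAS extern wins over the plain TOPI schedule whenever the library is enabled in the target. A hypothetical usage sketch follows; it is not part of this patch, and the exact API spellings on this development branch may differ.

# Hypothetical usage sketch (not part of this commit).
import tvm
from tvm import relay

data = relay.var("data", shape=(16, 64))
weight = relay.var("weight", shape=(32, 64))
func = relay.Function([data, weight], relay.nn.dense(data, weight))

# With "-libs=rocblas" the extern implementation registered above (priority 5)
# is preferred; without it the plain topi.rocm dense schedule is used.
graph, lib, params = relay.build(func, target="rocm -libs=rocblas")
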
11 changes: 7 additions & 4 deletions topi/python/topi/bifrost/dense.py
@@ -15,19 +15,22 @@
# specific language governing permissions and limitations
# under the License.
# pylint: disable=invalid-name,unused-variable
"""dense schedule on ARM Mali GPU"""
"""dense schedule on ARM Mali Biforst GPU"""

from __future__ import absolute_import as _abs

import tvm
from tvm import autotvm

from .. import generic, nn
from .. import nn
from ..util import traverse_inline

autotvm.register_topi_compute(nn.dense, 'bifrost', 'direct', nn.dense.fdefault)
@autotvm.register_topi_compute2('dense_bifrost')
def dense(_, data, weight, bias=None, out_dtype=None):
    """Dense operator on Bifrost"""
    return nn.dense(data, weight, bias, out_dtype)

@autotvm.register_topi_schedule(generic.schedule_dense, 'bifrost', 'direct')
@autotvm.register_topi_schedule2('dense_bifrost')
def schedule_dense(cfg, outs):
"""Schedule for dense operator.
2 changes: 1 addition & 1 deletion topi/python/topi/cuda/__init__.py
@@ -31,7 +31,7 @@
from .reduction import schedule_reduce
from .softmax import schedule_softmax
from .injective import schedule_injective, schedule_elemwise, schedule_broadcast
from .dense import schedule_dense
from .dense import *
from .pooling import *
from .nn import schedule_lrn, schedule_l2_normalize
from .batch_matmul import *
114 changes: 20 additions & 94 deletions topi/python/topi/cuda/dense.py
@@ -23,54 +23,16 @@
from tvm.autotvm.task.space import SplitEntity
from tvm.contrib import cublas
from .tensor_intrin import dp4a
from ..nn.dense import dense, dense_default
from .. import nn
from .. import tag
from .. import generic
from ..util import traverse_inline, get_const_tuple

logger = logging.getLogger('topi')


#@autotvm.register_topi_compute(dense, ["cuda", "gpu"], "direct")
def dense_cuda(cfg, data, weight, bias=None, out_dtype=None):
"""Dense operator for cuda backend.
Parameters
----------
data : tvm.Tensor
2-D with shape [batch, in_dim]
weight : tvm.Tensor
2-D with shape [out_dim, in_dim]
bias : tvm.Tensor, optional
1-D with shape [out_dim]
Returns
-------
output : tvm.Tensor
2-D with shape [batch, out_dim]
"""
# pylint: disable=unused-argument
assert len(data.shape) == 2 and len(weight.shape) == 2, \
"only support 2-dim dense"
if bias is not None:
assert len(bias.shape) == 1
if out_dtype is None:
out_dtype = data.dtype
batch, in_dim = data.shape
out_dim, _ = weight.shape
target = tvm.target.current_target()
if "cublas" in target.libs:
matmul = cublas.matmul(data, weight, False, True, out_dtype)
if bias is not None:
matmul = tvm.compute((batch, out_dim), \
lambda i, j: matmul[i, j] + bias[j], \
tag=tag.BROADCAST)
return matmul
return dense_default(data, weight, bias, out_dtype)

@autotvm.register_topi_compute2("dense_cublas.cuda")
def dense_cublas(data, weight, bias=None, out_dtype=None):
"""Dense operator on CUDA with CUBLAS"""
assert len(data.shape) == 2 and len(weight.shape) == 2, \
"only support 2-dim dense"
if bias is not None:
@@ -87,61 +49,20 @@ def dense_cublas(data, weight, bias=None, out_dtype=None):
tag=tag.BROADCAST)
return matmul

@autotvm.register_topi_schedule(generic.schedule_dense, ["cuda", "gpu"], "direct")
def schedule_dense(cfg, outs):
"""Schedule for dense operator.
Parameters
----------
outs: Array of Tensor
The computation graph description of dense
in the format of an array of tensors.
Returns
-------
s: Schedule
The computation schedule for dense.
"""
# pylint: disable=unused-argument
target = tvm.target.current_target()

outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
if target.target_name == "cuda" and "cublas" in target.libs:
return generic.schedule_extern(outs)

s = tvm.create_schedule([x.op for x in outs])

def _schedule(C):
A, _ = C.op.input_tensors
batch, _ = get_const_tuple(A.shape)
if batch < 32:
return schedule_dense_small_batch(cfg, s, C)
return schedule_dense_large_batch(cfg, s, C)

scheduled_ops = []

def traverse(OP):
"""Internal traverse function"""
# inline all one-to-one-mapping operators except the last stage (output)
if tag.is_broadcast(OP.tag):
if OP not in s.outputs:
s[OP].compute_inline()
for tensor in OP.input_tensors:
if isinstance(tensor.op, tvm.tensor.ComputeOp) and tensor.op not in scheduled_ops:
traverse(tensor.op)
# schedule dense
elif OP.tag == 'dense':
Dense = OP.output(0)
_schedule(Dense)
else:
raise RuntimeError("Unsupported operator: %s" % OP.tag)
@autotvm.register_topi_schedule2("dense_cublas.cuda")
def schedule_dense_cublas(_, outs):
"""Schedule dense operator using CUBLAS"""
return generic.schedule_extern(outs)

scheduled_ops.append(OP)

traverse(outs[0].op)
return s
@autotvm.register_topi_compute2("dense_small_batch.cuda")
def dense_small_batch(_, data, weight, bias=None, out_dtype=None):
"""Dense operator on CUDA"""
return nn.dense(data, weight, bias, out_dtype)


@autotvm.register_topi_schedule2("dense_small_batch.cuda")
def schedule_dense_small_batch(cfg, s, C):
"""Schedule float32/64 dense with small batch size"""
A, _ = C.op.input_tensors
@@ -168,6 +89,14 @@ def schedule_dense_small_batch(cfg, s, C):
s[C].set_store_predicate(thread_x.var.equal(0))
s[Out].set_store_predicate(thread_x.var.equal(0))


@autotvm.register_topi_compute2("dense_large_batch.cuda")
def dense_large_batch(_, data, weight, bias=None, out_dtype=None):
"""Dense operator on CUDA"""
return nn.dense(data, weight, bias, out_dtype)


@autotvm.register_topi_schedule2("dense_large_batch.cuda")
def schedule_dense_large_batch(cfg, s, C):
"""Schedule float32/64 dense with large batch size"""
A, B = C.op.input_tensors
@@ -266,8 +195,6 @@ def schedule_dense_large_batch(cfg, s, C):
s[BB].bind(tx, tvm.thread_axis("threadIdx.x"))
s[BB].double_buffer()

#@autotvm.register_topi_compute(dense, ['cuda'], ['int8'])

@autotvm.register_topi_compute2("dense_int8.cuda")
def dense_int8(cfg, data, weight, bias=None, out_dtype=None):
"""Dense operator for int8 on CUDA"""
@@ -304,7 +231,6 @@ def dense_int8(cfg, data, weight, bias=None, out_dtype=None):
return matmul


#@autotvm.register_topi_schedule(generic.schedule_dense, ['cuda', 'gpu'], ['int8'])
@autotvm.register_topi_schedule2("dense_int8.cuda")
def schedule_dense_int8(cfg, outs):
"""Dense schedule for int8 on CUDA"""
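
The hand-rolled traverse helper removed above is replaced by topi.util.traverse_inline in the new schedules. Roughly, that utility inlines injective stages and hands each remaining operation to a callback. The sketch below approximates its behaviour and is not the actual implementation.

def traverse_inline(s, final_op, callback):
    """Approximate behaviour of topi.util.traverse_inline (sketch only)."""
    visited = set()

    def _traverse(op):
        if op in visited:
            return
        visited.add(op)
        if tag.is_injective(op.tag):
            # Inline elementwise/broadcast stages that are not outputs.
            if op not in s.outputs:
                s[op].compute_inline()
            for tensor in op.input_tensors:
                if isinstance(tensor.op, tvm.tensor.ComputeOp):
                    _traverse(tensor.op)
        callback(op)

    _traverse(final_op)
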
38 changes: 23 additions & 15 deletions topi/python/topi/mali/dense.py
@@ -22,12 +22,18 @@
import tvm
from tvm import autotvm

from .. import generic, nn
from .. import nn
from ..util import traverse_inline

autotvm.register_topi_compute(nn.dense, 'mali', 'direct', nn.dense.fdefault)

@autotvm.register_topi_schedule(generic.schedule_dense, 'mali', 'direct')

@autotvm.register_topi_compute2('dense.mali')
def dense(_, data, weight, bias=None, out_dtype=None):
"""Dense operator on Mali"""
return nn.dense(data, weight, bias, out_dtype)


@autotvm.register_topi_schedule2('dense.mali')
def schedule_dense(cfg, outs):
"""Schedule for dense operator.
@@ -52,11 +58,11 @@ def _callback(op):
vec_size = [1, 2, 4, 8, 16]
max_unroll = 32

dense = op.output(0)
dense_out = op.output(0)
output = outs[0]

y, x = s[output].op.axis
c = s[dense].op.reduce_axis[0]
c = s[dense_out].op.reduce_axis[0]

##### space definition begin #####
cfg.define_split('tile_y', y, num_outputs=3)
@@ -70,8 +76,8 @@ def _callback(op):
cfg.fallback_with_reference_log(ref_log)
##### space definition end #####

if dense.op in s.outputs:
dense = s.cache_write(output, 'local')
if dense_out.op in s.outputs:
dense_out = s.cache_write(output, 'local')

by, ty, yi = cfg['tile_y'].apply(s, output, y)
bx, tx, xi = cfg['tile_x'].apply(s, output, x)
@@ -85,23 +91,25 @@ def _callback(op):
s[output].unroll(yi)
if cfg['tile_x'].size[-1] in vec_size:
s[output].vectorize(xi)
s[dense].compute_at(s[output], tx)
s[dense_out].compute_at(s[output], tx)

k = s[dense].op.reduce_axis[0]
y, x = s[dense].op.axis
k, k_unroll = cfg['c_unroll'].apply(s, dense, k)
s[dense].reorder(k, k_unroll, y, x)
s[dense].unroll(k_unroll)
k = s[dense_out].op.reduce_axis[0]
y, x = s[dense_out].op.axis
k, k_unroll = cfg['c_unroll'].apply(s, dense_out, k)
s[dense_out].reorder(k, k_unroll, y, x)
s[dense_out].unroll(k_unroll)
if cfg['tile_y'].size[-1] < max_unroll:
s[dense].unroll(y)
s[dense_out].unroll(y)
if cfg['tile_x'].size[-1] in vec_size:
s[dense].vectorize(x)
s[dense_out].vectorize(x)

traverse_inline(s, outs[0].op, _callback)
return s


def fuse_and_bind(s, tensor, axis=None, num_thread=None):
""" fuse all the axis and bind to GPU threads """
# TODO(@comaniac): figure out where this function is used.
axis = axis or s[tensor].op.axis
fused = s[tensor].fuse(*axis)
bx, tx = s[tensor].split(fused, num_thread)