[Matmul] Add matmul op #8234

Merged: 29 commits, Jun 30, 2021. Showing changes from 11 commits.
18 changes: 14 additions & 4 deletions include/tvm/relay/attrs/nn.h
@@ -961,19 +961,29 @@ struct AvgPool3DAttrs : public tvm::AttrsNode<AvgPool3DAttrs> {
  }
};

-/*! \brief Attributes for dense operator */
-struct DenseAttrs : public tvm::AttrsNode<DenseAttrs> {
+/*! \brief Attributes for matmul operator and dense operator */
+struct MatmulAttrs : public tvm::AttrsNode<MatmulAttrs> {
   IndexExpr units;
-  tvm::String auto_scheduler_rewritten_layout;  // The layout after auto-scheduler's layout rewrite
   DataType out_dtype;
+  bool data_transposed;
+  bool weight_transposed;
+  tvm::String auto_scheduler_rewritten_layout;  // The layout after auto-scheduler's layout rewrite

-  TVM_DECLARE_ATTRS(DenseAttrs, "relay.attrs.DenseAttrs") {
+  TVM_DECLARE_ATTRS(MatmulAttrs, "relay.attrs.MatmulAttrs") {
     TVM_ATTR_FIELD(units).describe("Number of hidden units of the dense transformation.");

     // use 0 bits to indicate none.
     TVM_ATTR_FIELD(out_dtype)
         .set_default(NullValue<DataType>())
         .describe("Output data type, set to explicit type under mixed precision setting");
+
+    TVM_ATTR_FIELD(data_transposed)
+        .set_default(false)
+        .describe("Whether the input tensor is in transposed format.");
+
+    TVM_ATTR_FIELD(weight_transposed)
+        .set_default(false)
+        .describe("Whether the weight tensor is in transposed format.");
   }
 };

14 changes: 13 additions & 1 deletion python/tvm/relay/frontend/tensorflow.py
@@ -44,6 +44,10 @@

__all__ = ["from_tensorflow"]

+# By default, TVM converts `tf.matmul` to `nn.dense`, with the data tensor non-transposed
+# and the weight tensor transposed (set via `from_tensorflow`'s `use_dense_op` argument).
+_USE_DENSE_INSTEAD_OF_MATMUL = True

# compatible operators that do NOT require any conversion.
_identity_list = []

@@ -1204,7 +1208,7 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None):
return func, self._params


-def from_tensorflow(graph, layout="NHWC", shape=None, outputs=None):
+def from_tensorflow(graph, layout="NHWC", shape=None, outputs=None, use_dense_op=True):
Contributor: I don't think we should have a flag here. We should just commit to one codepath.

Contributor Author: The problem is that we're not able to remove all of the nn.dense uses at this moment, and there aren't enough AutoTVM templates for nn.matmul. So nn.matmul can only be seen as an experimental feature for now. We should not change the default behavior, since that may affect those who are currently using nn.dense.

Contributor: Can't we use the dense schedules when A_transpose=false and B_transpose=true? Then we could convert all nn.dense to nn.matmul.

Contributor: This PR already uses the dense schedule for matmul_nt when lowering to TOPI. On the other hand, as @jcf94 mentioned in the PR comment, converting everything at once would affect many more places in the codebase, so gradually converting them instead of doing it in a single PR sounds reasonable to me.
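The equivalence this thread relies on can be sketched as follows, using the `relay.nn.matmul` signature added in this diff (shapes are made up for illustration):

```python
from tvm import relay

x = relay.var("x", shape=(4, 16), dtype="float32")
w = relay.var("w", shape=(8, 16), dtype="float32")  # stored as (units, units_in), i.e. pre-transposed

# nn.dense implicitly assumes a non-transposed data tensor and a transposed weight
# tensor, so it computes the same result as matmul_nt below and can share its schedules.
dense_form = relay.nn.dense(x, w, units=8)
matmul_nt_form = relay.nn.matmul(x, w, units=8, data_transposed=False, weight_transposed=True)
```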

"""Load tensorflow graph which is a python tensorflow graph object into relay.
The companion parameters will be handled automatically.

@@ -1222,6 +1226,11 @@ def from_tensorflow(graph, layout="NHWC", shape=None, outputs=None):
outputs : List of output tensor names (Optional)
if not specified then the last node is assumed as graph output.

+    use_dense_op : bool (Optional)
+        True to convert `tf.matmul` to `nn.dense`, otherwise to `nn.matmul`.
+        The `nn.dense` op requires the data tensor to be non-transposed and the weight
+        tensor to be transposed, so extra `transpose` ops may be inserted into the graph.

Returns
-------
mod : tvm.IRModule
@@ -1230,6 +1239,9 @@
params : dict of str to tvm.nd.NDArray
Dict of converted parameters stored in tvm.nd.NDArray format
"""
+    global _USE_DENSE_INSTEAD_OF_MATMUL
Contributor: Is it possible to avoid using this global variable? I'm not familiar with the importer, but it would be nice if we could use an importer config dict or something.

Contributor Author: Yeah, I've tried several approaches, but there seems to be no better solution from my point of view. A Python module can be seen as a const singleton, so this should be safe as long as the from_tensorflow function is the only entry point.

Contributor: I find this confusing too. If _USE_DENSE_INSTEAD_OF_MATMUL is not supposed to be changed by users directly, we should improve the comments on this global variable; please see my comment at its definition. Btw, in this case we can simply write `_USE_DENSE_INSTEAD_OF_MATMUL = use_dense_op` without checking whether the values differ.
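For reference, the config-dict alternative the reviewer suggests might look roughly like this (a hypothetical sketch, not part of this PR; `GraphProto` takes no such argument here):

```python
# Hypothetical alternative: thread importer options through explicitly
# instead of mutating a module-level global.
_DEFAULT_IMPORTER_OPTIONS = {"use_dense_op": True}


def from_tensorflow(graph, layout="NHWC", shape=None, outputs=None, options=None):
    opts = dict(_DEFAULT_IMPORTER_OPTIONS, **(options or {}))
    g = GraphProto(opts)  # hypothetical: GraphProto would store and consult opts
    return g.from_tensorflow(graph, layout, shape, outputs)
```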

+    if use_dense_op != _USE_DENSE_INSTEAD_OF_MATMUL:
+        _USE_DENSE_INSTEAD_OF_MATMUL = use_dense_op

     g = GraphProto()
     mod, params = g.from_tensorflow(graph, layout, shape, outputs)
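Usage of the new flag, as a minimal sketch (assumes `graph_def` is an already-loaded TensorFlow 1.x GraphDef):

```python
from tvm.relay.frontend import from_tensorflow

# Opt into the experimental nn.matmul conversion instead of the default nn.dense.
mod, params = from_tensorflow(graph_def, layout="NHWC", use_dense_op=False)
```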
24 changes: 19 additions & 5 deletions python/tvm/relay/frontend/tensorflow_ops.py
@@ -1113,13 +1113,27 @@ def _impl(inputs, attr, params, mod):

 def _matmul():
     def _impl(inputs, attr, params, mod):
+        from .tensorflow import _USE_DENSE_INSTEAD_OF_MATMUL
+
         channels = _infer_channels(inputs[1], not attr["transpose_b"])
-        if attr["transpose_a"]:
-            inputs[0] = _op.transpose(inputs[0], axes=(1, 0))
-        if not attr["transpose_b"]:
-            inputs[1] = _op.transpose(inputs[1], axes=(1, 0))
+        if _USE_DENSE_INSTEAD_OF_MATMUL:
+            if attr["transpose_a"]:
+                inputs[0] = _op.transpose(inputs[0], axes=(1, 0))
+            if not attr["transpose_b"]:
+                inputs[1] = _op.transpose(inputs[1], axes=(1, 0))
+            return AttrCvt(
+                op_name="dense",
+                extras={"units": channels},
+                ignores=["transpose_a", "transpose_b", "T"],
+            )(inputs, attr)
         return AttrCvt(
-            op_name="dense", extras={"units": channels}, ignores=["transpose_a", "transpose_b", "T"]
+            op_name="matmul",
+            extras={
+                "units": channels,
+                "data_transposed": attr["transpose_a"] or False,
+                "weight_transposed": attr["transpose_b"] or False,
+            },
+            ignores=["transpose_a", "transpose_b", "T"],
         )(inputs, attr)

     return _impl
43 changes: 40 additions & 3 deletions python/tvm/relay/op/nn/_nn.py
@@ -52,6 +52,32 @@
reg.register_pattern("nn.log_softmax", OpPattern.OPAQUE)


@reg.register_legalize("nn.matmul")
def leaglize_matmul(attrs, inputs, types):
jcf94 marked this conversation as resolved.
Show resolved Hide resolved
"""Legalize matmul op.
FrozenGene marked this conversation as resolved.
Show resolved Hide resolved

Parameters
----------
attrs : tvm.ir.Attrs
Attributes of current convolution
jcf94 marked this conversation as resolved.
Show resolved Hide resolved
inputs : list of tvm.relay.Expr
The args of the Relay expr to be legalized
types : list of types
List of input and output types

Returns
-------
result : tvm.relay.Expr
The legalized expr
"""
return topi.nn.matmul_legalize(attrs, inputs, types)


# matmul
reg.register_strategy("nn.matmul", strategy.matmul_strategy)
reg.register_pattern("nn.matmul", reg.OpPattern.OUT_ELEMWISE_FUSABLE)


@reg.register_legalize("nn.dense")
def legalize_dense(attrs, inputs, types):
"""Legalize dense op.
@@ -1149,21 +1175,32 @@ def batch_flatten_shape_func(attrs, inputs, _):


 @script
-def _dense_shape_func(data_shape, weight_shape):
+def _matmul_shape_func(data_shape, weight_shape, data_transposed, weight_transposed):
     out = output_tensor((data_shape.shape[0],), "int64")
     for i in const_range(out.shape[0] - 1):
         out[i] = data_shape[i]
-    out[out.shape[0] - 1] = weight_shape[0]
+    if data_transposed:
+        out[out.shape[0] - 2] = data_shape[out.shape[0] - 1]
+    out[out.shape[0] - 1] = weight_shape[0] if weight_transposed else weight_shape[1]

Contributor: This seems really complicated. Shouldn't it just be some part of data_shape and weight_shape, depending on the transposes?

Contributor Author: Since the data tensor can have more than two dimensions, this is the simplest implementation.

     return out
+
+
+@reg.register_shape_func("nn.matmul", False)
+def matmul_shape_func(attrs, inputs, _):
+    """
+    Shape function for matmul op.
+    """
+    ret = [_matmul_shape_func(inputs[0], inputs[1], attrs.data_transposed, attrs.weight_transposed)]
+    return ret


 @reg.register_shape_func("nn.dense", False)
 def dense_shape_func(attrs, inputs, _):
     """
     Shape function for dense op.
     """
-    ret = [_dense_shape_func(inputs[0], inputs[1])]
+    ret = [_matmul_shape_func(inputs[0], inputs[1], False, True)]
     return ret


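To unpack what the shape function computes, here is a plain-Python sketch of the same logic with a few worked cases (illustration only; the helper name is made up):

```python
def matmul_out_shape(data_shape, weight_shape, data_transposed, weight_transposed):
    # Leading dims come from the data tensor; the last two dims are chosen
    # according to the transpose flags, mirroring _matmul_shape_func above.
    out = list(data_shape)
    if data_transposed:
        out[-2] = data_shape[-1]
    out[-1] = weight_shape[0] if weight_transposed else weight_shape[1]
    return out


assert matmul_out_shape([4, 16], [16, 8], False, False) == [4, 8]
assert matmul_out_shape([4, 16], [8, 16], False, True) == [4, 8]    # the nn.dense layout
assert matmul_out_shape([16, 4], [16, 8], True, False) == [4, 8]
assert matmul_out_shape([2, 3, 16], [16, 8], False, False) == [2, 3, 8]
```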
51 changes: 50 additions & 1 deletion python/tvm/relay/op/nn/nn.py
@@ -1471,7 +1471,46 @@ def bias_add(data, bias, axis=1):
return _make.bias_add(data, bias, axis)


def dense(data, weight, units=None, out_dtype=""):
def matmul(data, weight, units=None, out_dtype="", data_transposed=False, weight_transposed=False):
"""Dense operator.
jcf94 marked this conversation as resolved.
Show resolved Hide resolved
Applies a linear transformation. The X & W can be transposed.

.. math::

`Y = X * W`

Parameters
----------
data : tvm.relay.Expr
The input data to the operator,
of shape `(d_1, d_2, ..., d_n, units_in)` or `(d_1, d_2, ..., units_in, d_n)`.
Contributor: Shouldn't both input shapes be dimension 2?

Contributor Author: No, the input of matmul is supposed to be a multi-dimensional tensor (not limited to 2-D); this is copied from the original nn.dense. Other frameworks like PyTorch have a similar definition.

Contributor: Can you update the definition of the computation above to reflect these shapes, then?


+    weight : tvm.relay.Expr
+        The weight expression, a 2-D matrix,
+        of shape `(units_in, units)` or `(units, units_in)`.
+
+    units : int, optional
+        Number of hidden units of the matmul transformation.
+
+    out_dtype : str, optional
+        Specifies the output data type for mixed precision matmul;
+        the result has shape `(d_1, d_2, ..., d_n, units)`.
+
+    data_transposed : bool, optional
+        Whether the data tensor is in transposed format.
+
+    weight_transposed : bool, optional
+        Whether the weight tensor is in transposed format.
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The computed result.
+    """
+    return _make.matmul(data, weight, units, out_dtype, data_transposed, weight_transposed)


def dense(data, weight, units=None, out_dtype="", data_transposed=False, weight_transposed=True):
jcf94 marked this conversation as resolved.
Show resolved Hide resolved
"""Dense operator.
Applies a linear transformation

@@ -1496,11 +1535,21 @@
Specifies the output data type for mixed precision dense,
of shape `(d_1, d_2, ..., d_n, units)`.

+    data_transposed : bool, optional
+        Whether the data tensor is in transposed format. Expected to be False.
+
+    weight_transposed : bool, optional
+        Whether the weight tensor is in transposed format. Expected to be True.

Returns
-------
result : tvm.relay.Expr
The computed result.
"""
+    # `data_transposed` & `weight_transposed` are accepted (and checked) because some
+    # APIs need to apply MatmulAttrs-style attrs to this function.
+    assert not data_transposed
+    assert weight_transposed
return _make.dense(data, weight, units, out_dtype)


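As a quick sketch of the N-d behavior discussed above (shapes are illustrative; only the weight must be 2-D):

```python
import tvm
from tvm import relay

data = relay.var("data", shape=(2, 3, 16), dtype="float32")   # (d_1, d_2, units_in)
weight = relay.var("weight", shape=(16, 8), dtype="float32")  # (units_in, units)

# Both transpose flags default to False.
out = relay.nn.matmul(data, weight, units=8)
mod = relay.transform.InferType()(tvm.IRModule.from_expr(relay.Function([data, weight], out)))
print(mod["main"].ret_type)  # expected: Tensor[(2, 3, 8), float32]
```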
6 changes: 3 additions & 3 deletions python/tvm/relay/op/op_attrs.py
@@ -64,9 +64,9 @@ class BiasAddAttrs(Attrs):
"""Atttribute of nn.bias_add"""


@tvm._ffi.register_object("relay.attrs.DenseAttrs")
class DenseAttrs(Attrs):
"""Attributes for nn.dense"""
@tvm._ffi.register_object("relay.attrs.MatmulAttrs")
class MatmulAttrs(Attrs):
"""Attributes for nn.matmul and nn.dense"""


@tvm._ffi.register_object("relay.attrs.SoftmaxAttrs")
20 changes: 20 additions & 0 deletions python/tvm/relay/op/strategy/cuda.py
@@ -698,6 +698,26 @@ def conv1d_transpose_strategy_cuda(attrs, inputs, out_type, target):
return strategy


+@matmul_strategy.register(["cuda", "gpu"])
+def matmul_strategy_cuda(attrs, inputs, out_type, target):
+    """matmul cuda strategy"""
+    strategy = _op.OpStrategy()
+    if target.kind.name == "cuda" and "cublas" in target.libs:
+        strategy.add_implementation(
+            wrap_compute_matmul(topi.cuda.matmul_cublas),
+            wrap_topi_schedule(topi.cuda.schedule_matmul_cublas),
+            name="matmul_cublas.cuda",
+            plevel=25,
+        )
+    if is_auto_scheduler_enabled():
+        strategy.add_implementation(
+            wrap_compute_matmul(topi.nn.matmul),
+            naive_schedule,
+            name="matmul.cuda",
+        )
+    return strategy


@dense_strategy.register(["cuda", "gpu"])
def dense_strategy_cuda(attrs, inputs, out_type, target):
"""dense cuda strategy"""
36 changes: 36 additions & 0 deletions python/tvm/relay/op/strategy/generic.py
@@ -712,6 +712,42 @@ def dilation2d_strategy(attrs, inputs, out_type, target):
return strategy


+# matmul
+def wrap_compute_matmul(topi_compute, need_auto_scheduler_layout=False):
+    """wrap matmul topi compute"""
+
+    def _compute_matmul(attrs, inputs, out_type):
+        """Compute definition of matmul"""
+        out_dtype = attrs.out_dtype
+        out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype
+        args = [
+            inputs[0],
+            inputs[1],
+            None,
+            out_dtype,
+            attrs.data_transposed,
+            attrs.weight_transposed,
+        ]
+        if need_auto_scheduler_layout:
+            args.append(get_auto_scheduler_rewritten_layout(attrs))
+        return [topi_compute(*args)]
+
+    return _compute_matmul
+
+
+@override_native_generic_func("matmul_strategy")
+def matmul_strategy(attrs, inputs, out_type, target):
+    """matmul generic strategy"""
+    logger.warning("matmul is not optimized for this platform.")
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_matmul(topi.nn.matmul),
+        wrap_topi_schedule(topi.generic.schedule_matmul),
+        name="matmul.generic",
+    )
+    return strategy


# dense
def wrap_compute_dense(topi_compute, need_auto_scheduler_layout=False):
"""wrap dense topi compute"""
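For comparison, a target-specific override would follow the same registration pattern as the CUDA strategy shown earlier; a hypothetical sketch only (no CPU matmul strategy is added in this PR):

```python
@matmul_strategy.register("cpu")
def matmul_strategy_cpu(attrs, inputs, out_type, target):
    """matmul CPU strategy (hypothetical sketch)"""
    strategy = _op.OpStrategy()
    strategy.add_implementation(
        wrap_compute_matmul(topi.nn.matmul),
        naive_schedule,  # assumes naive_schedule is in scope, as in the CUDA strategy file
        name="matmul.cpu",
    )
    return strategy
```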
44 changes: 39 additions & 5 deletions python/tvm/topi/cuda/dense.py
@@ -30,18 +30,24 @@
logger = logging.getLogger("topi")


@autotvm.register_topi_compute("dense_cublas.cuda")
def dense_cublas(cfg, data, weight, bias=None, out_dtype=None):
"""Dense operator on CUDA with CUBLAS"""
assert len(data.shape) == 2 and len(weight.shape) == 2, "only support 2-dim dense"
def _matmul_cublas_common(
cfg,
data,
weight,
bias=None,
out_dtype=None,
data_transposed=False,
weight_transposed=False,
):
assert len(data.shape) == 2 and len(weight.shape) == 2, "only support 2-dim matmul"
if bias is not None:
assert len(bias.shape) == 1
if out_dtype is None:
out_dtype = data.dtype
assert out_dtype == data.dtype, "Mixed precision not supported."
batch, in_dim = get_const_tuple(data.shape)
out_dim, _ = get_const_tuple(weight.shape)
matmul = cublas.matmul(data, weight, False, True)
matmul = cublas.matmul(data, weight, data_transposed, weight_transposed)
if all(isinstance(d, int) for d in [batch, in_dim, out_dim]):
cfg.add_flop(batch * in_dim * out_dim * 2)
if bias is not None:
@@ -51,6 +57,34 @@
     return matmul


@autotvm.register_topi_compute("matmul_cublas.cuda")
def matmul_cublas(
cfg,
data,
weight,
bias=None,
out_dtype=None,
data_transposed=False,
weight_transposed=False,
):
"""Matmul operator on CUDA with CUBLAS"""
return _matmul_cublas_common(
cfg, data, weight, bias, out_dtype, data_transposed, weight_transposed
)


@autotvm.register_topi_schedule("matmul_cublas.cuda")
def schedule_matmul_cublas(_, outs):
"""Schedule matmul operator using CUBLAS"""
return generic.schedule_extern(outs)


@autotvm.register_topi_compute("dense_cublas.cuda")
def dense_cublas(cfg, data, weight, bias=None, out_dtype=None):
"""Dense operator on CUDA with CUBLAS"""
jcf94 marked this conversation as resolved.
Show resolved Hide resolved
return _matmul_cublas_common(cfg, data, weight, bias, out_dtype, False, True)


@autotvm.register_topi_schedule("dense_cublas.cuda")
def schedule_dense_cublas(_, outs):
"""Schedule dense operator using CUBLAS"""
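Finally, the underlying contrib call that `_matmul_cublas_common` dispatches to can be exercised directly; a minimal sketch (assumes TVM is built with CUDA and cuBLAS enabled; shapes are made up):

```python
import tvm
from tvm import te
from tvm.contrib import cublas

# C = A * B^T: the nn.dense layout (data non-transposed, weight transposed).
A = te.placeholder((4, 16), name="A", dtype="float32")
B = te.placeholder((8, 16), name="B", dtype="float32")
C = cublas.matmul(A, B, transa=False, transb=True)
s = te.create_schedule(C.op)  # extern op: nothing to schedule, as in schedule_extern
```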