Dense strategy (apache#5)
* dense

* add biforst; remove comments

* address comment
comaniac authored and icemelon committed Jan 9, 2020
1 parent eadab0d commit efbeb19
Showing 11 changed files with 159 additions and 314 deletions.
32 changes: 32 additions & 0 deletions python/tvm/relay/op/strategy/bifrost.py
@@ -0,0 +1,32 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Definition of biforst operator strategy."""
# pylint: disable=invalid-name,unused-argument

from __future__ import absolute_import

import topi
from .generic import *
from .. import op as _op


@dense_strategy.register(["bifrost"])
def dense_strategy_bifrost(attrs, inputs, out_type, target):
    """dense bifrost strategy"""
    strategy = _op.OpStrategy()
    strategy.add_implement(wrap_compute_dense(topi.bifrost.dense),
                           wrap_topi_schedule(topi.bifrost.schedule_dense))
    return strategy
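
For orientation: the wrap_compute_dense and wrap_topi_schedule helpers that these strategy files pull in from .generic adapt plain TOPI functions to the strategy compute/schedule interfaces. The sketch below is an approximation of what such adapters look like, not the actual code in generic.py; treat every name and signature in it as an assumption.

def wrap_compute_dense(topi_compute):
    """Adapt a TOPI dense compute to the strategy compute signature (sketch only)."""
    def _compute_dense(attrs, inputs, out_type):
        # Fall back to the input dtype when no explicit out_dtype is set.
        out_dtype = attrs.out_dtype or inputs[0].dtype
        return [topi_compute(inputs[0], inputs[1], None, out_dtype)]
    return _compute_dense

def wrap_topi_schedule(topi_schedule):
    """Adapt a TOPI schedule to the strategy schedule signature (sketch only)."""
    def _schedule(attrs, outs, target):
        with target:
            return topi_schedule(outs)
    return _schedule
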
9 changes: 4 additions & 5 deletions python/tvm/relay/op/strategy/cuda.py
@@ -122,21 +122,20 @@ def conv1d_transpose_strategy_cuda(attrs, inputs, out_type, target):
@dense_strategy.register(["cuda", "gpu"])
def dense_strategy_cuda(attrs, inputs, out_type, target):
"""dense cuda strategy"""
# Todo(@icemelon9): update dense strategy
strategy = _op.OpStrategy()
if out_type.dtype == "int8":
strategy.add_implement(wrap_compute_dense(topi.cuda.dense_int8),
wrap_topi_schedule(topi.cuda.schedule_dense_int8))
else:
strategy.add_implement(wrap_compute_dense(topi.nn.dense),
strategy.add_implement(wrap_compute_dense(topi.cuda.dense_small_batch),
wrap_topi_schedule(topi.cuda.schedule_dense_small_batch))
b = inputs[0].shape[0]
with SpecializedCondition(b >= 32):
strategy.add_implement(wrap_compute_dense(topi.nn.dense),
strategy.add_implement(wrap_compute_dense(topi.cuda.dense_large_batch),
wrap_topi_schedule(topi.cuda.schedule_dense_large_batch))
if target.target_name == "cuda" and "cublas" in target.libs:
strategy.add_implement(wrap_compute_dense(topi.cuda.dense_cblas),
wrap_topi_schedule(topi.generic.schedule_extern), 5)
strategy.add_implement(wrap_compute_dense(topi.cuda.dense_cublas),
wrap_topi_schedule(topi.cuda.schedule_dense_cublas), 5)
return strategy

@batch_matmul_strategy.register(["cuda", "gpu"])
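
Putting the added lines above together, the CUDA dense strategy now dispatches on dtype and batch size, and prefers cuBLAS when the library is enabled in the target. The following is a reconstructed view of the resulting function for readability; the block nesting and indentation are inferred from the diff, so treat it as a reading aid rather than the exact committed text.

@dense_strategy.register(["cuda", "gpu"])
def dense_strategy_cuda(attrs, inputs, out_type, target):
    """dense cuda strategy"""
    strategy = _op.OpStrategy()
    if out_type.dtype == "int8":
        strategy.add_implement(wrap_compute_dense(topi.cuda.dense_int8),
                               wrap_topi_schedule(topi.cuda.schedule_dense_int8))
    else:
        strategy.add_implement(wrap_compute_dense(topi.cuda.dense_small_batch),
                               wrap_topi_schedule(topi.cuda.schedule_dense_small_batch))
        b = inputs[0].shape[0]
        with SpecializedCondition(b >= 32):
            strategy.add_implement(wrap_compute_dense(topi.cuda.dense_large_batch),
                                   wrap_topi_schedule(topi.cuda.schedule_dense_large_batch))
    if target.target_name == "cuda" and "cublas" in target.libs:
        strategy.add_implement(wrap_compute_dense(topi.cuda.dense_cublas),
                               wrap_topi_schedule(topi.cuda.schedule_dense_cublas), 5)
    return strategy
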
14 changes: 14 additions & 0 deletions python/tvm/relay/op/strategy/mali.py
@@ -16,3 +16,17 @@
# under the License.
"""Definition of mali operator strategy."""
# pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import

from __future__ import absolute_import

import topi
from .generic import *
from .. import op as _op

@dense_strategy.register(["mali"])
def dense_strategy_mali(attrs, inputs, out_type, target):
    """dense mali strategy"""
    strategy = _op.OpStrategy()
    strategy.add_implement(wrap_compute_dense(topi.mali.dense),
                           wrap_topi_schedule(topi.mali.schedule_dense))
    return strategy
15 changes: 15 additions & 0 deletions python/tvm/relay/op/strategy/rocm.py
@@ -32,3 +32,18 @@ def schedule_l2_normalize_rocm(attrs, outs, target):
    """schedule L2 normalize for rocm"""
    with target:
        return topi.rocm.schedule_l2_normalize(outs)

@dense_strategy.register(["rocm"])
def dense_strategy_rocm(attrs, inputs, out_type, target):
    """Dense strategy for ROCm"""
    strategy = _op.OpStrategy()
    assert len(inputs[0].shape) == 2 and len(inputs[1].shape) == 2, "Only support 2-dim dense"

    strategy.add_implement(wrap_compute_dense(topi.rocm.dense),
                           wrap_topi_schedule(topi.rocm.schedule_dense))
    if target.target_name == "rocm" and "rocblas" in target.libs:
        assert out_type.dtype == inputs[0].dtype, "Mixed precision not supported."
        strategy.add_implement(
            wrap_compute_dense(topi.rocm.dense_cblas),
            wrap_topi_schedule(topi.generic.schedule_extern), 5)
    return strategy
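
The trailing 5 passed to add_implement is a priority: among applicable implementations, the highest-priority one is chosen, so the rocBLAS extern wins over the plain TOPI schedule whenever the library is enabled in the target. A hypothetical usage sketch follows; it is not part of this patch, and the exact API spellings on this development branch may differ.

# Hypothetical usage sketch (not part of this commit).
import tvm
from tvm import relay

data = relay.var("data", shape=(16, 64))
weight = relay.var("weight", shape=(32, 64))
func = relay.Function([data, weight], relay.nn.dense(data, weight))

# With "-libs=rocblas" the extern implementation registered above (priority 5)
# is preferred; without it the plain topi.rocm dense schedule is used.
graph, lib, params = relay.build(func, target="rocm -libs=rocblas")
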
11 changes: 7 additions & 4 deletions topi/python/topi/bifrost/dense.py
@@ -15,19 +15,22 @@
# specific language governing permissions and limitations
# under the License.
# pylint: disable=invalid-name,unused-variable
"""dense schedule on ARM Mali GPU"""
"""dense schedule on ARM Mali Biforst GPU"""

from __future__ import absolute_import as _abs

import tvm
from tvm import autotvm

from .. import generic, nn
from .. import nn
from ..util import traverse_inline

autotvm.register_topi_compute(nn.dense, 'bifrost', 'direct', nn.dense.fdefault)
@autotvm.register_topi_compute2('dense_bifrost')
def dense(_, data, weight, bias=None, out_dtype=None):
    """Dense operator on Bifrost"""
    return nn.dense(data, weight, bias, out_dtype)

@autotvm.register_topi_schedule(generic.schedule_dense, 'bifrost', 'direct')
@autotvm.register_topi_schedule2('dense_bifrost')
def schedule_dense(cfg, outs):
"""Schedule for dense operator.
2 changes: 1 addition & 1 deletion topi/python/topi/cuda/__init__.py
@@ -31,7 +31,7 @@
from .reduction import schedule_reduce
from .softmax import schedule_softmax
from .injective import schedule_injective, schedule_elemwise, schedule_broadcast
from .dense import schedule_dense
from .dense import *
from .pooling import *
from .nn import schedule_lrn, schedule_l2_normalize
from .batch_matmul import *
114 changes: 20 additions & 94 deletions topi/python/topi/cuda/dense.py
@@ -23,54 +23,16 @@
from tvm.autotvm.task.space import SplitEntity
from tvm.contrib import cublas
from .tensor_intrin import dp4a
from ..nn.dense import dense, dense_default
from .. import nn
from .. import tag
from .. import generic
from ..util import traverse_inline, get_const_tuple

logger = logging.getLogger('topi')


#@autotvm.register_topi_compute(dense, ["cuda", "gpu"], "direct")
def dense_cuda(cfg, data, weight, bias=None, out_dtype=None):
"""Dense operator for cuda backend.
Parameters
----------
data : tvm.Tensor
2-D with shape [batch, in_dim]
weight : tvm.Tensor
2-D with shape [out_dim, in_dim]
bias : tvm.Tensor, optional
1-D with shape [out_dim]
Returns
-------
output : tvm.Tensor
2-D with shape [batch, out_dim]
"""
# pylint: disable=unused-argument
assert len(data.shape) == 2 and len(weight.shape) == 2, \
"only support 2-dim dense"
if bias is not None:
assert len(bias.shape) == 1
if out_dtype is None:
out_dtype = data.dtype
batch, in_dim = data.shape
out_dim, _ = weight.shape
target = tvm.target.current_target()
if "cublas" in target.libs:
matmul = cublas.matmul(data, weight, False, True, out_dtype)
if bias is not None:
matmul = tvm.compute((batch, out_dim), \
lambda i, j: matmul[i, j] + bias[j], \
tag=tag.BROADCAST)
return matmul
return dense_default(data, weight, bias, out_dtype)

@autotvm.register_topi_compute2("dense_cublas.cuda")
def dense_cublas(data, weight, bias=None, out_dtype=None):
"""Dense operator on CUDA with CUBLAS"""
assert len(data.shape) == 2 and len(weight.shape) == 2, \
"only support 2-dim dense"
if bias is not None:
@@ -87,61 +49,20 @@ def dense_cublas(data, weight, bias=None, out_dtype=None):
tag=tag.BROADCAST)
return matmul

@autotvm.register_topi_schedule(generic.schedule_dense, ["cuda", "gpu"], "direct")
def schedule_dense(cfg, outs):
"""Schedule for dense operator.
Parameters
----------
outs: Array of Tensor
The computation graph description of dense
in the format of an array of tensors.
Returns
-------
s: Schedule
The computation schedule for dense.
"""
# pylint: disable=unused-argument
target = tvm.target.current_target()

outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
if target.target_name == "cuda" and "cublas" in target.libs:
return generic.schedule_extern(outs)

s = tvm.create_schedule([x.op for x in outs])

def _schedule(C):
A, _ = C.op.input_tensors
batch, _ = get_const_tuple(A.shape)
if batch < 32:
return schedule_dense_small_batch(cfg, s, C)
return schedule_dense_large_batch(cfg, s, C)

scheduled_ops = []

def traverse(OP):
"""Internal traverse function"""
# inline all one-to-one-mapping operators except the last stage (output)
if tag.is_broadcast(OP.tag):
if OP not in s.outputs:
s[OP].compute_inline()
for tensor in OP.input_tensors:
if isinstance(tensor.op, tvm.tensor.ComputeOp) and tensor.op not in scheduled_ops:
traverse(tensor.op)
# schedule dense
elif OP.tag == 'dense':
Dense = OP.output(0)
_schedule(Dense)
else:
raise RuntimeError("Unsupported operator: %s" % OP.tag)
@autotvm.register_topi_schedule2("dense_cublas.cuda")
def schedule_dense_cublas(_, outs):
"""Schedule dense operator using CUBLAS"""
return generic.schedule_extern(outs)

scheduled_ops.append(OP)

traverse(outs[0].op)
return s
@autotvm.register_topi_compute2("dense_small_batch.cuda")
def dense_small_batch(_, data, weight, bias=None, out_dtype=None):
"""Dense operator on CUDA"""
return nn.dense(data, weight, bias, out_dtype)


@autotvm.register_topi_schedule2("dense_small_batch.cuda")
def schedule_dense_small_batch(cfg, s, C):
"""Schedule float32/64 dense with small batch size"""
A, _ = C.op.input_tensors
@@ -168,6 +89,14 @@ def schedule_dense_small_batch(cfg, s, C):
s[C].set_store_predicate(thread_x.var.equal(0))
s[Out].set_store_predicate(thread_x.var.equal(0))


@autotvm.register_topi_compute2("dense_large_batch.cuda")
def dense_large_batch(_, data, weight, bias=None, out_dtype=None):
"""Dense operator on CUDA"""
return nn.dense(data, weight, bias, out_dtype)


@autotvm.register_topi_schedule2("dense_large_batch.cuda")
def schedule_dense_large_batch(cfg, s, C):
"""Schedule float32/64 dense with large batch size"""
A, B = C.op.input_tensors
@@ -266,8 +195,6 @@ def schedule_dense_large_batch(cfg, s, C):
s[BB].bind(tx, tvm.thread_axis("threadIdx.x"))
s[BB].double_buffer()

#@autotvm.register_topi_compute(dense, ['cuda'], ['int8'])

@autotvm.register_topi_compute2("dense_int8.cuda")
def dense_int8(cfg, data, weight, bias=None, out_dtype=None):
"""Dense operator for int8 on CUDA"""
@@ -304,7 +231,6 @@ def dense_int8(cfg, data, weight, bias=None, out_dtype=None):
return matmul


#@autotvm.register_topi_schedule(generic.schedule_dense, ['cuda', 'gpu'], ['int8'])
@autotvm.register_topi_schedule2("dense_int8.cuda")
def schedule_dense_int8(cfg, outs):
"""Dense schedule for int8 on CUDA"""
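
The hand-rolled traverse helper removed above is replaced by topi.util.traverse_inline in the new schedules. Roughly, that utility inlines injective stages and hands each remaining operation to a callback. The sketch below approximates its behaviour and is not the actual implementation.

def traverse_inline(s, final_op, callback):
    """Approximate behaviour of topi.util.traverse_inline (sketch only)."""
    visited = set()

    def _traverse(op):
        if op in visited:
            return
        visited.add(op)
        if tag.is_injective(op.tag):
            # Inline elementwise/broadcast stages that are not outputs.
            if op not in s.outputs:
                s[op].compute_inline()
            for tensor in op.input_tensors:
                if isinstance(tensor.op, tvm.tensor.ComputeOp):
                    _traverse(tensor.op)
        callback(op)

    _traverse(final_op)
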
38 changes: 23 additions & 15 deletions topi/python/topi/mali/dense.py
@@ -22,12 +22,18 @@
import tvm
from tvm import autotvm

from .. import generic, nn
from .. import nn
from ..util import traverse_inline

autotvm.register_topi_compute(nn.dense, 'mali', 'direct', nn.dense.fdefault)

@autotvm.register_topi_schedule(generic.schedule_dense, 'mali', 'direct')

@autotvm.register_topi_compute2('dense.mali')
def dense(_, data, weight, bias=None, out_dtype=None):
"""Dense operator on Mali"""
return nn.dense(data, weight, bias, out_dtype)


@autotvm.register_topi_schedule2('dense.mali')
def schedule_dense(cfg, outs):
"""Schedule for dense operator.
@@ -52,11 +58,11 @@ def _callback(op):
vec_size = [1, 2, 4, 8, 16]
max_unroll = 32

dense = op.output(0)
dense_out = op.output(0)
output = outs[0]

y, x = s[output].op.axis
c = s[dense].op.reduce_axis[0]
c = s[dense_out].op.reduce_axis[0]

##### space definition begin #####
cfg.define_split('tile_y', y, num_outputs=3)
@@ -70,8 +76,8 @@ def _callback(op):
cfg.fallback_with_reference_log(ref_log)
##### space definition end #####

if dense.op in s.outputs:
dense = s.cache_write(output, 'local')
if dense_out.op in s.outputs:
dense_out = s.cache_write(output, 'local')

by, ty, yi = cfg['tile_y'].apply(s, output, y)
bx, tx, xi = cfg['tile_x'].apply(s, output, x)
@@ -85,23 +91,25 @@ def _callback(op):
s[output].unroll(yi)
if cfg['tile_x'].size[-1] in vec_size:
s[output].vectorize(xi)
s[dense].compute_at(s[output], tx)
s[dense_out].compute_at(s[output], tx)

k = s[dense].op.reduce_axis[0]
y, x = s[dense].op.axis
k, k_unroll = cfg['c_unroll'].apply(s, dense, k)
s[dense].reorder(k, k_unroll, y, x)
s[dense].unroll(k_unroll)
k = s[dense_out].op.reduce_axis[0]
y, x = s[dense_out].op.axis
k, k_unroll = cfg['c_unroll'].apply(s, dense_out, k)
s[dense_out].reorder(k, k_unroll, y, x)
s[dense_out].unroll(k_unroll)
if cfg['tile_y'].size[-1] < max_unroll:
s[dense].unroll(y)
s[dense_out].unroll(y)
if cfg['tile_x'].size[-1] in vec_size:
s[dense].vectorize(x)
s[dense_out].vectorize(x)

traverse_inline(s, outs[0].op, _callback)
return s


def fuse_and_bind(s, tensor, axis=None, num_thread=None):
""" fuse all the axis and bind to GPU threads """
# TODO(@comaniac): figure out where this function is used.
axis = axis or s[tensor].op.axis
fused = s[tensor].fuse(*axis)
bx, tx = s[tensor].split(fused, num_thread)