diff --git a/python/tvm/contrib/cutlass/gen_conv2d.py b/python/tvm/contrib/cutlass/gen_conv2d.py
index b51afdc8b586..bb26a47a5548 100644
--- a/python/tvm/contrib/cutlass/gen_conv2d.py
+++ b/python/tvm/contrib/cutlass/gen_conv2d.py
@@ -22,6 +22,7 @@
 from .conv2d_profiler import Conv2dProfilerEmitter
 from .gen_tensor_op import ProfilerEngine, GENERATOR_FUNC_TABLE, EPILOGUE_MAP
 from .library import (
+    DataType,
     EpilogueFunctor,
     SwizzlingFunctor,
     TensorDescription,
@@ -133,6 +134,10 @@ def enumerate_conv2d_operators(
             B = TensorDescription(element_b, LayoutType.TensorNHWC, alignment)
             C = TensorDescription(element_c, LayoutType.TensorNHWC, alignment)
 
+            if element_c == DataType.s32 and A.alignment == 1:
+                tile.threadblock_shape[0] = min(tile.threadblock_shape[0], 128)
+                tile.threadblock_shape[1] = min(tile.threadblock_shape[1], 128)
+
             op = Conv2dOperation(
                 conv_kind,
                 IteratorAlgorithm.Optimized,
diff --git a/python/tvm/contrib/cutlass/gen_gemm.py b/python/tvm/contrib/cutlass/gen_gemm.py
index f05969381907..f55f4f76222b 100644
--- a/python/tvm/contrib/cutlass/gen_gemm.py
+++ b/python/tvm/contrib/cutlass/gen_gemm.py
@@ -20,6 +20,7 @@
 from .gemm_profiler import GemmProfilerEmitter
 from .gen_tensor_op import ProfilerEngine, GENERATOR_FUNC_TABLE, EPILOGUE_MAP
 from .library import (
+    DataType,
     EpilogueFunctor,
     SwizzlingFunctor,
     TensorDescription,
@@ -87,6 +88,14 @@ def enumerate_gemm_operators(
             B = TensorDescription(element_b, LayoutType.ColumnMajor, alignment)
             C = TensorDescription(element_c, LayoutType.RowMajor, alignment)
 
+            if element_c == DataType.s32 and A.alignment == 1:
+                tile_description.threadblock_shape[0] = min(
+                    tile_description.threadblock_shape[0], 128
+                )
+                tile_description.threadblock_shape[1] = min(
+                    tile_description.threadblock_shape[1], 128
+                )
+
             op = GemmOperation(
                 tile_description.minimum_compute_capability,
                 tile_description,
diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py
index ec0d6e3a903e..08da62e640e1 100644
--- a/python/tvm/relay/op/strategy/cuda.py
+++ b/python/tvm/relay/op/strategy/cuda.py
@@ -836,7 +836,7 @@ def dense_strategy_cuda(attrs, inputs, out_type, target):
     b, i = get_const_tuple(data.shape)
     o, _ = get_const_tuple(weights.shape)
     if (
-        target.kind.name == "cuda"
+        target.kind.name in ["cuda", "vulkan"]
         and data.dtype == "int8"
         and weights.dtype == "int8"
         and out_type.dtype == "int32"
@@ -860,36 +860,28 @@ def dense_strategy_cuda(attrs, inputs, out_type, target):
                 name="dense_large_batch.gpu",
                 plevel=5,
             )
-        if target.kind.name == "cuda":
-            if nvcc.have_tensorcore(target=target):
-                if (
-                    (
-                        data.dtype in ["float16", "int8", "uint8"]
-                        and (
-                            (i % 16 == 0 and b % 16 == 0 and o % 16 == 0)
-                            or (i % 16 == 0 and b % 8 == 0 and o % 32 == 0)
-                            or (i % 16 == 0 and b % 32 == 0 and o % 8 == 0)
-                        )
-                    )
-                    or (
-                        data.dtype in ["int4", "uint4"]
-                        and i % 32 == 0
-                        and b % 8 == 0
-                        and o % 8 == 0
-                    )
-                    or (
-                        data.dtype in ["int1", "uint1"]
-                        and i % 128 == 0
-                        and b % 8 == 0
-                        and o % 8 == 0
-                    )
-                ):
-                    strategy.add_implementation(
-                        wrap_compute_dense(topi.cuda.dense_tensorcore),
-                        wrap_topi_schedule(topi.cuda.schedule_dense_tensorcore),
-                        name="dense_tensorcore.cuda",
-                        plevel=20,
+
+    if target.kind.name == "cuda":
+        if nvcc.have_tensorcore(target=target):
+            if (
+                (
+                    data.dtype in ["float16", "int8", "uint8"]
+                    and (
+                        (i % 16 == 0 and b % 16 == 0 and o % 16 == 0)
+                        or (i % 16 == 0 and b % 8 == 0 and o % 32 == 0)
+                        or (i % 16 == 0 and b % 32 == 0 and o % 8 == 0)
                     )
+                )
+                or (data.dtype in ["int4", "uint4"] and i % 32 == 0 and b % 8 == 0 and o % 8 == 0)
+                or (data.dtype in ["int1", "uint1"] and i % 128 == 0 and b % 8 == 0 and o % 8 == 0)
+            ):
+                strategy.add_implementation(
+                    wrap_compute_dense(topi.cuda.dense_tensorcore),
+                    wrap_topi_schedule(topi.cuda.schedule_dense_tensorcore),
+                    name="dense_tensorcore.cuda",
+                    plevel=20,
+                )
+
     if target.kind.name == "cuda" and "cublas" in target.libs:
         strategy.add_implementation(
             wrap_compute_dense(topi.cuda.dense_cublas),
@@ -927,7 +919,7 @@ def batch_matmul_strategy_cuda(attrs, inputs, out_type, target):
     )
     if target.kind.name == "cuda" and "cublas" in target.libs:
         strategy.add_implementation(
-            wrap_compute_batch_matmul(topi.cuda.batch_matmul_cublas),
+            wrap_compute_batch_matmul(topi.cuda.batch_matmul_cublas, need_out_dtype=True),
             wrap_topi_schedule(topi.generic.schedule_extern),
             name="batch_matmul_cublas.cuda",
             plevel=30,
diff --git a/python/tvm/topi/cuda/batch_matmul.py b/python/tvm/topi/cuda/batch_matmul.py
index ede1187a3e35..5fce9d7a3f5d 100644
--- a/python/tvm/topi/cuda/batch_matmul.py
+++ b/python/tvm/topi/cuda/batch_matmul.py
@@ -229,7 +229,7 @@ def batch_matmul_cublas(
     b, k, n = get_const_tuple(y.shape)
     if all([isinstance(s, int) for s in [b, m, n, k]]):
         cfg.add_flop(b * m * k * n * 2)
-    return cublas.batch_matmul(x, y, transa=transpose_a, transb=transpose_b)
+    return cublas.batch_matmul(x, y, transa=transpose_a, transb=transpose_b, dtype=out_dtype)
 
 
 @autotvm.register_topi_schedule("batch_matmul_cublas.cuda")
diff --git a/src/runtime/contrib/cublas/cublas.cc b/src/runtime/contrib/cublas/cublas.cc
index 015d68aec819..b13f9e858d66 100644
--- a/src/runtime/contrib/cublas/cublas.cc
+++ b/src/runtime/contrib/cublas/cublas.cc
@@ -290,7 +290,7 @@ inline void CallBatchGemmEx(TVMArgs args, TVMRetValue* ret, cublasHandle_t hdl)
   transa = IsInPlaceTransposed(A) ? !transa : transa;
   transb = IsInPlaceTransposed(B) ? !transb : transb;
 
-  ICHECK(CheckMixPrecisionType(A->dtype, C->dtype, false)) << "Unsupported data type";
+  ICHECK(CheckMixPrecisionType(A->dtype, C->dtype, true)) << "Unsupported data type";
   ICHECK(!TypeMatch(A->dtype, kDLInt, 8) || ColumnStride(A) % 4 == 0)
       << "leading dimension must divide 4 for int8 gemm";
   ICHECK(!TypeMatch(B->dtype, kDLInt, 8) || ColumnStride(B) % 4 == 0)
diff --git a/tests/python/contrib/test_cublas.py b/tests/python/contrib/test_cublas.py
index 648100a569d7..210e6877c926 100644
--- a/tests/python/contrib/test_cublas.py
+++ b/tests/python/contrib/test_cublas.py
@@ -120,8 +120,14 @@ def verify_batch_matmul(Ashape, Bshape, Cshape, in_dtype, out_dtype, rtol=1e-5):
 
     dev = tvm.cuda(0)
     f = tvm.build(s, [A, B, C], "cuda")
-    a = tvm.nd.array(np.random.uniform(size=Ashape).astype(A.dtype), dev)
-    b = tvm.nd.array(np.random.uniform(size=Bshape).astype(B.dtype), dev)
+
+    if "int" in in_dtype:
+        a = tvm.nd.array(np.random.uniform(1, 10, size=Ashape).astype(in_dtype), dev)
+        b = tvm.nd.array(np.random.uniform(1, 10, size=Bshape).astype(in_dtype), dev)
+    else:
+        a = tvm.nd.array(np.random.uniform(size=Ashape).astype(A.dtype), dev)
+        b = tvm.nd.array(np.random.uniform(size=Bshape).astype(B.dtype), dev)
+
     c = tvm.nd.array(np.zeros(Cshape, dtype=C.dtype), dev)
     f(a, b, c)
     tvm.testing.assert_allclose(
@@ -161,6 +167,8 @@ def test_batch_matmul():
         (16, 1024, 128), (1, 128, 236), (16, 1024, 236), "float16", "float16", rtol=1e-2
     )
 
+    verify_batch_matmul((16, 1024, 128), (16, 128, 236), (16, 1024, 236), "int8", "int32")
+
 
 if __name__ == "__main__":
     test_matmul_add()
diff --git a/tests/python/contrib/test_cutlass.py b/tests/python/contrib/test_cutlass.py
index ad75e73b26fc..c10597940221 100644
--- a/tests/python/contrib/test_cutlass.py
+++ b/tests/python/contrib/test_cutlass.py
@@ -725,6 +725,26 @@ def test_conv2d():
         ref_target="llvm",
     )
 
+    # align1 + int8 case
+    d_shape = (16, 3, 32, 32)
+    w_shape = (32, 3, 3, 3)
+    mod_nchw = get_conv2d_nchw(
+        d_shape, w_shape, padding, out_dtype="int32", data_dtype="uint8", weight_dtype="int8"
+    )
+
+    verify_conv2d(
+        mod_nchw,
+        mod_nchw,
+        d_shape,
+        w_shape,
+        sm=80,
+        atol=1e-5,
+        rtol=1e-5,
+        ref_target="llvm",
+        data_dtype="uint8",
+        weight_dtype="int8",
+    )
+
 
 def test_conv2d_fusion():
     d_shape = (16, 16, 32, 32)