diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
index 807b9fdb84db..f7a12c5d04a7 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -87,6 +87,12 @@ llvm::cl::opt<int> clGPUMatmulCThreshold(
     // TODO: We should get this value from the target's parallelism.
     llvm::cl::init(512 * 512));
 
+/// Flag used to prefer the legacy SIMT pipeline over TileAndFuse for GEMMs.
+llvm::cl::opt<bool> clGPUUseLegacySIMT(
+    "iree-codegen-llvmgpu-use-legacy-simt",
+    llvm::cl::desc("Prefer SIMT pipeline over TileAndFuse pipeline for GEMMs"),
+    llvm::cl::init(false));
+
 static llvm::cl::opt<bool> clLLVMGPUEnablePrefetch(
     "iree-llvmgpu-enable-prefetch",
     llvm::cl::desc("Enable prefetch in the vector distribute pipeline"),
@@ -1049,6 +1055,14 @@ static LogicalResult setContractConfig(IREE::GPU::TargetAttr target,
       }
     }
   }
+  if (!clGPUUseLegacySIMT) {
+    // Try the TileAndFuse matmul pipeline before attempting the SIMT pipeline.
+    if (succeeded(
+            IREE::GPU::setMatmulLoweringConfig(target, entryPoint, op))) {
+      LDBG("Tile and fuse matmul config");
+      return success();
+    }
+  }
   // Special case for very small matrices.
   if (sizeM * sizeN <= target.getPreferredSubgroupSize()) {
     return setMatmulConfig(
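
Usage sketch: the new flag defaults to false, so the TileAndFuse pipeline is tried first; passing it opts back into the legacy SIMT pipeline. The input file, target-backend flag, and output name below are illustrative assumptions, not part of this change:

    # Compile a matmul module, forcing the legacy SIMT GEMM pipeline.
    iree-compile matmul.mlir \
        --iree-hal-target-backends=cuda \
        --iree-codegen-llvmgpu-use-legacy-simt=true \
        -o matmul.vmfb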