diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
index 807b9fdb84db..f7a12c5d04a7 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -87,6 +87,12 @@ llvm::cl::opt<int> clGPUMatmulCThreshold(
     // TODO: We should get this value from the target's parallelism.
     llvm::cl::init(512 * 512));
 
+/// Flag used to prefer the legacy SIMT pipeline over TileAndFuse for GEMMs.
+llvm::cl::opt<bool> clGPUUseLegacySIMT(
+    "iree-codegen-llvmgpu-use-legacy-simt",
+    llvm::cl::desc("Prefer SIMT pipeline over TileAndFuse pipeline for GEMMs"),
+    llvm::cl::init(false));
+
 static llvm::cl::opt<bool> clLLVMGPUEnablePrefetch(
     "iree-llvmgpu-enable-prefetch",
     llvm::cl::desc("Enable prefetch in the vector distribute pipeline"),
@@ -1049,6 +1055,14 @@ static LogicalResult setContractConfig(IREE::GPU::TargetAttr target,
       }
     }
   }
+  if (!clGPUUseLegacySIMT) {
+    // Try the TileAndFuse matmul pipeline before attempting the SIMT pipeline.
+    if (succeeded(
+            IREE::GPU::setMatmulLoweringConfig(target, entryPoint, op))) {
+      LDBG("Tile and fuse matmul config");
+      return success();
+    }
+  }
   // Special case for very small matrices.
   if (sizeM * sizeN <= target.getPreferredSubgroupSize()) {
     return setMatmulConfig(
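
Usage sketch: the new flag defaults to false, so the TileAndFuse pipeline is tried first; passing it opts back into the legacy SIMT pipeline. The input file, target-backend flag, and output name below are illustrative assumptions, not part of this change:

    # Compile a matmul module, forcing the legacy SIMT GEMM pipeline.
    iree-compile matmul.mlir \
        --iree-hal-target-backends=cuda \
        --iree-codegen-llvmgpu-use-legacy-simt=true \
        -o matmul.vmfb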