From 7ba4c98719449a31ba1d32ed8b913f031a83e63f Mon Sep 17 00:00:00 2001
From: danthe3rd
Date: Thu, 2 Feb 2023 12:08:30 +0000
Subject: [PATCH] Only use aligned kernels for Sm80

ghstack-source-id: 5482be804c74b33ce86686b86e617d4e5be4c6a2
Pull Request resolved: https://github.com/fairinternal/xformers/pull/449

__original_commit__ = fairinternal/xformers@14ececa1773732495f42b2680d42b38214b271f2
---
 .../attention/cuda/fmha/kernels/cutlassB.h | 120 ------------------
 .../kernels/cutlassB_f16_notaligned_k128.cu | 19 ---
 .../cutlassB_f16_notaligned_k128_dropout.cu | 19 ---
 .../kernels/cutlassB_f16_notaligned_k32.cu | 19 ---
 .../cutlassB_f16_notaligned_k32_dropout.cu | 19 ---
 .../kernels/cutlassB_f16_notaligned_k64.cu | 19 ---
 .../cutlassB_f16_notaligned_k64_dropout.cu | 19 ---
 .../kernels/cutlassB_f16_notaligned_k65536.cu | 19 ---
 .../cutlassB_f16_notaligned_k65536_dropout.cu | 19 ---
 .../kernels/cutlassB_f32_notaligned_k128.cu | 19 ---
 .../cutlassB_f32_notaligned_k128_dropout.cu | 19 ---
 .../kernels/cutlassB_f32_notaligned_k32.cu | 19 ---
 .../cutlassB_f32_notaligned_k32_dropout.cu | 19 ---
 .../kernels/cutlassB_f32_notaligned_k64.cu | 19 ---
 .../cutlassB_f32_notaligned_k64_dropout.cu | 19 ---
 .../kernels/cutlassB_f32_notaligned_k65536.cu | 19 ---
 .../cutlassB_f32_notaligned_k65536_dropout.cu | 19 ---
 .../attention/cuda/fmha/kernels/cutlassF.h | 45 -------
 .../fmha/kernels/cutlassF_f16_notaligned.cu | 57 ---------
 .../fmha/kernels/cutlassF_f32_notaligned.cu | 57 ---------
 .../cuda/fmha/kernels/generate_kernels.py | 4 +
 21 files changed, 4 insertions(+), 583 deletions(-)

diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB.h b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB.h
index 1818422fd4..6f02f73aa7 100644
--- a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB.h
+++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB.h
@@ -546,38 +546,6 @@ __global__ void __launch_bounds__(
     AttentionBackwardKernel::kNumThreads,
     AttentionBackwardKernel::kMinBlocksPerSm)
 fmha_cutlassB_bf16_aligned_k65536_dropout_sm80(typename AttentionBackwardKernel::Params p);
-__global__ void __launch_bounds__(
-    AttentionBackwardKernel::kNumThreads,
-    AttentionBackwardKernel::kMinBlocksPerSm)
-fmha_cutlassB_bf16_notaligned_k32_sm80(typename AttentionBackwardKernel::Params p);
-__global__ void __launch_bounds__(
-    AttentionBackwardKernel::kNumThreads,
-    AttentionBackwardKernel::kMinBlocksPerSm)
-fmha_cutlassB_bf16_notaligned_k64_sm80(typename AttentionBackwardKernel::Params p);
-__global__ void __launch_bounds__(
-    AttentionBackwardKernel::kNumThreads,
-    AttentionBackwardKernel::kMinBlocksPerSm)
-fmha_cutlassB_bf16_notaligned_k128_sm80(typename AttentionBackwardKernel::Params p);
-__global__ void __launch_bounds__(
-    AttentionBackwardKernel::kNumThreads,
-    AttentionBackwardKernel::kMinBlocksPerSm)
-fmha_cutlassB_bf16_notaligned_k65536_sm80(typename AttentionBackwardKernel::Params p);
-__global__ void __launch_bounds__(
-    AttentionBackwardKernel::kNumThreads,
-    AttentionBackwardKernel::kMinBlocksPerSm)
-fmha_cutlassB_bf16_notaligned_k32_dropout_sm80(typename AttentionBackwardKernel::Params p);
-__global__ void __launch_bounds__(
-    AttentionBackwardKernel::kNumThreads,
-    AttentionBackwardKernel::kMinBlocksPerSm)
-fmha_cutlassB_bf16_notaligned_k64_dropout_sm80(typename AttentionBackwardKernel::Params p);
-__global__ void __launch_bounds__(
-    AttentionBackwardKernel::kNumThreads,
-    AttentionBackwardKernel::kMinBlocksPerSm)
-fmha_cutlassB_bf16_notaligned_k128_dropout_sm80(typename AttentionBackwardKernel::Params p);
-__global__ void __launch_bounds__( - AttentionBackwardKernel::kNumThreads, - AttentionBackwardKernel::kMinBlocksPerSm) -fmha_cutlassB_bf16_notaligned_k65536_dropout_sm80(typename AttentionBackwardKernel::Params p); template void dispatch_cutlassB_bf16_sm80(T cb) { cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_aligned_k32_sm80); @@ -588,14 +556,6 @@ template void dispatch_cutlassB_bf16_sm80(T cb) { cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_aligned_k64_dropout_sm80); cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_aligned_k128_dropout_sm80); cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_aligned_k65536_dropout_sm80); - cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_notaligned_k32_sm80); - cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_notaligned_k64_sm80); - cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_notaligned_k128_sm80); - cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_notaligned_k65536_sm80); - cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_notaligned_k32_dropout_sm80); - cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_notaligned_k64_dropout_sm80); - cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_notaligned_k128_dropout_sm80); - cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_notaligned_k65536_dropout_sm80); } // ======== f16 / sm80 ======== @@ -631,38 +591,6 @@ __global__ void __launch_bounds__( AttentionBackwardKernel::kNumThreads, AttentionBackwardKernel::kMinBlocksPerSm) fmha_cutlassB_f16_aligned_k65536_dropout_sm80(typename AttentionBackwardKernel::Params p); -__global__ void __launch_bounds__( - AttentionBackwardKernel::kNumThreads, - AttentionBackwardKernel::kMinBlocksPerSm) -fmha_cutlassB_f16_notaligned_k32_sm80(typename AttentionBackwardKernel::Params p); -__global__ void __launch_bounds__( - AttentionBackwardKernel::kNumThreads, - AttentionBackwardKernel::kMinBlocksPerSm) -fmha_cutlassB_f16_notaligned_k64_sm80(typename AttentionBackwardKernel::Params p); -__global__ void __launch_bounds__( - AttentionBackwardKernel::kNumThreads, - AttentionBackwardKernel::kMinBlocksPerSm) -fmha_cutlassB_f16_notaligned_k128_sm80(typename AttentionBackwardKernel::Params p); -__global__ void __launch_bounds__( - AttentionBackwardKernel::kNumThreads, - AttentionBackwardKernel::kMinBlocksPerSm) -fmha_cutlassB_f16_notaligned_k65536_sm80(typename AttentionBackwardKernel::Params p); -__global__ void __launch_bounds__( - AttentionBackwardKernel::kNumThreads, - AttentionBackwardKernel::kMinBlocksPerSm) -fmha_cutlassB_f16_notaligned_k32_dropout_sm80(typename AttentionBackwardKernel::Params p); -__global__ void __launch_bounds__( - AttentionBackwardKernel::kNumThreads, - AttentionBackwardKernel::kMinBlocksPerSm) -fmha_cutlassB_f16_notaligned_k64_dropout_sm80(typename AttentionBackwardKernel::Params p); -__global__ void __launch_bounds__( - AttentionBackwardKernel::kNumThreads, - AttentionBackwardKernel::kMinBlocksPerSm) -fmha_cutlassB_f16_notaligned_k128_dropout_sm80(typename AttentionBackwardKernel::Params p); -__global__ void __launch_bounds__( - AttentionBackwardKernel::kNumThreads, - AttentionBackwardKernel::kMinBlocksPerSm) -fmha_cutlassB_f16_notaligned_k65536_dropout_sm80(typename AttentionBackwardKernel::Params p); template void dispatch_cutlassB_f16_sm80(T cb) { cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_k32_sm80); @@ -673,14 +601,6 @@ template void dispatch_cutlassB_f16_sm80(T cb) { cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_k64_dropout_sm80); cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_k128_dropout_sm80); 
cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_k65536_dropout_sm80); - cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_k32_sm80); - cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_k64_sm80); - cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_k128_sm80); - cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_k65536_sm80); - cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_k32_dropout_sm80); - cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_k64_dropout_sm80); - cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_k128_dropout_sm80); - cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_k65536_dropout_sm80); } // ======== f32 / sm80 ======== @@ -716,38 +636,6 @@ __global__ void __launch_bounds__( AttentionBackwardKernel::kNumThreads, AttentionBackwardKernel::kMinBlocksPerSm) fmha_cutlassB_f32_aligned_k65536_dropout_sm80(typename AttentionBackwardKernel::Params p); -__global__ void __launch_bounds__( - AttentionBackwardKernel::kNumThreads, - AttentionBackwardKernel::kMinBlocksPerSm) -fmha_cutlassB_f32_notaligned_k32_sm80(typename AttentionBackwardKernel::Params p); -__global__ void __launch_bounds__( - AttentionBackwardKernel::kNumThreads, - AttentionBackwardKernel::kMinBlocksPerSm) -fmha_cutlassB_f32_notaligned_k64_sm80(typename AttentionBackwardKernel::Params p); -__global__ void __launch_bounds__( - AttentionBackwardKernel::kNumThreads, - AttentionBackwardKernel::kMinBlocksPerSm) -fmha_cutlassB_f32_notaligned_k128_sm80(typename AttentionBackwardKernel::Params p); -__global__ void __launch_bounds__( - AttentionBackwardKernel::kNumThreads, - AttentionBackwardKernel::kMinBlocksPerSm) -fmha_cutlassB_f32_notaligned_k65536_sm80(typename AttentionBackwardKernel::Params p); -__global__ void __launch_bounds__( - AttentionBackwardKernel::kNumThreads, - AttentionBackwardKernel::kMinBlocksPerSm) -fmha_cutlassB_f32_notaligned_k32_dropout_sm80(typename AttentionBackwardKernel::Params p); -__global__ void __launch_bounds__( - AttentionBackwardKernel::kNumThreads, - AttentionBackwardKernel::kMinBlocksPerSm) -fmha_cutlassB_f32_notaligned_k64_dropout_sm80(typename AttentionBackwardKernel::Params p); -__global__ void __launch_bounds__( - AttentionBackwardKernel::kNumThreads, - AttentionBackwardKernel::kMinBlocksPerSm) -fmha_cutlassB_f32_notaligned_k128_dropout_sm80(typename AttentionBackwardKernel::Params p); -__global__ void __launch_bounds__( - AttentionBackwardKernel::kNumThreads, - AttentionBackwardKernel::kMinBlocksPerSm) -fmha_cutlassB_f32_notaligned_k65536_dropout_sm80(typename AttentionBackwardKernel::Params p); template void dispatch_cutlassB_f32_sm80(T cb) { cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_k32_sm80); @@ -758,14 +646,6 @@ template void dispatch_cutlassB_f32_sm80(T cb) { cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_k64_dropout_sm80); cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_k128_dropout_sm80); cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_k65536_dropout_sm80); - cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_k32_sm80); - cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_k64_sm80); - cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_k128_sm80); - cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_k65536_sm80); - cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_k32_dropout_sm80); - cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_k64_dropout_sm80); - cb(AttentionBackwardKernel(), 
fmha_cutlassB_f32_notaligned_k128_dropout_sm80); - cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_k65536_dropout_sm80); } diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k128.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k128.cu index 0972ee122c..740a0b5b1f 100644 --- a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k128.cu +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k128.cu @@ -59,23 +59,4 @@ fmha_cutlassB_f16_notaligned_k128_sm75(typename AttentionBackwardKernel::kNumThreads, - AttentionBackwardKernel::kMinBlocksPerSm) -fmha_cutlassB_f16_notaligned_k128_sm80(typename AttentionBackwardKernel::Params p) { -#ifdef __CUDA_ARCH__ -#if __CUDA_ARCH__ >= 800 -#if __CUDA_ARCH__ < 900 - if (!p.advance_to_block()) { - return; - } - AttentionBackwardKernel::attention_kernel(p); - return; -#endif -#endif - printf( - "FATAL: kernel `fmha_cutlassB_f16_notaligned_k128_sm80` is for sm80-sm90, but was built for sm%d\n", - int(__CUDA_ARCH__ + 0) / 10); -#endif -} #endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k128_dropout.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k128_dropout.cu index 5e2beadf07..0e5a462634 100644 --- a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k128_dropout.cu +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k128_dropout.cu @@ -59,23 +59,4 @@ fmha_cutlassB_f16_notaligned_k128_dropout_sm75(typename AttentionBackwardKernel< int(__CUDA_ARCH__ + 0) / 10); #endif } -__global__ void __launch_bounds__( - AttentionBackwardKernel::kNumThreads, - AttentionBackwardKernel::kMinBlocksPerSm) -fmha_cutlassB_f16_notaligned_k128_dropout_sm80(typename AttentionBackwardKernel::Params p) { -#ifdef __CUDA_ARCH__ -#if __CUDA_ARCH__ >= 800 -#if __CUDA_ARCH__ < 900 - if (!p.advance_to_block()) { - return; - } - AttentionBackwardKernel::attention_kernel(p); - return; -#endif -#endif - printf( - "FATAL: kernel `fmha_cutlassB_f16_notaligned_k128_dropout_sm80` is for sm80-sm90, but was built for sm%d\n", - int(__CUDA_ARCH__ + 0) / 10); -#endif -} #endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k32.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k32.cu index cfdad7bbc0..8148093759 100644 --- a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k32.cu +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k32.cu @@ -59,23 +59,4 @@ fmha_cutlassB_f16_notaligned_k32_sm75(typename AttentionBackwardKernel::kNumThreads, - AttentionBackwardKernel::kMinBlocksPerSm) -fmha_cutlassB_f16_notaligned_k32_sm80(typename AttentionBackwardKernel::Params p) { -#ifdef __CUDA_ARCH__ -#if __CUDA_ARCH__ >= 800 -#if __CUDA_ARCH__ < 900 - if (!p.advance_to_block()) { - return; - } - AttentionBackwardKernel::attention_kernel(p); - return; -#endif -#endif - printf( - "FATAL: kernel `fmha_cutlassB_f16_notaligned_k32_sm80` is for sm80-sm90, but was built for sm%d\n", - int(__CUDA_ARCH__ + 0) / 10); -#endif -} #endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k32_dropout.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k32_dropout.cu index 6ac12e6523..137cc2a4d9 100644 --- 
a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k32_dropout.cu +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k32_dropout.cu @@ -59,23 +59,4 @@ fmha_cutlassB_f16_notaligned_k32_dropout_sm75(typename AttentionBackwardKernel::kNumThreads, - AttentionBackwardKernel::kMinBlocksPerSm) -fmha_cutlassB_f16_notaligned_k32_dropout_sm80(typename AttentionBackwardKernel::Params p) { -#ifdef __CUDA_ARCH__ -#if __CUDA_ARCH__ >= 800 -#if __CUDA_ARCH__ < 900 - if (!p.advance_to_block()) { - return; - } - AttentionBackwardKernel::attention_kernel(p); - return; -#endif -#endif - printf( - "FATAL: kernel `fmha_cutlassB_f16_notaligned_k32_dropout_sm80` is for sm80-sm90, but was built for sm%d\n", - int(__CUDA_ARCH__ + 0) / 10); -#endif -} #endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k64.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k64.cu index 555de36efb..d6522fdfe5 100644 --- a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k64.cu +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k64.cu @@ -59,23 +59,4 @@ fmha_cutlassB_f16_notaligned_k64_sm75(typename AttentionBackwardKernel::kNumThreads, - AttentionBackwardKernel::kMinBlocksPerSm) -fmha_cutlassB_f16_notaligned_k64_sm80(typename AttentionBackwardKernel::Params p) { -#ifdef __CUDA_ARCH__ -#if __CUDA_ARCH__ >= 800 -#if __CUDA_ARCH__ < 900 - if (!p.advance_to_block()) { - return; - } - AttentionBackwardKernel::attention_kernel(p); - return; -#endif -#endif - printf( - "FATAL: kernel `fmha_cutlassB_f16_notaligned_k64_sm80` is for sm80-sm90, but was built for sm%d\n", - int(__CUDA_ARCH__ + 0) / 10); -#endif -} #endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k64_dropout.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k64_dropout.cu index 6475941590..b4570a9b1c 100644 --- a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k64_dropout.cu +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k64_dropout.cu @@ -59,23 +59,4 @@ fmha_cutlassB_f16_notaligned_k64_dropout_sm75(typename AttentionBackwardKernel::kNumThreads, - AttentionBackwardKernel::kMinBlocksPerSm) -fmha_cutlassB_f16_notaligned_k64_dropout_sm80(typename AttentionBackwardKernel::Params p) { -#ifdef __CUDA_ARCH__ -#if __CUDA_ARCH__ >= 800 -#if __CUDA_ARCH__ < 900 - if (!p.advance_to_block()) { - return; - } - AttentionBackwardKernel::attention_kernel(p); - return; -#endif -#endif - printf( - "FATAL: kernel `fmha_cutlassB_f16_notaligned_k64_dropout_sm80` is for sm80-sm90, but was built for sm%d\n", - int(__CUDA_ARCH__ + 0) / 10); -#endif -} #endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k65536.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k65536.cu index 167e9ba03a..a310fb46c9 100644 --- a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k65536.cu +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k65536.cu @@ -59,23 +59,4 @@ fmha_cutlassB_f16_notaligned_k65536_sm75(typename AttentionBackwardKernel::kNumThreads, - AttentionBackwardKernel::kMinBlocksPerSm) -fmha_cutlassB_f16_notaligned_k65536_sm80(typename AttentionBackwardKernel::Params p) { -#ifdef __CUDA_ARCH__ -#if __CUDA_ARCH__ >= 800 -#if __CUDA_ARCH__ < 900 - if 
(!p.advance_to_block()) { - return; - } - AttentionBackwardKernel::attention_kernel(p); - return; -#endif -#endif - printf( - "FATAL: kernel `fmha_cutlassB_f16_notaligned_k65536_sm80` is for sm80-sm90, but was built for sm%d\n", - int(__CUDA_ARCH__ + 0) / 10); -#endif -} #endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k65536_dropout.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k65536_dropout.cu index 6802f67bff..b3647e7ef6 100644 --- a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k65536_dropout.cu +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k65536_dropout.cu @@ -59,23 +59,4 @@ fmha_cutlassB_f16_notaligned_k65536_dropout_sm75(typename AttentionBackwardKerne int(__CUDA_ARCH__ + 0) / 10); #endif } -__global__ void __launch_bounds__( - AttentionBackwardKernel::kNumThreads, - AttentionBackwardKernel::kMinBlocksPerSm) -fmha_cutlassB_f16_notaligned_k65536_dropout_sm80(typename AttentionBackwardKernel::Params p) { -#ifdef __CUDA_ARCH__ -#if __CUDA_ARCH__ >= 800 -#if __CUDA_ARCH__ < 900 - if (!p.advance_to_block()) { - return; - } - AttentionBackwardKernel::attention_kernel(p); - return; -#endif -#endif - printf( - "FATAL: kernel `fmha_cutlassB_f16_notaligned_k65536_dropout_sm80` is for sm80-sm90, but was built for sm%d\n", - int(__CUDA_ARCH__ + 0) / 10); -#endif -} #endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k128.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k128.cu index 7bea2d16c5..5a0701a26d 100644 --- a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k128.cu +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k128.cu @@ -59,23 +59,4 @@ fmha_cutlassB_f32_notaligned_k128_sm75(typename AttentionBackwardKernel::kNumThreads, - AttentionBackwardKernel::kMinBlocksPerSm) -fmha_cutlassB_f32_notaligned_k128_sm80(typename AttentionBackwardKernel::Params p) { -#ifdef __CUDA_ARCH__ -#if __CUDA_ARCH__ >= 800 -#if __CUDA_ARCH__ < 900 - if (!p.advance_to_block()) { - return; - } - AttentionBackwardKernel::attention_kernel(p); - return; -#endif -#endif - printf( - "FATAL: kernel `fmha_cutlassB_f32_notaligned_k128_sm80` is for sm80-sm90, but was built for sm%d\n", - int(__CUDA_ARCH__ + 0) / 10); -#endif -} #endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k128_dropout.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k128_dropout.cu index 58a69bfa47..e212493da3 100644 --- a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k128_dropout.cu +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k128_dropout.cu @@ -59,23 +59,4 @@ fmha_cutlassB_f32_notaligned_k128_dropout_sm75(typename AttentionBackwardKernel< int(__CUDA_ARCH__ + 0) / 10); #endif } -__global__ void __launch_bounds__( - AttentionBackwardKernel::kNumThreads, - AttentionBackwardKernel::kMinBlocksPerSm) -fmha_cutlassB_f32_notaligned_k128_dropout_sm80(typename AttentionBackwardKernel::Params p) { -#ifdef __CUDA_ARCH__ -#if __CUDA_ARCH__ >= 800 -#if __CUDA_ARCH__ < 900 - if (!p.advance_to_block()) { - return; - } - AttentionBackwardKernel::attention_kernel(p); - return; -#endif -#endif - printf( - "FATAL: kernel `fmha_cutlassB_f32_notaligned_k128_dropout_sm80` is for sm80-sm90, but was built for sm%d\n", - 
int(__CUDA_ARCH__ + 0) / 10); -#endif -} #endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k32.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k32.cu index d8651a07f0..c935a4a2c8 100644 --- a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k32.cu +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k32.cu @@ -59,23 +59,4 @@ fmha_cutlassB_f32_notaligned_k32_sm75(typename AttentionBackwardKernel::kNumThreads, - AttentionBackwardKernel::kMinBlocksPerSm) -fmha_cutlassB_f32_notaligned_k32_sm80(typename AttentionBackwardKernel::Params p) { -#ifdef __CUDA_ARCH__ -#if __CUDA_ARCH__ >= 800 -#if __CUDA_ARCH__ < 900 - if (!p.advance_to_block()) { - return; - } - AttentionBackwardKernel::attention_kernel(p); - return; -#endif -#endif - printf( - "FATAL: kernel `fmha_cutlassB_f32_notaligned_k32_sm80` is for sm80-sm90, but was built for sm%d\n", - int(__CUDA_ARCH__ + 0) / 10); -#endif -} #endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k32_dropout.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k32_dropout.cu index 1e0f58b639..aa895739cf 100644 --- a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k32_dropout.cu +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k32_dropout.cu @@ -59,23 +59,4 @@ fmha_cutlassB_f32_notaligned_k32_dropout_sm75(typename AttentionBackwardKernel::kNumThreads, - AttentionBackwardKernel::kMinBlocksPerSm) -fmha_cutlassB_f32_notaligned_k32_dropout_sm80(typename AttentionBackwardKernel::Params p) { -#ifdef __CUDA_ARCH__ -#if __CUDA_ARCH__ >= 800 -#if __CUDA_ARCH__ < 900 - if (!p.advance_to_block()) { - return; - } - AttentionBackwardKernel::attention_kernel(p); - return; -#endif -#endif - printf( - "FATAL: kernel `fmha_cutlassB_f32_notaligned_k32_dropout_sm80` is for sm80-sm90, but was built for sm%d\n", - int(__CUDA_ARCH__ + 0) / 10); -#endif -} #endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k64.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k64.cu index 0d66ac6deb..e7f2a6573c 100644 --- a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k64.cu +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k64.cu @@ -59,23 +59,4 @@ fmha_cutlassB_f32_notaligned_k64_sm75(typename AttentionBackwardKernel::kNumThreads, - AttentionBackwardKernel::kMinBlocksPerSm) -fmha_cutlassB_f32_notaligned_k64_sm80(typename AttentionBackwardKernel::Params p) { -#ifdef __CUDA_ARCH__ -#if __CUDA_ARCH__ >= 800 -#if __CUDA_ARCH__ < 900 - if (!p.advance_to_block()) { - return; - } - AttentionBackwardKernel::attention_kernel(p); - return; -#endif -#endif - printf( - "FATAL: kernel `fmha_cutlassB_f32_notaligned_k64_sm80` is for sm80-sm90, but was built for sm%d\n", - int(__CUDA_ARCH__ + 0) / 10); -#endif -} #endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k64_dropout.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k64_dropout.cu index aeebeb5707..bca708a168 100644 --- a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k64_dropout.cu +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k64_dropout.cu @@ -59,23 +59,4 @@ 
fmha_cutlassB_f32_notaligned_k64_dropout_sm75(typename AttentionBackwardKernel::kNumThreads, - AttentionBackwardKernel::kMinBlocksPerSm) -fmha_cutlassB_f32_notaligned_k64_dropout_sm80(typename AttentionBackwardKernel::Params p) { -#ifdef __CUDA_ARCH__ -#if __CUDA_ARCH__ >= 800 -#if __CUDA_ARCH__ < 900 - if (!p.advance_to_block()) { - return; - } - AttentionBackwardKernel::attention_kernel(p); - return; -#endif -#endif - printf( - "FATAL: kernel `fmha_cutlassB_f32_notaligned_k64_dropout_sm80` is for sm80-sm90, but was built for sm%d\n", - int(__CUDA_ARCH__ + 0) / 10); -#endif -} #endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k65536.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k65536.cu index bb6195c01f..009e5b1f88 100644 --- a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k65536.cu +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k65536.cu @@ -59,23 +59,4 @@ fmha_cutlassB_f32_notaligned_k65536_sm75(typename AttentionBackwardKernel::kNumThreads, - AttentionBackwardKernel::kMinBlocksPerSm) -fmha_cutlassB_f32_notaligned_k65536_sm80(typename AttentionBackwardKernel::Params p) { -#ifdef __CUDA_ARCH__ -#if __CUDA_ARCH__ >= 800 -#if __CUDA_ARCH__ < 900 - if (!p.advance_to_block()) { - return; - } - AttentionBackwardKernel::attention_kernel(p); - return; -#endif -#endif - printf( - "FATAL: kernel `fmha_cutlassB_f32_notaligned_k65536_sm80` is for sm80-sm90, but was built for sm%d\n", - int(__CUDA_ARCH__ + 0) / 10); -#endif -} #endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k65536_dropout.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k65536_dropout.cu index 3eba4d492d..ffa40a1c61 100644 --- a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k65536_dropout.cu +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k65536_dropout.cu @@ -59,23 +59,4 @@ fmha_cutlassB_f32_notaligned_k65536_dropout_sm75(typename AttentionBackwardKerne int(__CUDA_ARCH__ + 0) / 10); #endif } -__global__ void __launch_bounds__( - AttentionBackwardKernel::kNumThreads, - AttentionBackwardKernel::kMinBlocksPerSm) -fmha_cutlassB_f32_notaligned_k65536_dropout_sm80(typename AttentionBackwardKernel::Params p) { -#ifdef __CUDA_ARCH__ -#if __CUDA_ARCH__ >= 800 -#if __CUDA_ARCH__ < 900 - if (!p.advance_to_block()) { - return; - } - AttentionBackwardKernel::attention_kernel(p); - return; -#endif -#endif - printf( - "FATAL: kernel `fmha_cutlassB_f32_notaligned_k65536_dropout_sm80` is for sm80-sm90, but was built for sm%d\n", - int(__CUDA_ARCH__ + 0) / 10); -#endif -} #endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassF.h b/xformers/csrc/attention/cuda/fmha/kernels/cutlassF.h index b2d086bf7c..0813fede5b 100644 --- a/xformers/csrc/attention/cuda/fmha/kernels/cutlassF.h +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassF.h @@ -16,26 +16,11 @@ __global__ void __launch_bounds__( AttentionKernel::kNumThreads, AttentionKernel::kMinBlocksPerSm) fmha_cutlassF_bf16_aligned_32x128_gmem_sm80(typename AttentionKernel::Params p); -__global__ void __launch_bounds__( - AttentionKernel::kNumThreads, - AttentionKernel::kMinBlocksPerSm) -fmha_cutlassF_bf16_notaligned_64x64_rf_sm80(typename AttentionKernel::Params p); -__global__ void __launch_bounds__( - AttentionKernel::kNumThreads, - 
AttentionKernel::kMinBlocksPerSm) -fmha_cutlassF_bf16_notaligned_32x128_rf_sm80(typename AttentionKernel::Params p); -__global__ void __launch_bounds__( - AttentionKernel::kNumThreads, - AttentionKernel::kMinBlocksPerSm) -fmha_cutlassF_bf16_notaligned_32x128_gmem_sm80(typename AttentionKernel::Params p); template void dispatch_cutlassF_bf16_sm80(T cb) { cb(AttentionKernel(), fmha_cutlassF_bf16_aligned_64x64_rf_sm80); cb(AttentionKernel(), fmha_cutlassF_bf16_aligned_32x128_rf_sm80); cb(AttentionKernel(), fmha_cutlassF_bf16_aligned_32x128_gmem_sm80); - cb(AttentionKernel(), fmha_cutlassF_bf16_notaligned_64x64_rf_sm80); - cb(AttentionKernel(), fmha_cutlassF_bf16_notaligned_32x128_rf_sm80); - cb(AttentionKernel(), fmha_cutlassF_bf16_notaligned_32x128_gmem_sm80); } // ======== f16 / sm50 ======== @@ -156,26 +141,11 @@ __global__ void __launch_bounds__( AttentionKernel::kNumThreads, AttentionKernel::kMinBlocksPerSm) fmha_cutlassF_f16_aligned_32x128_gmem_sm80(typename AttentionKernel::Params p); -__global__ void __launch_bounds__( - AttentionKernel::kNumThreads, - AttentionKernel::kMinBlocksPerSm) -fmha_cutlassF_f16_notaligned_64x64_rf_sm80(typename AttentionKernel::Params p); -__global__ void __launch_bounds__( - AttentionKernel::kNumThreads, - AttentionKernel::kMinBlocksPerSm) -fmha_cutlassF_f16_notaligned_32x128_rf_sm80(typename AttentionKernel::Params p); -__global__ void __launch_bounds__( - AttentionKernel::kNumThreads, - AttentionKernel::kMinBlocksPerSm) -fmha_cutlassF_f16_notaligned_32x128_gmem_sm80(typename AttentionKernel::Params p); template void dispatch_cutlassF_f16_sm80(T cb) { cb(AttentionKernel(), fmha_cutlassF_f16_aligned_64x64_rf_sm80); cb(AttentionKernel(), fmha_cutlassF_f16_aligned_32x128_rf_sm80); cb(AttentionKernel(), fmha_cutlassF_f16_aligned_32x128_gmem_sm80); - cb(AttentionKernel(), fmha_cutlassF_f16_notaligned_64x64_rf_sm80); - cb(AttentionKernel(), fmha_cutlassF_f16_notaligned_32x128_rf_sm80); - cb(AttentionKernel(), fmha_cutlassF_f16_notaligned_32x128_gmem_sm80); } // ======== f32 / sm50 ======== @@ -296,26 +266,11 @@ __global__ void __launch_bounds__( AttentionKernel::kNumThreads, AttentionKernel::kMinBlocksPerSm) fmha_cutlassF_f32_aligned_32x128_gmem_sm80(typename AttentionKernel::Params p); -__global__ void __launch_bounds__( - AttentionKernel::kNumThreads, - AttentionKernel::kMinBlocksPerSm) -fmha_cutlassF_f32_notaligned_64x64_rf_sm80(typename AttentionKernel::Params p); -__global__ void __launch_bounds__( - AttentionKernel::kNumThreads, - AttentionKernel::kMinBlocksPerSm) -fmha_cutlassF_f32_notaligned_32x128_rf_sm80(typename AttentionKernel::Params p); -__global__ void __launch_bounds__( - AttentionKernel::kNumThreads, - AttentionKernel::kMinBlocksPerSm) -fmha_cutlassF_f32_notaligned_32x128_gmem_sm80(typename AttentionKernel::Params p); template void dispatch_cutlassF_f32_sm80(T cb) { cb(AttentionKernel(), fmha_cutlassF_f32_aligned_64x64_rf_sm80); cb(AttentionKernel(), fmha_cutlassF_f32_aligned_32x128_rf_sm80); cb(AttentionKernel(), fmha_cutlassF_f32_aligned_32x128_gmem_sm80); - cb(AttentionKernel(), fmha_cutlassF_f32_notaligned_64x64_rf_sm80); - cb(AttentionKernel(), fmha_cutlassF_f32_notaligned_32x128_rf_sm80); - cb(AttentionKernel(), fmha_cutlassF_f32_notaligned_32x128_gmem_sm80); } diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassF_f16_notaligned.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassF_f16_notaligned.cu index 27996604ce..68eef9bc50 100644 --- a/xformers/csrc/attention/cuda/fmha/kernels/cutlassF_f16_notaligned.cu +++ 
b/xformers/csrc/attention/cuda/fmha/kernels/cutlassF_f16_notaligned.cu @@ -59,25 +59,6 @@ fmha_cutlassF_f16_notaligned_64x64_rf_sm75(typename AttentionKernel::kNumThreads, - AttentionKernel::kMinBlocksPerSm) -fmha_cutlassF_f16_notaligned_64x64_rf_sm80(typename AttentionKernel::Params p) { -#ifdef __CUDA_ARCH__ -#if __CUDA_ARCH__ >= 800 -#if __CUDA_ARCH__ < 900 - if (!p.advance_to_block()) { - return; - } - AttentionKernel::attention_kernel(p); - return; -#endif -#endif - printf( - "FATAL: kernel `fmha_cutlassF_f16_notaligned_64x64_rf_sm80` is for sm80-sm90, but was built for sm%d\n", - int(__CUDA_ARCH__ + 0) / 10); -#endif -} __global__ void __launch_bounds__( AttentionKernel::kNumThreads, AttentionKernel::kMinBlocksPerSm) @@ -135,25 +116,6 @@ fmha_cutlassF_f16_notaligned_32x128_rf_sm75(typename AttentionKernel::kNumThreads, - AttentionKernel::kMinBlocksPerSm) -fmha_cutlassF_f16_notaligned_32x128_rf_sm80(typename AttentionKernel::Params p) { -#ifdef __CUDA_ARCH__ -#if __CUDA_ARCH__ >= 800 -#if __CUDA_ARCH__ < 900 - if (!p.advance_to_block()) { - return; - } - AttentionKernel::attention_kernel(p); - return; -#endif -#endif - printf( - "FATAL: kernel `fmha_cutlassF_f16_notaligned_32x128_rf_sm80` is for sm80-sm90, but was built for sm%d\n", - int(__CUDA_ARCH__ + 0) / 10); -#endif -} __global__ void __launch_bounds__( AttentionKernel::kNumThreads, AttentionKernel::kMinBlocksPerSm) @@ -211,23 +173,4 @@ fmha_cutlassF_f16_notaligned_32x128_gmem_sm75(typename AttentionKernel::kNumThreads, - AttentionKernel::kMinBlocksPerSm) -fmha_cutlassF_f16_notaligned_32x128_gmem_sm80(typename AttentionKernel::Params p) { -#ifdef __CUDA_ARCH__ -#if __CUDA_ARCH__ >= 800 -#if __CUDA_ARCH__ < 900 - if (!p.advance_to_block()) { - return; - } - AttentionKernel::attention_kernel(p); - return; -#endif -#endif - printf( - "FATAL: kernel `fmha_cutlassF_f16_notaligned_32x128_gmem_sm80` is for sm80-sm90, but was built for sm%d\n", - int(__CUDA_ARCH__ + 0) / 10); -#endif -} #endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_FORWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassF_f32_notaligned.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassF_f32_notaligned.cu index f2f29b6d99..13adba725c 100644 --- a/xformers/csrc/attention/cuda/fmha/kernels/cutlassF_f32_notaligned.cu +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassF_f32_notaligned.cu @@ -59,25 +59,6 @@ fmha_cutlassF_f32_notaligned_64x64_rf_sm75(typename AttentionKernel::kNumThreads, - AttentionKernel::kMinBlocksPerSm) -fmha_cutlassF_f32_notaligned_64x64_rf_sm80(typename AttentionKernel::Params p) { -#ifdef __CUDA_ARCH__ -#if __CUDA_ARCH__ >= 800 -#if __CUDA_ARCH__ < 900 - if (!p.advance_to_block()) { - return; - } - AttentionKernel::attention_kernel(p); - return; -#endif -#endif - printf( - "FATAL: kernel `fmha_cutlassF_f32_notaligned_64x64_rf_sm80` is for sm80-sm90, but was built for sm%d\n", - int(__CUDA_ARCH__ + 0) / 10); -#endif -} __global__ void __launch_bounds__( AttentionKernel::kNumThreads, AttentionKernel::kMinBlocksPerSm) @@ -135,25 +116,6 @@ fmha_cutlassF_f32_notaligned_32x128_rf_sm75(typename AttentionKernel::kNumThreads, - AttentionKernel::kMinBlocksPerSm) -fmha_cutlassF_f32_notaligned_32x128_rf_sm80(typename AttentionKernel::Params p) { -#ifdef __CUDA_ARCH__ -#if __CUDA_ARCH__ >= 800 -#if __CUDA_ARCH__ < 900 - if (!p.advance_to_block()) { - return; - } - AttentionKernel::attention_kernel(p); - return; -#endif -#endif - printf( - "FATAL: kernel `fmha_cutlassF_f32_notaligned_32x128_rf_sm80` is for sm80-sm90, but was built for sm%d\n", - 
int(__CUDA_ARCH__ + 0) / 10);
-#endif
-}
 __global__ void __launch_bounds__(
     AttentionKernel::kNumThreads,
     AttentionKernel::kMinBlocksPerSm)
@@ -211,23 +173,4 @@ fmha_cutlassF_f32_notaligned_32x128_gmem_sm75(typename AttentionKernel::kNumThreads,
-    AttentionKernel::kMinBlocksPerSm)
-fmha_cutlassF_f32_notaligned_32x128_gmem_sm80(typename AttentionKernel::Params p) {
-#ifdef __CUDA_ARCH__
-#if __CUDA_ARCH__ >= 800
-#if __CUDA_ARCH__ < 900
-  if (!p.advance_to_block()) {
-    return;
-  }
-  AttentionKernel::attention_kernel(p);
-  return;
-#endif
-#endif
-  printf(
-      "FATAL: kernel `fmha_cutlassF_f32_notaligned_32x128_gmem_sm80` is for sm80-sm90, but was built for sm%d\n",
-      int(__CUDA_ARCH__ + 0) / 10);
-#endif
-}
 #endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_FORWARD
diff --git a/xformers/csrc/attention/cuda/fmha/kernels/generate_kernels.py b/xformers/csrc/attention/cuda/fmha/kernels/generate_kernels.py
index 44358f7533..5b97030c93 100644
--- a/xformers/csrc/attention/cuda/fmha/kernels/generate_kernels.py
+++ b/xformers/csrc/attention/cuda/fmha/kernels/generate_kernels.py
@@ -120,6 +120,8 @@ def get_all(cls) -> List["FwdKernel"]:
             # Remove some kernels we don't use
             if dtype == "bf16" and sm < 80:
                 continue
+            if not aligned and sm >= 80:
+                continue
             for q, k, single_value_iter in [
                 (32, 128, True),
                 (32, 128, False),
@@ -211,6 +213,8 @@ def get_all(cls) -> List["BwdKernel"]:
             if dtype == "bf16" and sm < 80:
                 continue
+            if not aligned and sm >= 80:
+                continue
             kernels.append(
                 cls(
                     aligned=aligned,
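
Note on the generator change above: the two added `continue` statements in generate_kernels.py are what drop every `notaligned` kernel variant for sm80 and newer, which accounts for all of the deleted declarations, dispatch entries, and .cu definitions earlier in this patch. Below is a minimal, self-contained Python sketch of that filtering rule, not the actual generate_kernels.py: the DTYPES/SM_VERSIONS/MAX_K_VALUES tables, the BwdKernelVariant class, and the naming helper are illustrative assumptions; only the two skip conditions come from the patch.

import itertools
from dataclasses import dataclass
from typing import List

# Illustrative stand-ins for the generator's real tables (assumed, not from the patch).
DTYPES = ["f16", "bf16", "f32"]
SM_VERSIONS = [50, 70, 75, 80]
MAX_K_VALUES = [32, 64, 128, 2**16]


@dataclass
class BwdKernelVariant:
    aligned: bool
    dtype: str
    sm: int
    max_k: int

    @property
    def name(self) -> str:
        # Mirrors the naming pattern seen in the deleted symbols above.
        align_tag = "aligned" if self.aligned else "notaligned"
        return f"fmha_cutlassB_{self.dtype}_{align_tag}_k{self.max_k}_sm{self.sm}"


def enumerate_bwd_variants() -> List[BwdKernelVariant]:
    """Enumerate backward-kernel variants with the pruning rules visible in the patch."""
    variants = []
    for aligned, dtype, sm, max_k in itertools.product(
        [True, False], DTYPES, SM_VERSIONS, MAX_K_VALUES
    ):
        if dtype == "bf16" and sm < 80:
            continue  # bf16 kernels are only generated for sm80+
        if not aligned and sm >= 80:
            continue  # this patch: only aligned kernels are generated for sm80+
        variants.append(BwdKernelVariant(aligned, dtype, sm, max_k))
    return variants


if __name__ == "__main__":
    names = [v.name for v in enumerate_bwd_variants()]
    # After the filter, no notaligned variant targets sm80.
    assert not any("notaligned" in n and n.endswith("_sm80") for n in names)
    print(f"{len(names)} kernel variants")

Run under these assumptions, the enumeration emits only `_aligned_` names for sm80, matching the aligned-only dispatch_cutlassB_*_sm80 and dispatch_cutlassF_*_sm80 lists that remain in cutlassB.h and cutlassF.h after this patch.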