
Only use aligned kernels for Sm80
ghstack-source-id: 5482be804c74b33ce86686b86e617d4e5be4c6a2
Pull Request resolved: https://github.com/fairinternal/xformers/pull/449

__original_commit__ = fairinternal/xformers@14ececa1773732495f42b2680d42b38214b271f2
danthe3rd authored and xFormers Bot committed Feb 2, 2023
1 parent 87dc3a7 commit 7ba4c98
Showing 21 changed files with 4 additions and 583 deletions.
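The cutlassB header pairs each generated kernel with a dispatch_* helper that hands every registered kernel, plus its __global__ entry point, to a caller-supplied callback; this commit drops the notaligned Sm80 variants from those lists, so only the aligned kernels are enumerated. A minimal sketch of how such a dispatcher can be consumed follows — the helper name list_remaining_bf16_sm80_kernels and the printing are illustrative assumptions, not part of this diff, and it presumes the generated cutlassB.h is on the include path.

// Illustrative sketch only (not code from this commit): enumerate the kernels
// that dispatch_cutlassB_bf16_sm80 still registers after this change and
// print each one's compile-time launch configuration.
#include <cstdio>

void list_remaining_bf16_sm80_kernels() {
  dispatch_cutlassB_bf16_sm80([](auto kernel, auto kernel_fn) {
    using Kernel = decltype(kernel);
    (void)kernel_fn;  // the __global__ entry point; not launched here
    std::printf("threads=%d, minBlocksPerSm=%d\n",
                int(Kernel::kNumThreads), int(Kernel::kMinBlocksPerSm));
  });
}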
120 changes: 0 additions & 120 deletions xformers/csrc/attention/cuda/fmha/kernels/cutlassB.h
@@ -546,38 +546,6 @@ __global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, true, true, 65536>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, true, true, 65536>::kMinBlocksPerSm)
fmha_cutlassB_bf16_aligned_k65536_dropout_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, true, true, 65536>::Params p);
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, false, false, 32>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, false, false, 32>::kMinBlocksPerSm)
fmha_cutlassB_bf16_notaligned_k32_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, false, false, 32>::Params p);
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, false, false, 64>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, false, false, 64>::kMinBlocksPerSm)
fmha_cutlassB_bf16_notaligned_k64_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, false, false, 64>::Params p);
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, false, false, 128>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, false, false, 128>::kMinBlocksPerSm)
fmha_cutlassB_bf16_notaligned_k128_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, false, false, 128>::Params p);
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, false, false, 65536>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, false, false, 65536>::kMinBlocksPerSm)
fmha_cutlassB_bf16_notaligned_k65536_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, false, false, 65536>::Params p);
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, false, true, 32>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, false, true, 32>::kMinBlocksPerSm)
fmha_cutlassB_bf16_notaligned_k32_dropout_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, false, true, 32>::Params p);
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, false, true, 64>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, false, true, 64>::kMinBlocksPerSm)
fmha_cutlassB_bf16_notaligned_k64_dropout_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, false, true, 64>::Params p);
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, false, true, 128>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, false, true, 128>::kMinBlocksPerSm)
fmha_cutlassB_bf16_notaligned_k128_dropout_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, false, true, 128>::Params p);
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, false, true, 65536>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, false, true, 65536>::kMinBlocksPerSm)
fmha_cutlassB_bf16_notaligned_k65536_dropout_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, false, true, 65536>::Params p);

template <typename T> void dispatch_cutlassB_bf16_sm80(T cb) {
cb(AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, true, false, 32>(), fmha_cutlassB_bf16_aligned_k32_sm80);
@@ -588,14 +556,6 @@ template <typename T> void dispatch_cutlassB_bf16_sm80(T cb) {
cb(AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, true, true, 64>(), fmha_cutlassB_bf16_aligned_k64_dropout_sm80);
cb(AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, true, true, 128>(), fmha_cutlassB_bf16_aligned_k128_dropout_sm80);
cb(AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, true, true, 65536>(), fmha_cutlassB_bf16_aligned_k65536_dropout_sm80);
cb(AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, false, false, 32>(), fmha_cutlassB_bf16_notaligned_k32_sm80);
cb(AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, false, false, 64>(), fmha_cutlassB_bf16_notaligned_k64_sm80);
cb(AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, false, false, 128>(), fmha_cutlassB_bf16_notaligned_k128_sm80);
cb(AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, false, false, 65536>(), fmha_cutlassB_bf16_notaligned_k65536_sm80);
cb(AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, false, true, 32>(), fmha_cutlassB_bf16_notaligned_k32_dropout_sm80);
cb(AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, false, true, 64>(), fmha_cutlassB_bf16_notaligned_k64_dropout_sm80);
cb(AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, false, true, 128>(), fmha_cutlassB_bf16_notaligned_k128_dropout_sm80);
cb(AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, false, true, 65536>(), fmha_cutlassB_bf16_notaligned_k65536_dropout_sm80);
}

// ======== f16 / sm80 ========
@@ -631,38 +591,6 @@ __global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, true, true, 65536>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, true, true, 65536>::kMinBlocksPerSm)
fmha_cutlassB_f16_aligned_k65536_dropout_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, true, true, 65536>::Params p);
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, false, 32>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, false, 32>::kMinBlocksPerSm)
fmha_cutlassB_f16_notaligned_k32_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, false, 32>::Params p);
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, false, 64>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, false, 64>::kMinBlocksPerSm)
fmha_cutlassB_f16_notaligned_k64_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, false, 64>::Params p);
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, false, 128>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, false, 128>::kMinBlocksPerSm)
fmha_cutlassB_f16_notaligned_k128_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, false, 128>::Params p);
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, false, 65536>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, false, 65536>::kMinBlocksPerSm)
fmha_cutlassB_f16_notaligned_k65536_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, false, 65536>::Params p);
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, true, 32>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, true, 32>::kMinBlocksPerSm)
fmha_cutlassB_f16_notaligned_k32_dropout_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, true, 32>::Params p);
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, true, 64>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, true, 64>::kMinBlocksPerSm)
fmha_cutlassB_f16_notaligned_k64_dropout_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, true, 64>::Params p);
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, true, 128>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, true, 128>::kMinBlocksPerSm)
fmha_cutlassB_f16_notaligned_k128_dropout_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, true, 128>::Params p);
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, true, 65536>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, true, 65536>::kMinBlocksPerSm)
fmha_cutlassB_f16_notaligned_k65536_dropout_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, true, 65536>::Params p);

template <typename T> void dispatch_cutlassB_f16_sm80(T cb) {
cb(AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, true, false, 32>(), fmha_cutlassB_f16_aligned_k32_sm80);
@@ -673,14 +601,6 @@ template <typename T> void dispatch_cutlassB_f16_sm80(T cb) {
cb(AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, true, true, 64>(), fmha_cutlassB_f16_aligned_k64_dropout_sm80);
cb(AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, true, true, 128>(), fmha_cutlassB_f16_aligned_k128_dropout_sm80);
cb(AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, true, true, 65536>(), fmha_cutlassB_f16_aligned_k65536_dropout_sm80);
cb(AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, false, 32>(), fmha_cutlassB_f16_notaligned_k32_sm80);
cb(AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, false, 64>(), fmha_cutlassB_f16_notaligned_k64_sm80);
cb(AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, false, 128>(), fmha_cutlassB_f16_notaligned_k128_sm80);
cb(AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, false, 65536>(), fmha_cutlassB_f16_notaligned_k65536_sm80);
cb(AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, true, 32>(), fmha_cutlassB_f16_notaligned_k32_dropout_sm80);
cb(AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, true, 64>(), fmha_cutlassB_f16_notaligned_k64_dropout_sm80);
cb(AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, true, 128>(), fmha_cutlassB_f16_notaligned_k128_dropout_sm80);
cb(AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, true, 65536>(), fmha_cutlassB_f16_notaligned_k65536_dropout_sm80);
}

// ======== f32 / sm80 ========
@@ -716,38 +636,6 @@ __global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm80, float, true, true, 65536>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm80, float, true, true, 65536>::kMinBlocksPerSm)
fmha_cutlassB_f32_aligned_k65536_dropout_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, float, true, true, 65536>::Params p);
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm80, float, false, false, 32>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm80, float, false, false, 32>::kMinBlocksPerSm)
fmha_cutlassB_f32_notaligned_k32_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, float, false, false, 32>::Params p);
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm80, float, false, false, 64>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm80, float, false, false, 64>::kMinBlocksPerSm)
fmha_cutlassB_f32_notaligned_k64_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, float, false, false, 64>::Params p);
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm80, float, false, false, 128>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm80, float, false, false, 128>::kMinBlocksPerSm)
fmha_cutlassB_f32_notaligned_k128_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, float, false, false, 128>::Params p);
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm80, float, false, false, 65536>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm80, float, false, false, 65536>::kMinBlocksPerSm)
fmha_cutlassB_f32_notaligned_k65536_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, float, false, false, 65536>::Params p);
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm80, float, false, true, 32>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm80, float, false, true, 32>::kMinBlocksPerSm)
fmha_cutlassB_f32_notaligned_k32_dropout_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, float, false, true, 32>::Params p);
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm80, float, false, true, 64>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm80, float, false, true, 64>::kMinBlocksPerSm)
fmha_cutlassB_f32_notaligned_k64_dropout_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, float, false, true, 64>::Params p);
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm80, float, false, true, 128>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm80, float, false, true, 128>::kMinBlocksPerSm)
fmha_cutlassB_f32_notaligned_k128_dropout_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, float, false, true, 128>::Params p);
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm80, float, false, true, 65536>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm80, float, false, true, 65536>::kMinBlocksPerSm)
fmha_cutlassB_f32_notaligned_k65536_dropout_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, float, false, true, 65536>::Params p);

template <typename T> void dispatch_cutlassB_f32_sm80(T cb) {
cb(AttentionBackwardKernel<cutlass::arch::Sm80, float, true, false, 32>(), fmha_cutlassB_f32_aligned_k32_sm80);
@@ -758,14 +646,6 @@ template <typename T> void dispatch_cutlassB_f32_sm80(T cb) {
cb(AttentionBackwardKernel<cutlass::arch::Sm80, float, true, true, 64>(), fmha_cutlassB_f32_aligned_k64_dropout_sm80);
cb(AttentionBackwardKernel<cutlass::arch::Sm80, float, true, true, 128>(), fmha_cutlassB_f32_aligned_k128_dropout_sm80);
cb(AttentionBackwardKernel<cutlass::arch::Sm80, float, true, true, 65536>(), fmha_cutlassB_f32_aligned_k65536_dropout_sm80);
cb(AttentionBackwardKernel<cutlass::arch::Sm80, float, false, false, 32>(), fmha_cutlassB_f32_notaligned_k32_sm80);
cb(AttentionBackwardKernel<cutlass::arch::Sm80, float, false, false, 64>(), fmha_cutlassB_f32_notaligned_k64_sm80);
cb(AttentionBackwardKernel<cutlass::arch::Sm80, float, false, false, 128>(), fmha_cutlassB_f32_notaligned_k128_sm80);
cb(AttentionBackwardKernel<cutlass::arch::Sm80, float, false, false, 65536>(), fmha_cutlassB_f32_notaligned_k65536_sm80);
cb(AttentionBackwardKernel<cutlass::arch::Sm80, float, false, true, 32>(), fmha_cutlassB_f32_notaligned_k32_dropout_sm80);
cb(AttentionBackwardKernel<cutlass::arch::Sm80, float, false, true, 64>(), fmha_cutlassB_f32_notaligned_k64_dropout_sm80);
cb(AttentionBackwardKernel<cutlass::arch::Sm80, float, false, true, 128>(), fmha_cutlassB_f32_notaligned_k128_dropout_sm80);
cb(AttentionBackwardKernel<cutlass::arch::Sm80, float, false, true, 65536>(), fmha_cutlassB_f32_notaligned_k65536_dropout_sm80);
}


@@ -59,23 +59,4 @@ fmha_cutlassB_f16_notaligned_k128_sm75(typename AttentionBackwardKernel<cutlass:
int(__CUDA_ARCH__ + 0) / 10);
#endif
}
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, false, 128>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, false, 128>::kMinBlocksPerSm)
fmha_cutlassB_f16_notaligned_k128_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, false, 128>::Params p) {
#ifdef __CUDA_ARCH__
#if __CUDA_ARCH__ >= 800
#if __CUDA_ARCH__ < 900
if (!p.advance_to_block()) {
return;
}
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, false, 128>::attention_kernel(p);
return;
#endif
#endif
printf(
"FATAL: kernel `fmha_cutlassB_f16_notaligned_k128_sm80` is for sm80-sm90, but was built for sm%d\n",
int(__CUDA_ARCH__ + 0) / 10);
#endif
}
#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD
@@ -59,23 +59,4 @@ fmha_cutlassB_f16_notaligned_k128_dropout_sm75(typename AttentionBackwardKernel<
int(__CUDA_ARCH__ + 0) / 10);
#endif
}
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, true, 128>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, true, 128>::kMinBlocksPerSm)
fmha_cutlassB_f16_notaligned_k128_dropout_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, true, 128>::Params p) {
#ifdef __CUDA_ARCH__
#if __CUDA_ARCH__ >= 800
#if __CUDA_ARCH__ < 900
if (!p.advance_to_block()) {
return;
}
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, true, 128>::attention_kernel(p);
return;
#endif
#endif
printf(
"FATAL: kernel `fmha_cutlassB_f16_notaligned_k128_dropout_sm80` is for sm80-sm90, but was built for sm%d\n",
int(__CUDA_ARCH__ + 0) / 10);
#endif
}
#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD
@@ -59,23 +59,4 @@ fmha_cutlassB_f16_notaligned_k32_sm75(typename AttentionBackwardKernel<cutlass::
int(__CUDA_ARCH__ + 0) / 10);
#endif
}
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, false, 32>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, false, 32>::kMinBlocksPerSm)
fmha_cutlassB_f16_notaligned_k32_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, false, 32>::Params p) {
#ifdef __CUDA_ARCH__
#if __CUDA_ARCH__ >= 800
#if __CUDA_ARCH__ < 900
if (!p.advance_to_block()) {
return;
}
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, false, 32>::attention_kernel(p);
return;
#endif
#endif
printf(
"FATAL: kernel `fmha_cutlassB_f16_notaligned_k32_sm80` is for sm80-sm90, but was built for sm%d\n",
int(__CUDA_ARCH__ + 0) / 10);
#endif
}
#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD
@@ -59,23 +59,4 @@ fmha_cutlassB_f16_notaligned_k32_dropout_sm75(typename AttentionBackwardKernel<c
int(__CUDA_ARCH__ + 0) / 10);
#endif
}
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, true, 32>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, true, 32>::kMinBlocksPerSm)
fmha_cutlassB_f16_notaligned_k32_dropout_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, true, 32>::Params p) {
#ifdef __CUDA_ARCH__
#if __CUDA_ARCH__ >= 800
#if __CUDA_ARCH__ < 900
if (!p.advance_to_block()) {
return;
}
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, false, true, 32>::attention_kernel(p);
return;
#endif
#endif
printf(
"FATAL: kernel `fmha_cutlassB_f16_notaligned_k32_dropout_sm80` is for sm80-sm90, but was built for sm%d\n",
int(__CUDA_ARCH__ + 0) / 10);
#endif
}
#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD
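Every deleted .cu wrapper above follows the same structure; the condensed, commented sketch below is only an annotated restatement of that shared guard pattern — ExampleKernel and example_wrapper_sm80 are hypothetical names, not xformers symbols, and the fmha kernel headers defining AttentionBackwardKernel are assumed to be included.

// Hypothetical, annotated form of the guard pattern used by the removed
// wrappers. Assumes the xformers fmha headers (AttentionBackwardKernel,
// cutlass types) are already included.
#include <cstdio>

using ExampleKernel =
    AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, true, false, 128>;

__global__ void __launch_bounds__(
    ExampleKernel::kNumThreads,
    ExampleKernel::kMinBlocksPerSm)
example_wrapper_sm80(ExampleKernel::Params p) {
#ifdef __CUDA_ARCH__
#if __CUDA_ARCH__ >= 800 && __CUDA_ARCH__ < 900
  // Each block positions itself on its slice of the problem; blocks with
  // nothing left to do exit immediately.
  if (!p.advance_to_block()) {
    return;
  }
  ExampleKernel::attention_kernel(p);
  return;
#endif
  // Reached only when this translation unit was compiled for an architecture
  // outside the sm80-sm90 range the wrapper targets.
  printf(
      "FATAL: kernel `example_wrapper_sm80` is for sm80-sm90, but was built for sm%d\n",
      int(__CUDA_ARCH__ + 0) / 10);
#endif
}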