cutlassB: Support Sm86/Sm89
We reduce shared-memory usage by reducing the block size, as these GPUs have 99 KB of shared memory per block instead of the 163 KB available on A100.

ghstack-source-id: 7c0d0460b88959a3f98aa5154882679921555b71
Pull Request resolved: https://github.com/fairinternal/xformers/pull/454

__original_commit__ = fairinternal/xformers@c8072ad23064ce367017b78f06bea0b399fadc91
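To illustrate the reasoning in the commit message, here is a minimal, hypothetical sketch of picking a smaller backward tile when the device exposes less shared memory per block. It is not the actual xformers dispatch code, which instead instantiates separate 64x64 and 128x128 kernels per architecture at compile time, as the generated stubs below show; the helper name `pick_backward_tile` and the 160 KB threshold are assumptions made for this sketch.

// Hypothetical illustration only -- not the actual xformers dispatch logic.
#include <cuda_runtime.h>
#include <cstdio>

// Choose the backward tile size from the shared memory the device can give a
// single block: roughly 163 KB on A100 (sm80), roughly 99 KB on sm86/sm89.
static void pick_backward_tile(int device, int* block_i, int* block_j) {
  cudaDeviceProp prop{};
  if (cudaGetDeviceProperties(&prop, device) != cudaSuccess) {
    *block_i = *block_j = 64;  // conservative fallback
    return;
  }
  size_t smem = prop.sharedMemPerBlockOptin;  // opt-in per-block maximum
  if (smem >= 160 * 1024) {
    *block_i = 128;  // the large tile only fits on A100-class parts
    *block_j = 128;
  } else {
    *block_i = 64;   // smaller tile keeps shared-memory usage under 99 KB
    *block_j = 64;
  }
}

int main() {
  int bi = 0, bj = 0;
  pick_backward_tile(/*device=*/0, &bi, &bj);
  printf("backward tile: %dx%d\n", bi, bj);
  return 0;
}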
danthe3rd authored and xFormers Bot committed Feb 2, 2023
1 parent 8b82140 commit 615175f
Showing 19 changed files with 693 additions and 32 deletions.
140 changes: 140 additions & 0 deletions xformers/csrc/attention/cuda/fmha/kernels/cutlassB.h

Large diffs are not rendered by default.

@@ -21,4 +21,23 @@ fmha_cutlassB_bf16_aligned_128x128_k128_sm80(typename AttentionBackwardKernel<cu
int(__CUDA_ARCH__ + 0) / 10);
#endif
}
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, true, false, false, 64, 64, 128>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, true, false, false, 64, 64, 128>::kMinBlocksPerSm)
fmha_cutlassB_bf16_aligned_64x64_k128_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, true, false, false, 64, 64, 128>::Params p) {
#ifdef __CUDA_ARCH__
#if __CUDA_ARCH__ >= 800
#if __CUDA_ARCH__ < 900
if (!p.advance_to_block()) {
return;
}
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, true, false, false, 64, 64, 128>::attention_kernel(p);
return;
#endif
#endif
printf(
"FATAL: kernel `fmha_cutlassB_bf16_aligned_64x64_k128_sm80` is for sm80-sm90, but was built for sm%d\n",
int(__CUDA_ARCH__ + 0) / 10);
#endif
}
#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD
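All of the dispatch stubs added in this commit share the same architecture-guard shape. The self-contained sketch below restates that pattern with comments; `DummyKernel` is a stand-in invented for this sketch, not a type from the codebase.

#include <cstdio>

// Stand-in for a concrete AttentionBackwardKernel instantiation; only the
// members the stub touches are modeled here.
struct DummyKernel {
  struct Params {
    __device__ bool advance_to_block() { return true; }
  };
  static constexpr int kNumThreads = 128;
  static constexpr int kMinBlocksPerSm = 1;
  static __device__ void attention_kernel(Params&) { /* real work elided */ }
};

__global__ void __launch_bounds__(DummyKernel::kNumThreads,
                                  DummyKernel::kMinBlocksPerSm)
    example_stub_sm80(DummyKernel::Params p) {
#ifdef __CUDA_ARCH__            // defined only in device compilation passes
#if __CUDA_ARCH__ >= 800        // lower bound of the range this stub targets
#if __CUDA_ARCH__ < 900         // upper bound: sm80 through sm89 inclusive
  if (!p.advance_to_block()) {
    return;                     // this CUDA block has no tile assigned to it
  }
  DummyKernel::attention_kernel(p);
  return;
#endif
#endif
  // Only reached when the stub was compiled for an arch outside its range:
  // fail loudly rather than silently doing nothing.
  printf("FATAL: stub is for sm80-sm90, but was built for sm%d\n",
         int(__CUDA_ARCH__ + 0) / 10);
#endif
}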
@@ -21,4 +21,23 @@ fmha_cutlassB_bf16_aligned_128x128_k128_dropout_sm80(typename AttentionBackwardK
int(__CUDA_ARCH__ + 0) / 10);
#endif
}
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, true, true, false, 64, 64, 128>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, true, true, false, 64, 64, 128>::kMinBlocksPerSm)
fmha_cutlassB_bf16_aligned_64x64_k128_dropout_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, true, true, false, 64, 64, 128>::Params p) {
#ifdef __CUDA_ARCH__
#if __CUDA_ARCH__ >= 800
#if __CUDA_ARCH__ < 900
if (!p.advance_to_block()) {
return;
}
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, true, true, false, 64, 64, 128>::attention_kernel(p);
return;
#endif
#endif
printf(
"FATAL: kernel `fmha_cutlassB_bf16_aligned_64x64_k128_dropout_sm80` is for sm80-sm90, but was built for sm%d\n",
int(__CUDA_ARCH__ + 0) / 10);
#endif
}
#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD
@@ -21,4 +21,23 @@ fmha_cutlassB_bf16_aligned_128x64_k65536_sm80(typename AttentionBackwardKernel<c
int(__CUDA_ARCH__ + 0) / 10);
#endif
}
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, true, false, false, 64, 64, 65536>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, true, false, false, 64, 64, 65536>::kMinBlocksPerSm)
fmha_cutlassB_bf16_aligned_64x64_k65536_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, true, false, false, 64, 64, 65536>::Params p) {
#ifdef __CUDA_ARCH__
#if __CUDA_ARCH__ >= 800
#if __CUDA_ARCH__ < 900
if (!p.advance_to_block()) {
return;
}
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, true, false, false, 64, 64, 65536>::attention_kernel(p);
return;
#endif
#endif
printf(
"FATAL: kernel `fmha_cutlassB_bf16_aligned_64x64_k65536_sm80` is for sm80-sm90, but was built for sm%d\n",
int(__CUDA_ARCH__ + 0) / 10);
#endif
}
#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD
@@ -21,4 +21,23 @@ fmha_cutlassB_bf16_aligned_128x64_k65536_dropout_sm80(typename AttentionBackward
int(__CUDA_ARCH__ + 0) / 10);
#endif
}
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, true, true, false, 64, 64, 65536>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, true, true, false, 64, 64, 65536>::kMinBlocksPerSm)
fmha_cutlassB_bf16_aligned_64x64_k65536_dropout_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, true, true, false, 64, 64, 65536>::Params p) {
#ifdef __CUDA_ARCH__
#if __CUDA_ARCH__ >= 800
#if __CUDA_ARCH__ < 900
if (!p.advance_to_block()) {
return;
}
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, true, true, false, 64, 64, 65536>::attention_kernel(p);
return;
#endif
#endif
printf(
"FATAL: kernel `fmha_cutlassB_bf16_aligned_64x64_k65536_dropout_sm80` is for sm80-sm90, but was built for sm%d\n",
int(__CUDA_ARCH__ + 0) / 10);
#endif
}
#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD
@@ -78,4 +78,61 @@ fmha_cutlassB_f16_aligned_64x64_k128_sm50(typename AttentionBackwardKernel<cutla
int(__CUDA_ARCH__ + 0) / 10);
#endif
}
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm70, cutlass::half_t, true, false, false, 64, 64, 128>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm70, cutlass::half_t, true, false, false, 64, 64, 128>::kMinBlocksPerSm)
fmha_cutlassB_f16_aligned_64x64_k128_sm70(typename AttentionBackwardKernel<cutlass::arch::Sm70, cutlass::half_t, true, false, false, 64, 64, 128>::Params p) {
#ifdef __CUDA_ARCH__
#if __CUDA_ARCH__ >= 700
#if __CUDA_ARCH__ < 750
if (!p.advance_to_block()) {
return;
}
AttentionBackwardKernel<cutlass::arch::Sm70, cutlass::half_t, true, false, false, 64, 64, 128>::attention_kernel(p);
return;
#endif
#endif
printf(
"FATAL: kernel `fmha_cutlassB_f16_aligned_64x64_k128_sm70` is for sm70-sm75, but was built for sm%d\n",
int(__CUDA_ARCH__ + 0) / 10);
#endif
}
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm75, cutlass::half_t, true, false, false, 64, 64, 128>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm75, cutlass::half_t, true, false, false, 64, 64, 128>::kMinBlocksPerSm)
fmha_cutlassB_f16_aligned_64x64_k128_sm75(typename AttentionBackwardKernel<cutlass::arch::Sm75, cutlass::half_t, true, false, false, 64, 64, 128>::Params p) {
#ifdef __CUDA_ARCH__
#if __CUDA_ARCH__ >= 750
#if __CUDA_ARCH__ < 800
if (!p.advance_to_block()) {
return;
}
AttentionBackwardKernel<cutlass::arch::Sm75, cutlass::half_t, true, false, false, 64, 64, 128>::attention_kernel(p);
return;
#endif
#endif
printf(
"FATAL: kernel `fmha_cutlassB_f16_aligned_64x64_k128_sm75` is for sm75-sm80, but was built for sm%d\n",
int(__CUDA_ARCH__ + 0) / 10);
#endif
}
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, true, false, false, 64, 64, 128>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, true, false, false, 64, 64, 128>::kMinBlocksPerSm)
fmha_cutlassB_f16_aligned_64x64_k128_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, true, false, false, 64, 64, 128>::Params p) {
#ifdef __CUDA_ARCH__
#if __CUDA_ARCH__ >= 800
#if __CUDA_ARCH__ < 900
if (!p.advance_to_block()) {
return;
}
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, true, false, false, 64, 64, 128>::attention_kernel(p);
return;
#endif
#endif
printf(
"FATAL: kernel `fmha_cutlassB_f16_aligned_64x64_k128_sm80` is for sm80-sm90, but was built for sm%d\n",
int(__CUDA_ARCH__ + 0) / 10);
#endif
}
#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD
@@ -78,4 +78,61 @@ fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm50(typename AttentionBackwardKern
int(__CUDA_ARCH__ + 0) / 10);
#endif
}
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm70, cutlass::half_t, true, true, false, 64, 64, 128>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm70, cutlass::half_t, true, true, false, 64, 64, 128>::kMinBlocksPerSm)
fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm70(typename AttentionBackwardKernel<cutlass::arch::Sm70, cutlass::half_t, true, true, false, 64, 64, 128>::Params p) {
#ifdef __CUDA_ARCH__
#if __CUDA_ARCH__ >= 700
#if __CUDA_ARCH__ < 750
if (!p.advance_to_block()) {
return;
}
AttentionBackwardKernel<cutlass::arch::Sm70, cutlass::half_t, true, true, false, 64, 64, 128>::attention_kernel(p);
return;
#endif
#endif
printf(
"FATAL: kernel `fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm70` is for sm70-sm75, but was built for sm%d\n",
int(__CUDA_ARCH__ + 0) / 10);
#endif
}
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm75, cutlass::half_t, true, true, false, 64, 64, 128>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm75, cutlass::half_t, true, true, false, 64, 64, 128>::kMinBlocksPerSm)
fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm75(typename AttentionBackwardKernel<cutlass::arch::Sm75, cutlass::half_t, true, true, false, 64, 64, 128>::Params p) {
#ifdef __CUDA_ARCH__
#if __CUDA_ARCH__ >= 750
#if __CUDA_ARCH__ < 800
if (!p.advance_to_block()) {
return;
}
AttentionBackwardKernel<cutlass::arch::Sm75, cutlass::half_t, true, true, false, 64, 64, 128>::attention_kernel(p);
return;
#endif
#endif
printf(
"FATAL: kernel `fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm75` is for sm75-sm80, but was built for sm%d\n",
int(__CUDA_ARCH__ + 0) / 10);
#endif
}
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, true, true, false, 64, 64, 128>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, true, true, false, 64, 64, 128>::kMinBlocksPerSm)
fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, true, true, false, 64, 64, 128>::Params p) {
#ifdef __CUDA_ARCH__
#if __CUDA_ARCH__ >= 800
#if __CUDA_ARCH__ < 900
if (!p.advance_to_block()) {
return;
}
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, true, true, false, 64, 64, 128>::attention_kernel(p);
return;
#endif
#endif
printf(
"FATAL: kernel `fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm80` is for sm80-sm90, but was built for sm%d\n",
int(__CUDA_ARCH__ + 0) / 10);
#endif
}
#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD
@@ -78,4 +78,61 @@ fmha_cutlassB_f16_aligned_64x64_k65536_sm50(typename AttentionBackwardKernel<cut
int(__CUDA_ARCH__ + 0) / 10);
#endif
}
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm70, cutlass::half_t, true, false, false, 64, 64, 65536>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm70, cutlass::half_t, true, false, false, 64, 64, 65536>::kMinBlocksPerSm)
fmha_cutlassB_f16_aligned_64x64_k65536_sm70(typename AttentionBackwardKernel<cutlass::arch::Sm70, cutlass::half_t, true, false, false, 64, 64, 65536>::Params p) {
#ifdef __CUDA_ARCH__
#if __CUDA_ARCH__ >= 700
#if __CUDA_ARCH__ < 750
if (!p.advance_to_block()) {
return;
}
AttentionBackwardKernel<cutlass::arch::Sm70, cutlass::half_t, true, false, false, 64, 64, 65536>::attention_kernel(p);
return;
#endif
#endif
printf(
"FATAL: kernel `fmha_cutlassB_f16_aligned_64x64_k65536_sm70` is for sm70-sm75, but was built for sm%d\n",
int(__CUDA_ARCH__ + 0) / 10);
#endif
}
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm75, cutlass::half_t, true, false, false, 64, 64, 65536>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm75, cutlass::half_t, true, false, false, 64, 64, 65536>::kMinBlocksPerSm)
fmha_cutlassB_f16_aligned_64x64_k65536_sm75(typename AttentionBackwardKernel<cutlass::arch::Sm75, cutlass::half_t, true, false, false, 64, 64, 65536>::Params p) {
#ifdef __CUDA_ARCH__
#if __CUDA_ARCH__ >= 750
#if __CUDA_ARCH__ < 800
if (!p.advance_to_block()) {
return;
}
AttentionBackwardKernel<cutlass::arch::Sm75, cutlass::half_t, true, false, false, 64, 64, 65536>::attention_kernel(p);
return;
#endif
#endif
printf(
"FATAL: kernel `fmha_cutlassB_f16_aligned_64x64_k65536_sm75` is for sm75-sm80, but was built for sm%d\n",
int(__CUDA_ARCH__ + 0) / 10);
#endif
}
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, true, false, false, 64, 64, 65536>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, true, false, false, 64, 64, 65536>::kMinBlocksPerSm)
fmha_cutlassB_f16_aligned_64x64_k65536_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, true, false, false, 64, 64, 65536>::Params p) {
#ifdef __CUDA_ARCH__
#if __CUDA_ARCH__ >= 800
#if __CUDA_ARCH__ < 900
if (!p.advance_to_block()) {
return;
}
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, true, false, false, 64, 64, 65536>::attention_kernel(p);
return;
#endif
#endif
printf(
"FATAL: kernel `fmha_cutlassB_f16_aligned_64x64_k65536_sm80` is for sm80-sm90, but was built for sm%d\n",
int(__CUDA_ARCH__ + 0) / 10);
#endif
}
#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD
@@ -78,4 +78,61 @@ fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm50(typename AttentionBackwardKe
int(__CUDA_ARCH__ + 0) / 10);
#endif
}
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm70, cutlass::half_t, true, true, false, 64, 64, 65536>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm70, cutlass::half_t, true, true, false, 64, 64, 65536>::kMinBlocksPerSm)
fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm70(typename AttentionBackwardKernel<cutlass::arch::Sm70, cutlass::half_t, true, true, false, 64, 64, 65536>::Params p) {
#ifdef __CUDA_ARCH__
#if __CUDA_ARCH__ >= 700
#if __CUDA_ARCH__ < 750
if (!p.advance_to_block()) {
return;
}
AttentionBackwardKernel<cutlass::arch::Sm70, cutlass::half_t, true, true, false, 64, 64, 65536>::attention_kernel(p);
return;
#endif
#endif
printf(
"FATAL: kernel `fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm70` is for sm70-sm75, but was built for sm%d\n",
int(__CUDA_ARCH__ + 0) / 10);
#endif
}
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm75, cutlass::half_t, true, true, false, 64, 64, 65536>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm75, cutlass::half_t, true, true, false, 64, 64, 65536>::kMinBlocksPerSm)
fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm75(typename AttentionBackwardKernel<cutlass::arch::Sm75, cutlass::half_t, true, true, false, 64, 64, 65536>::Params p) {
#ifdef __CUDA_ARCH__
#if __CUDA_ARCH__ >= 750
#if __CUDA_ARCH__ < 800
if (!p.advance_to_block()) {
return;
}
AttentionBackwardKernel<cutlass::arch::Sm75, cutlass::half_t, true, true, false, 64, 64, 65536>::attention_kernel(p);
return;
#endif
#endif
printf(
"FATAL: kernel `fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm75` is for sm75-sm80, but was built for sm%d\n",
int(__CUDA_ARCH__ + 0) / 10);
#endif
}
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, true, true, false, 64, 64, 65536>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, true, true, false, 64, 64, 65536>::kMinBlocksPerSm)
fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, true, true, false, 64, 64, 65536>::Params p) {
#ifdef __CUDA_ARCH__
#if __CUDA_ARCH__ >= 800
#if __CUDA_ARCH__ < 900
if (!p.advance_to_block()) {
return;
}
AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::half_t, true, true, false, 64, 64, 65536>::attention_kernel(p);
return;
#endif
#endif
printf(
"FATAL: kernel `fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm80` is for sm80-sm90, but was built for sm%d\n",
int(__CUDA_ARCH__ + 0) / 10);
#endif
}
#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD
@@ -59,4 +59,42 @@ fmha_cutlassB_f16_notaligned_64x64_k128_sm50(typename AttentionBackwardKernel<cu
int(__CUDA_ARCH__ + 0) / 10);
#endif
}
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm70, cutlass::half_t, false, false, false, 64, 64, 128>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm70, cutlass::half_t, false, false, false, 64, 64, 128>::kMinBlocksPerSm)
fmha_cutlassB_f16_notaligned_64x64_k128_sm70(typename AttentionBackwardKernel<cutlass::arch::Sm70, cutlass::half_t, false, false, false, 64, 64, 128>::Params p) {
#ifdef __CUDA_ARCH__
#if __CUDA_ARCH__ >= 700
#if __CUDA_ARCH__ < 750
if (!p.advance_to_block()) {
return;
}
AttentionBackwardKernel<cutlass::arch::Sm70, cutlass::half_t, false, false, false, 64, 64, 128>::attention_kernel(p);
return;
#endif
#endif
printf(
"FATAL: kernel `fmha_cutlassB_f16_notaligned_64x64_k128_sm70` is for sm70-sm75, but was built for sm%d\n",
int(__CUDA_ARCH__ + 0) / 10);
#endif
}
__global__ void __launch_bounds__(
AttentionBackwardKernel<cutlass::arch::Sm75, cutlass::half_t, false, false, false, 64, 64, 128>::kNumThreads,
AttentionBackwardKernel<cutlass::arch::Sm75, cutlass::half_t, false, false, false, 64, 64, 128>::kMinBlocksPerSm)
fmha_cutlassB_f16_notaligned_64x64_k128_sm75(typename AttentionBackwardKernel<cutlass::arch::Sm75, cutlass::half_t, false, false, false, 64, 64, 128>::Params p) {
#ifdef __CUDA_ARCH__
#if __CUDA_ARCH__ >= 750
#if __CUDA_ARCH__ < 800
if (!p.advance_to_block()) {
return;
}
AttentionBackwardKernel<cutlass::arch::Sm75, cutlass::half_t, false, false, false, 64, 64, 128>::attention_kernel(p);
return;
#endif
#endif
printf(
"FATAL: kernel `fmha_cutlassB_f16_notaligned_64x64_k128_sm75` is for sm75-sm80, but was built for sm%d\n",
int(__CUDA_ARCH__ + 0) / 10);
#endif
}
#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD