diff --git a/.github/workflows/linters_reusable.yml b/.github/workflows/linters_reusable.yml
index 3e958e1d1d..a1209a3a40 100644
--- a/.github/workflows/linters_reusable.yml
+++ b/.github/workflows/linters_reusable.yml
@@ -45,5 +45,5 @@ jobs:
           sudo apt-get install clang-format
           clang-format --version
-          # apply to our files
-          ./.circleci/run-clang-format.py -r xformers/csrc
+          # apply to our files - excluding autogenerated files
+          ./.circleci/run-clang-format.py -e "*fmha/kernels" -r xformers/csrc
diff --git a/xformers/csrc/attention/cuda/fmha/attention_backward_generic.cu b/xformers/csrc/attention/cuda/fmha/attention_backward_generic.cu
index 7c356945da..ef617f4cd1 100644
--- a/xformers/csrc/attention/cuda/fmha/attention_backward_generic.cu
+++ b/xformers/csrc/attention/cuda/fmha/attention_backward_generic.cu
@@ -12,66 +12,9 @@
 #include "gemm_kernel_utils.h"
 #include "kernel_backward.h"
+#include "kernels/cutlassB.h"
 #include "pytorch_utils.h"
 
-#define DISPATCH_MAXK(func)                                   \
-  {                                                           \
-    const auto maxK = std::max(query.size(3), value.size(3)); \
-    if (maxK <= 64) {                                         \
-      constexpr int kMaxK = 64;                               \
-      func();                                                 \
-    } else if (maxK <= 128) {                                 \
-      constexpr int kMaxK = 128;                              \
-      func();                                                 \
-    } else {                                                  \
-      constexpr int kMaxK = std::numeric_limits<int>::max();  \
-      func();                                                 \
-    }                                                         \
-  }
-
-#define DISPATCH_KERNEL(QUERY, KEY, VALUE, USE_DROPOUT, FUNC)                 \
-  {                                                                           \
-    cudaDeviceProp* properties =                                              \
-        at::cuda::getDeviceProperties(QUERY.device().index());                \
-    const int computeCapability = properties->major * 10 + properties->minor; \
-    DISPATCH_MAXK(([&] {                                                      \
-      DISPATCH_TYPES(                                                         \
-          QUERY, ([&]() {                                                     \
-            DISPATCH_BOOL(                                                    \
-                USE_DROPOUT, kApplyDropout, ([&]() {                          \
-                  DISPATCH_ARCHTAG(                                           \
-                      computeCapability, ([&]() {                             \
-                        using AlignedAK = AttentionBackwardKernel<            \
-                            ArchTag,                                          \
-                            scalar_t,                                         \
-                            true,                                             \
-                            kApplyDropout,                                    \
-                            kMaxK>;                                           \
-                        bool isAligned =                                      \
-                            (QUERY.stride(2) %                                \
-                                 AlignedAK::kOptimalAlignement ==             \
-                             0 &&                                             \
-                             KEY.stride(2) % AlignedAK::kOptimalAlignement == \
-                                 0 &&                                         \
-                             VALUE.stride(2) %                                \
-                                 AlignedAK::kOptimalAlignement ==             \
-                             0);                                              \
-                        DISPATCH_BOOL(isAligned, kIsAligned, ([&]() {         \
-                                        using Kernel =                        \
-                                            AttentionBackwardKernel<          \
-                                                ArchTag,                      \
-                                                scalar_t,                     \
-                                                kIsAligned,                   \
-                                                kApplyDropout,                \
-                                                kMaxK>;                       \
-                                        FUNC();                               \
-                                      }))                                     \
-                      }))                                                     \
-                }))                                                           \
-          }))                                                                 \
-    }));                                                                      \
-  }
-
 namespace {
 std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
 mem_efficient_attention_backward_cutlass(
@@ -175,11 +118,32 @@ mem_efficient_attention_backward_cutlass(
   const bool use_dropout = std::fpclassify(dropout_p) != FP_ZERO;
   at::PhiloxCudaState rng_engine_inputs(rng_seed, rng_offset);
-  auto launchKernel = [&](auto _k, int computeCapability) {
+  bool kernel_launched = false;
+  const auto maxK = std::max(query.size(3), value.size(3));
+
+  auto launchKernel = [&](auto _k, auto kernel_fn) {
     using Kernel = decltype(_k);
     using scalar_t = typename Kernel::scalar_t;
     (void)_k;
+    if (kernel_launched) {
+      return;
+    }
+    // Check if this kernel is compatible
+    if (Kernel::kMaxK < maxK) {
+      return;
+    }
+    if (use_dropout && !Kernel::kApplyDropout) {
+      return;
+    }
+    // Alignment
+    if ((query.stride(2) % Kernel::kMinimumAlignment) ||
+        (key.stride(2) % Kernel::kMinimumAlignment) ||
+        (value.stride(2) % Kernel::kMinimumAlignment)) {
+      return;
+    }
+
+    kernel_launched = true;
     size_t smem_bytes = sizeof(typename Kernel::SharedStorage);
     // TODO: Fuse this into a kernel?
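Aside on the new dispatch flow replacing the DISPATCH_MAXK / DISPATCH_KERNEL macros: dispatch_cutlassB (declared in the generated kernels/cutlassB.h) invokes launchKernel once per pre-built AttentionBackwardKernel specialization, passing the specialization and its __global__ entry point; launchKernel returns early for any specialization that cannot handle the problem, so the first compatible kernel is the one launched. A minimal, self-contained sketch of that first-compatible-wins pattern (KernelTraits, dispatch_all and the numbers are hypothetical stand-ins, and the kernel_fn argument is dropped for brevity):

// Sketch only: KernelTraits and dispatch_all stand in for the generated
// AttentionBackwardKernel specializations and dispatch_cutlassB_* functions.
#include <cstdint>
#include <cstdio>

struct KernelTraits {
  int kMaxK;              // largest supported head dim
  bool kApplyDropout;     // compiled with dropout support?
  int kMinimumAlignment;  // required stride alignment (elements)
};

template <typename Callback>
void dispatch_all(Callback cb) {
  // The generated dispatcher enumerates every compiled specialization like this.
  cb(KernelTraits{64, false, 8});
  cb(KernelTraits{128, false, 8});
  cb(KernelTraits{65536, true, 8});
}

int main() {
  const int64_t maxK = 96;  // max(query.size(3), value.size(3))
  const bool use_dropout = false;
  const int64_t q_stride = 96;

  bool kernel_launched = false;
  auto launchKernel = [&](KernelTraits k) {
    if (kernel_launched) {
      return;  // a compatible kernel was already picked: first match wins
    }
    if (k.kMaxK < maxK) {
      return;  // head dim too large for this specialization
    }
    if (use_dropout && !k.kApplyDropout) {
      return;  // dropout requested but not compiled in
    }
    if (q_stride % k.kMinimumAlignment) {
      return;  // alignment requirement not met
    }
    kernel_launched = true;
    std::printf("launching specialization with kMaxK=%d\n", k.kMaxK);
  };
  dispatch_all(launchKernel);
  if (!kernel_launched) {
    std::printf("no kernel found to launch!\n");
  }
  return 0;
}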
@@ -290,14 +254,16 @@ mem_efficient_attention_backward_cutlass( } Kernel::check_supported(p); - constexpr auto kernel_fn = attention_kernel_backward_batched; - if (smem_bytes > 0xc000) { - TORCH_INTERNAL_ASSERT( - computeCapability >= 70, - "This kernel requires too much shared memory on this machine!"); - AT_CUDA_CHECK(cudaFuncSetAttribute( - kernel_fn, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_bytes)); + // https://docs.nvidia.com/cuda/cuda-c-programming-guide/#features-and-technical-specifications-technical-specifications-per-compute-capability + auto err = cudaFuncSetAttribute( + kernel_fn, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_bytes); + XFORMERS_CHECK( + err != cudaErrorInvalidValue, + "This GPU does not have enough shared-memory (kernel requires ", + smem_bytes / 1024, + " kb)"); + AT_CUDA_CHECK(err); } // second syntax resulted in the error below on windows @@ -323,13 +289,17 @@ mem_efficient_attention_backward_cutlass( kernel_fn<<>>(p); }; - DISPATCH_KERNEL(query, key, value, use_dropout, ([&] { - launchKernel(Kernel{}, computeCapability); - })); + cudaDeviceProp* p = at::cuda::getDeviceProperties(query.device().index()); + const int computeCapability = p->major * 10 + p->minor; + + DISPATCH_TYPES(query, ([&]() { + dispatch_cutlassB(launchKernel, computeCapability); + })); + TORCH_CHECK(kernel_launched, "cutlassB: no kernel found to launch!"); AT_CUDA_CHECK(cudaGetLastError()); return std::make_tuple(grad_q, grad_k, grad_v, grad_bias); #endif -} // namespace +} } // namespace diff --git a/xformers/csrc/attention/cuda/fmha/attention_forward_generic.cu b/xformers/csrc/attention/cuda/fmha/attention_forward_generic.cu index 7fecc4e47e..9119ed73db 100644 --- a/xformers/csrc/attention/cuda/fmha/attention_forward_generic.cu +++ b/xformers/csrc/attention/cuda/fmha/attention_forward_generic.cu @@ -13,69 +13,9 @@ #include #include "kernel_forward.h" +#include "kernels/cutlassF.h" #include "pytorch_utils.h" -#define DISPATCH_BLOCKSIZE(VALUE_HEAD_DIM, FN) \ - { \ - if (VALUE_HEAD_DIM <= 64) { \ - constexpr bool kIs64x64 = true; \ - constexpr bool kSingleValueIteration = true; \ - FN(); \ - } else { \ - constexpr bool kIs64x64 = false; \ - if (VALUE_HEAD_DIM <= 128) { \ - constexpr bool kSingleValueIteration = true; \ - FN(); \ - } else { \ - constexpr bool kSingleValueIteration = false; \ - FN(); \ - } \ - } \ - } - -#define DISPATCH_KERNEL(QUERY, KEY, VALUE, FUNC) \ - { \ - cudaDeviceProp* properties = \ - at::cuda::getDeviceProperties(QUERY.device().index()); \ - const int computeCapability = properties->major * 10 + properties->minor; \ - DISPATCH_BLOCKSIZE( \ - VALUE.size(-1), ([&]() { \ - static constexpr int64_t kQueriesPerBlock = kIs64x64 ? 64 : 32; \ - static constexpr int64_t kKeysPerBlock = kIs64x64 ? 64 : 128; \ - DISPATCH_TYPES( \ - QUERY, ([&]() { \ - DISPATCH_ARCHTAG( \ - computeCapability, ([&]() { \ - using AlignedAK = AttentionKernel< \ - scalar_t, \ - ArchTag, \ - true, \ - kQueriesPerBlock, \ - kKeysPerBlock, \ - kSingleValueIteration>; \ - /* Run a more efficient kernel (with `isAligned=True`) \ - if memory is correctly aligned*/ \ - bool isAligned = \ - (QUERY.stride(2) % AlignedAK::kAlignmentQ == 0 && \ - KEY.stride(2) % AlignedAK::kAlignmentK == 0 && \ - VALUE.stride(2) % AlignedAK::kAlignmentV == 0); \ - /* TODO: Should we warn or log somewhere when we use a \ - less efficient kernel due to wrong alignment? 
*/ \ - DISPATCH_BOOL(isAligned, kIsAligned, ([&]() { \ - using Kernel = AttentionKernel< \ - scalar_t, \ - ArchTag, \ - kIsAligned, \ - kQueriesPerBlock, \ - kKeysPerBlock, \ - kSingleValueIteration>; \ - FUNC(); \ - })) \ - })) \ - })); \ - })); \ - } - namespace { template struct TypeTraits; @@ -225,11 +165,34 @@ efficient_attention_forward_cutlass( rng_engine_inputs = gen->philox_cuda_state(B * num_heads * M * N); } - auto launchKernel = [&](auto _k, int computeCapability) { + bool kernel_launched = false; + auto launchKernel = [&](auto _k, auto kernel_fn) { using Kernel = decltype(_k); using scalar_t = typename Kernel::scalar_t; (void)_k; + if (kernel_launched) { + return; + } + // Check if this kernel is compatible + if (!Kernel::kSupportsDropout && use_dropout) { + return; + } + if (!Kernel::kSupportsBias && bias.has_value()) { + return; + } + if (Kernel::kSingleValueIteration && + Kernel::kKeysPerBlock < value.size(3)) { + return; + } + // Alignment + if ((query.stride(2) % Kernel::kAlignmentQ) || + (key.stride(2) % Kernel::kAlignmentK) || + (value.stride(2) % Kernel::kAlignmentV)) { + return; + } + kernel_launched = true; + res = at::empty( {B, M, num_heads, Kv}, query.options().dtype( @@ -311,23 +274,29 @@ efficient_attention_forward_cutlass( p.dropout_prob = dropout_p; } - constexpr auto kernel_fn = attention_kernel_batched; size_t smem_bytes = sizeof(typename Kernel::SharedStorage); if (smem_bytes > 0xc000) { - TORCH_INTERNAL_ASSERT( - computeCapability >= 70, - "This kernel requires too much shared memory on this machine!"); - AT_CUDA_CHECK(cudaFuncSetAttribute( - kernel_fn, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_bytes)); + auto err = cudaFuncSetAttribute( + kernel_fn, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_bytes); + XFORMERS_CHECK( + err != cudaErrorInvalidValue, + "This GPU does not have enough shared-memory (kernel requires ", + smem_bytes / 1024, + " kb)"); + AT_CUDA_CHECK(err); } Kernel::check_supported(p); kernel_fn<<>>(p); }; + // Dispatch to the right kernel - DISPATCH_KERNEL(query, key, value, ([&]() { - launchKernel(Kernel{}, computeCapability); - })); + cudaDeviceProp* p = at::cuda::getDeviceProperties(query.device().index()); + const int computeCapability = p->major * 10 + p->minor; + DISPATCH_TYPES(query, ([&]() { + dispatch_cutlassF(launchKernel, computeCapability); + })); + TORCH_CHECK(kernel_launched, "cutlassF: no kernel found to launch!"); AT_CUDA_CHECK(cudaGetLastError()); // uint64_t -> int64_t bitwise casting as PyTorch don't support uint64_t diff --git a/xformers/csrc/attention/cuda/fmha/gemm/find_default_mma.h b/xformers/csrc/attention/cuda/fmha/gemm/find_default_mma.h index 387bece3e7..435e994a0f 100644 --- a/xformers/csrc/attention/cuda/fmha/gemm/find_default_mma.h +++ b/xformers/csrc/attention/cuda/fmha/gemm/find_default_mma.h @@ -11,6 +11,8 @@ This is really only for the FastF32 case - aka using TensorCores with fp32. 
*/ +#pragma once + #include "cutlass/gemm/threadblock/default_mma.h" #include "cutlass/gemm/threadblock/default_mma_core_simt.h" #include "cutlass/gemm/threadblock/default_mma_core_sm70.h" diff --git a/xformers/csrc/attention/cuda/fmha/kernel_backward.h b/xformers/csrc/attention/cuda/fmha/kernel_backward.h index e0c03e4aaf..74cf21d936 100644 --- a/xformers/csrc/attention/cuda/fmha/kernel_backward.h +++ b/xformers/csrc/attention/cuda/fmha/kernel_backward.h @@ -160,9 +160,9 @@ template < // run optimized kernel because memory accesses will be aligned bool kIsAligned_, // use dropout if enabled - bool kApplyDropout, + bool kApplyDropout_, // upperbound on `max(value.shape[-1], query.shape[-1])` - int kMaxK = std::numeric_limits::max()> + int kMaxK_ = std::numeric_limits::max()> struct AttentionBackwardKernel { using scalar_t = scalar_t_; using output_t = scalar_t; @@ -171,6 +171,8 @@ struct AttentionBackwardKernel { using accum_t = float; using ArchTag = ArchTag_; static constexpr bool kIsAligned = kIsAligned_; + static constexpr bool kApplyDropout = kApplyDropout_; + static constexpr int kMaxK = kMaxK_; struct Params { // Input tensors @@ -263,7 +265,7 @@ struct AttentionBackwardKernel { int64_t gV_strideH; int64_t gB_strideH; - CUTLASS_DEVICE void advance_to_block() { + CUTLASS_DEVICE bool advance_to_block() { int64_t batch_id = blockIdx.z; int32_t head_id = blockIdx.y; @@ -325,6 +327,8 @@ struct AttentionBackwardKernel { } else { workspace = nullptr; } + + return true; } __host__ dim3 getBlocksGrid() const { @@ -1041,7 +1045,7 @@ struct AttentionBackwardKernel { return true; } - static CUTLASS_DEVICE void kernel(Params const& p) { + static CUTLASS_DEVICE void attention_kernel(Params const& p) { extern __shared__ char smem_buffer[]; SharedStorage& shared_storage = *((SharedStorage*)smem_buffer); @@ -2084,7 +2088,9 @@ struct AttentionBackwardKernel { template __global__ void __launch_bounds__(AK::kNumThreads, AK::kMinBlocksPerSm) attention_kernel_backward_batched_impl(typename AK::Params p) { - p.advance_to_block(); + if (!p.advance_to_block()) { + return; + } AK::attention_kernel(p); } diff --git a/xformers/csrc/attention/cuda/fmha/kernel_forward.h b/xformers/csrc/attention/cuda/fmha/kernel_forward.h index d49254e653..a786ec14aa 100644 --- a/xformers/csrc/attention/cuda/fmha/kernel_forward.h +++ b/xformers/csrc/attention/cuda/fmha/kernel_forward.h @@ -1,3 +1,5 @@ +#pragma once + #ifdef HAS_PYTORCH #include #include @@ -67,12 +69,12 @@ template < // If Q/K/V are correctly aligned in memory and we can run a fast kernel bool isAligned_, int kQueriesPerBlock, - int kKeysPerBlock, - bool kSingleValueIteration, // = `value.shape[-1] <= kKeysPerBlock` + int kKeysPerBlock_, + bool kSingleValueIteration_, // = `value.shape[-1] <= kKeysPerBlock` // This is quite slower on V100 for some reason // Set to false if you know at compile-time you will never need dropout - bool kSupportsDropout = true, - bool kSupportsBias = true> + bool kSupportsDropout_ = true, + bool kSupportsBias_ = true> struct AttentionKernel { using scalar_t = scalar_t_; using accum_t = float; @@ -82,7 +84,11 @@ struct AttentionKernel { // Using `accum_t` improves perf on f16 at the cost of // numerical errors using output_accum_t = accum_t; + static constexpr bool kSupportsDropout = kSupportsDropout_; + static constexpr bool kSupportsBias = kSupportsBias_; + static constexpr int kKeysPerBlock = kKeysPerBlock_; static constexpr bool kIsAligned = isAligned_; + static constexpr bool kSingleValueIteration = kSingleValueIteration_; static 
constexpr int32_t kAlignLSE = 32; // block size of backward static constexpr bool kPreloadV = ArchTag::kMinComputeCapability >= 80 && cutlass::sizeof_bits::value == 16; diff --git a/xformers/csrc/attention/cuda/fmha/kernels/backward.h b/xformers/csrc/attention/cuda/fmha/kernels/backward.h deleted file mode 100644 index be59536f57..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/backward.h +++ /dev/null @@ -1,68 +0,0 @@ -#pragma once - -// All kernels are disabled by default -#define INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM50(...) \ - INSTANTIATE_ATTENTION_KERNEL_BACKWARD_DISABLED(50, __VA_ARGS__) -#define INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM70(...) \ - INSTANTIATE_ATTENTION_KERNEL_BACKWARD_DISABLED(70, __VA_ARGS__) -#define INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM75(...) \ - INSTANTIATE_ATTENTION_KERNEL_BACKWARD_DISABLED(75, __VA_ARGS__) -#define INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM80(...) \ - INSTANTIATE_ATTENTION_KERNEL_BACKWARD_DISABLED(80, __VA_ARGS__) - -#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD -#include "../kernel_backward.h" - -#define _ATTENTION_KERNEL_BACKWARD_BEGIN(...) \ - template <> \ - __global__ void __launch_bounds__( \ - __VA_ARGS__::kNumThreads, __VA_ARGS__::kMinBlocksPerSm) \ - attention_kernel_backward_batched<__VA_ARGS__>( \ - typename __VA_ARGS__::Params p) { \ - using Kernel = __VA_ARGS__; -#define _ATTENTION_KERNEL_BACKWARD_END() } - -#define INSTANTIATE_ATTENTION_KERNEL_BACKWARD(ARCH, ...) \ - _ATTENTION_KERNEL_BACKWARD_BEGIN( \ - AttentionBackwardKernel) \ - p.advance_to_block(); \ - Kernel::kernel(p); \ - _ATTENTION_KERNEL_BACKWARD_END(); - -#ifdef __CUDA_ARCH__ -#define __CUDA_ARCH_OR_ZERO__ __CUDA_ARCH__ -#else -#define __CUDA_ARCH_OR_ZERO__ 0 -#endif - -#define INSTANTIATE_ATTENTION_KERNEL_BACKWARD_DISABLED(ARCH, ...) \ - _ATTENTION_KERNEL_BACKWARD_BEGIN( \ - AttentionBackwardKernel) \ - printf( \ - "FATAL: this function is for sm%d, but was built with __CUDA_ARCH__=%d\n", \ - int(ARCH), \ - int(__CUDA_ARCH_OR_ZERO__)); \ - _ATTENTION_KERNEL_BACKWARD_END(); - -// Enable the right one based on __CUDA_ARCH__ -#ifndef __CUDA_ARCH__ -#elif __CUDA_ARCH__ < 500 -#error "Need cuda arch at least 5.0" -#elif __CUDA_ARCH__ < 700 -#undef INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM50 -#define INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM50(...) \ - INSTANTIATE_ATTENTION_KERNEL_BACKWARD(50, __VA_ARGS__) -#elif __CUDA_ARCH__ < 750 -#undef INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM70 -#define INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM70(...) \ - INSTANTIATE_ATTENTION_KERNEL_BACKWARD(70, __VA_ARGS__) -#elif __CUDA_ARCH__ < 800 -#undef INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM75 -#define INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM75(...) \ - INSTANTIATE_ATTENTION_KERNEL_BACKWARD(75, __VA_ARGS__) -#elif __CUDA_ARCH__ >= 800 -#undef INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM80 -#define INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM80(...) \ - INSTANTIATE_ATTENTION_KERNEL_BACKWARD(80, __VA_ARGS__) -#endif -#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/backward_bf16.cu b/xformers/csrc/attention/cuda/fmha/kernels/backward_bf16.cu deleted file mode 100644 index b3e2788ed9..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/backward_bf16.cu +++ /dev/null @@ -1,6 +0,0 @@ -// This file is auto-generated. 
See "generate_kernels.sh" -#include "backward.h" -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM50(cutlass::bfloat16_t, false, false); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM70(cutlass::bfloat16_t, false, false); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM75(cutlass::bfloat16_t, false, false); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM80(cutlass::bfloat16_t, false, false); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/backward_bf16_aligned.cu b/xformers/csrc/attention/cuda/fmha/kernels/backward_bf16_aligned.cu deleted file mode 100644 index c8e2146181..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/backward_bf16_aligned.cu +++ /dev/null @@ -1,6 +0,0 @@ -// This file is auto-generated. See "generate_kernels.sh" -#include "backward.h" -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM50(cutlass::bfloat16_t, true, false); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM70(cutlass::bfloat16_t, true, false); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM75(cutlass::bfloat16_t, true, false); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM80(cutlass::bfloat16_t, true, false); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/backward_bf16_aligned_dropout.cu b/xformers/csrc/attention/cuda/fmha/kernels/backward_bf16_aligned_dropout.cu deleted file mode 100644 index d894c8ddcf..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/backward_bf16_aligned_dropout.cu +++ /dev/null @@ -1,6 +0,0 @@ -// This file is auto-generated. See "generate_kernels.sh" -#include "backward.h" -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM50(cutlass::bfloat16_t, true, true); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM70(cutlass::bfloat16_t, true, true); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM75(cutlass::bfloat16_t, true, true); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM80(cutlass::bfloat16_t, true, true); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/backward_bf16_aligned_dropout_k128.cu b/xformers/csrc/attention/cuda/fmha/kernels/backward_bf16_aligned_dropout_k128.cu deleted file mode 100644 index 5d88becfd7..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/backward_bf16_aligned_dropout_k128.cu +++ /dev/null @@ -1,22 +0,0 @@ -// This file is auto-generated. See "generate_kernels.sh" -#include "backward.h" -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM50( - cutlass::bfloat16_t, - true, - true, - 128); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM70( - cutlass::bfloat16_t, - true, - true, - 128); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM75( - cutlass::bfloat16_t, - true, - true, - 128); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM80( - cutlass::bfloat16_t, - true, - true, - 128); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/backward_bf16_aligned_dropout_k64.cu b/xformers/csrc/attention/cuda/fmha/kernels/backward_bf16_aligned_dropout_k64.cu deleted file mode 100644 index dc2eada660..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/backward_bf16_aligned_dropout_k64.cu +++ /dev/null @@ -1,6 +0,0 @@ -// This file is auto-generated. 
See "generate_kernels.sh" -#include "backward.h" -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM50(cutlass::bfloat16_t, true, true, 64); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM70(cutlass::bfloat16_t, true, true, 64); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM75(cutlass::bfloat16_t, true, true, 64); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM80(cutlass::bfloat16_t, true, true, 64); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/backward_bf16_aligned_k128.cu b/xformers/csrc/attention/cuda/fmha/kernels/backward_bf16_aligned_k128.cu deleted file mode 100644 index 753b410b23..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/backward_bf16_aligned_k128.cu +++ /dev/null @@ -1,22 +0,0 @@ -// This file is auto-generated. See "generate_kernels.sh" -#include "backward.h" -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM50( - cutlass::bfloat16_t, - true, - false, - 128); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM70( - cutlass::bfloat16_t, - true, - false, - 128); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM75( - cutlass::bfloat16_t, - true, - false, - 128); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM80( - cutlass::bfloat16_t, - true, - false, - 128); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/backward_bf16_aligned_k64.cu b/xformers/csrc/attention/cuda/fmha/kernels/backward_bf16_aligned_k64.cu deleted file mode 100644 index 30dbf897d2..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/backward_bf16_aligned_k64.cu +++ /dev/null @@ -1,22 +0,0 @@ -// This file is auto-generated. See "generate_kernels.sh" -#include "backward.h" -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM50( - cutlass::bfloat16_t, - true, - false, - 64); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM70( - cutlass::bfloat16_t, - true, - false, - 64); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM75( - cutlass::bfloat16_t, - true, - false, - 64); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM80( - cutlass::bfloat16_t, - true, - false, - 64); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/backward_bf16_dropout.cu b/xformers/csrc/attention/cuda/fmha/kernels/backward_bf16_dropout.cu deleted file mode 100644 index 6846be1d3c..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/backward_bf16_dropout.cu +++ /dev/null @@ -1,6 +0,0 @@ -// This file is auto-generated. See "generate_kernels.sh" -#include "backward.h" -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM50(cutlass::bfloat16_t, false, true); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM70(cutlass::bfloat16_t, false, true); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM75(cutlass::bfloat16_t, false, true); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM80(cutlass::bfloat16_t, false, true); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/backward_bf16_dropout_k128.cu b/xformers/csrc/attention/cuda/fmha/kernels/backward_bf16_dropout_k128.cu deleted file mode 100644 index 51d8f507e9..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/backward_bf16_dropout_k128.cu +++ /dev/null @@ -1,22 +0,0 @@ -// This file is auto-generated. 
See "generate_kernels.sh" -#include "backward.h" -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM50( - cutlass::bfloat16_t, - false, - true, - 128); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM70( - cutlass::bfloat16_t, - false, - true, - 128); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM75( - cutlass::bfloat16_t, - false, - true, - 128); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM80( - cutlass::bfloat16_t, - false, - true, - 128); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/backward_bf16_dropout_k64.cu b/xformers/csrc/attention/cuda/fmha/kernels/backward_bf16_dropout_k64.cu deleted file mode 100644 index d8d6db9ccb..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/backward_bf16_dropout_k64.cu +++ /dev/null @@ -1,22 +0,0 @@ -// This file is auto-generated. See "generate_kernels.sh" -#include "backward.h" -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM50( - cutlass::bfloat16_t, - false, - true, - 64); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM70( - cutlass::bfloat16_t, - false, - true, - 64); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM75( - cutlass::bfloat16_t, - false, - true, - 64); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM80( - cutlass::bfloat16_t, - false, - true, - 64); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/backward_bf16_k128.cu b/xformers/csrc/attention/cuda/fmha/kernels/backward_bf16_k128.cu deleted file mode 100644 index b7de6e20f8..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/backward_bf16_k128.cu +++ /dev/null @@ -1,22 +0,0 @@ -// This file is auto-generated. See "generate_kernels.sh" -#include "backward.h" -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM50( - cutlass::bfloat16_t, - false, - false, - 128); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM70( - cutlass::bfloat16_t, - false, - false, - 128); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM75( - cutlass::bfloat16_t, - false, - false, - 128); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM80( - cutlass::bfloat16_t, - false, - false, - 128); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/backward_bf16_k64.cu b/xformers/csrc/attention/cuda/fmha/kernels/backward_bf16_k64.cu deleted file mode 100644 index 2e8a1e3586..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/backward_bf16_k64.cu +++ /dev/null @@ -1,22 +0,0 @@ -// This file is auto-generated. See "generate_kernels.sh" -#include "backward.h" -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM50( - cutlass::bfloat16_t, - false, - false, - 64); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM70( - cutlass::bfloat16_t, - false, - false, - 64); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM75( - cutlass::bfloat16_t, - false, - false, - 64); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM80( - cutlass::bfloat16_t, - false, - false, - 64); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/backward_f16.cu b/xformers/csrc/attention/cuda/fmha/kernels/backward_f16.cu deleted file mode 100644 index 436d6e70ef..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/backward_f16.cu +++ /dev/null @@ -1,6 +0,0 @@ -// This file is auto-generated. 
See "generate_kernels.sh" -#include "backward.h" -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM50(cutlass::half_t, false, false); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM70(cutlass::half_t, false, false); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM75(cutlass::half_t, false, false); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM80(cutlass::half_t, false, false); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/backward_f16_aligned.cu b/xformers/csrc/attention/cuda/fmha/kernels/backward_f16_aligned.cu deleted file mode 100644 index ec07a70b90..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/backward_f16_aligned.cu +++ /dev/null @@ -1,6 +0,0 @@ -// This file is auto-generated. See "generate_kernels.sh" -#include "backward.h" -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM50(cutlass::half_t, true, false); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM70(cutlass::half_t, true, false); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM75(cutlass::half_t, true, false); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM80(cutlass::half_t, true, false); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/backward_f16_aligned_dropout.cu b/xformers/csrc/attention/cuda/fmha/kernels/backward_f16_aligned_dropout.cu deleted file mode 100644 index 46a018ce7d..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/backward_f16_aligned_dropout.cu +++ /dev/null @@ -1,6 +0,0 @@ -// This file is auto-generated. See "generate_kernels.sh" -#include "backward.h" -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM50(cutlass::half_t, true, true); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM70(cutlass::half_t, true, true); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM75(cutlass::half_t, true, true); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM80(cutlass::half_t, true, true); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/backward_f16_aligned_dropout_k128.cu b/xformers/csrc/attention/cuda/fmha/kernels/backward_f16_aligned_dropout_k128.cu deleted file mode 100644 index 796b592cff..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/backward_f16_aligned_dropout_k128.cu +++ /dev/null @@ -1,6 +0,0 @@ -// This file is auto-generated. See "generate_kernels.sh" -#include "backward.h" -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM50(cutlass::half_t, true, true, 128); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM70(cutlass::half_t, true, true, 128); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM75(cutlass::half_t, true, true, 128); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM80(cutlass::half_t, true, true, 128); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/backward_f16_aligned_dropout_k64.cu b/xformers/csrc/attention/cuda/fmha/kernels/backward_f16_aligned_dropout_k64.cu deleted file mode 100644 index 04348458d0..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/backward_f16_aligned_dropout_k64.cu +++ /dev/null @@ -1,6 +0,0 @@ -// This file is auto-generated. 
See "generate_kernels.sh" -#include "backward.h" -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM50(cutlass::half_t, true, true, 64); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM70(cutlass::half_t, true, true, 64); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM75(cutlass::half_t, true, true, 64); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM80(cutlass::half_t, true, true, 64); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/backward_f16_aligned_k128.cu b/xformers/csrc/attention/cuda/fmha/kernels/backward_f16_aligned_k128.cu deleted file mode 100644 index e7b6c0fedc..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/backward_f16_aligned_k128.cu +++ /dev/null @@ -1,6 +0,0 @@ -// This file is auto-generated. See "generate_kernels.sh" -#include "backward.h" -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM50(cutlass::half_t, true, false, 128); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM70(cutlass::half_t, true, false, 128); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM75(cutlass::half_t, true, false, 128); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM80(cutlass::half_t, true, false, 128); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/backward_f16_aligned_k64.cu b/xformers/csrc/attention/cuda/fmha/kernels/backward_f16_aligned_k64.cu deleted file mode 100644 index dca706d51b..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/backward_f16_aligned_k64.cu +++ /dev/null @@ -1,6 +0,0 @@ -// This file is auto-generated. See "generate_kernels.sh" -#include "backward.h" -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM50(cutlass::half_t, true, false, 64); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM70(cutlass::half_t, true, false, 64); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM75(cutlass::half_t, true, false, 64); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM80(cutlass::half_t, true, false, 64); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/backward_f16_dropout.cu b/xformers/csrc/attention/cuda/fmha/kernels/backward_f16_dropout.cu deleted file mode 100644 index 8212bc91e2..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/backward_f16_dropout.cu +++ /dev/null @@ -1,6 +0,0 @@ -// This file is auto-generated. See "generate_kernels.sh" -#include "backward.h" -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM50(cutlass::half_t, false, true); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM70(cutlass::half_t, false, true); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM75(cutlass::half_t, false, true); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM80(cutlass::half_t, false, true); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/backward_f16_dropout_k128.cu b/xformers/csrc/attention/cuda/fmha/kernels/backward_f16_dropout_k128.cu deleted file mode 100644 index 55bf42cb96..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/backward_f16_dropout_k128.cu +++ /dev/null @@ -1,6 +0,0 @@ -// This file is auto-generated. See "generate_kernels.sh" -#include "backward.h" -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM50(cutlass::half_t, false, true, 128); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM70(cutlass::half_t, false, true, 128); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM75(cutlass::half_t, false, true, 128); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM80(cutlass::half_t, false, true, 128); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/backward_f16_dropout_k64.cu b/xformers/csrc/attention/cuda/fmha/kernels/backward_f16_dropout_k64.cu deleted file mode 100644 index 6462ce1419..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/backward_f16_dropout_k64.cu +++ /dev/null @@ -1,6 +0,0 @@ -// This file is auto-generated. 
See "generate_kernels.sh" -#include "backward.h" -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM50(cutlass::half_t, false, true, 64); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM70(cutlass::half_t, false, true, 64); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM75(cutlass::half_t, false, true, 64); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM80(cutlass::half_t, false, true, 64); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/backward_f16_k128.cu b/xformers/csrc/attention/cuda/fmha/kernels/backward_f16_k128.cu deleted file mode 100644 index 4527a8e8fe..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/backward_f16_k128.cu +++ /dev/null @@ -1,6 +0,0 @@ -// This file is auto-generated. See "generate_kernels.sh" -#include "backward.h" -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM50(cutlass::half_t, false, false, 128); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM70(cutlass::half_t, false, false, 128); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM75(cutlass::half_t, false, false, 128); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM80(cutlass::half_t, false, false, 128); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/backward_f16_k64.cu b/xformers/csrc/attention/cuda/fmha/kernels/backward_f16_k64.cu deleted file mode 100644 index 8f94d9f935..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/backward_f16_k64.cu +++ /dev/null @@ -1,6 +0,0 @@ -// This file is auto-generated. See "generate_kernels.sh" -#include "backward.h" -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM50(cutlass::half_t, false, false, 64); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM70(cutlass::half_t, false, false, 64); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM75(cutlass::half_t, false, false, 64); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM80(cutlass::half_t, false, false, 64); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/backward_f32.cu b/xformers/csrc/attention/cuda/fmha/kernels/backward_f32.cu deleted file mode 100644 index bf25e3127a..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/backward_f32.cu +++ /dev/null @@ -1,6 +0,0 @@ -// This file is auto-generated. See "generate_kernels.sh" -#include "backward.h" -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM50(float, false, false); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM70(float, false, false); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM75(float, false, false); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM80(float, false, false); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/backward_f32_aligned.cu b/xformers/csrc/attention/cuda/fmha/kernels/backward_f32_aligned.cu deleted file mode 100644 index 22c3412b62..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/backward_f32_aligned.cu +++ /dev/null @@ -1,6 +0,0 @@ -// This file is auto-generated. See "generate_kernels.sh" -#include "backward.h" -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM50(float, true, false); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM70(float, true, false); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM75(float, true, false); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM80(float, true, false); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/backward_f32_aligned_dropout.cu b/xformers/csrc/attention/cuda/fmha/kernels/backward_f32_aligned_dropout.cu deleted file mode 100644 index 3fbe4860cc..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/backward_f32_aligned_dropout.cu +++ /dev/null @@ -1,6 +0,0 @@ -// This file is auto-generated. 
See "generate_kernels.sh" -#include "backward.h" -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM50(float, true, true); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM70(float, true, true); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM75(float, true, true); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM80(float, true, true); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/backward_f32_aligned_dropout_k128.cu b/xformers/csrc/attention/cuda/fmha/kernels/backward_f32_aligned_dropout_k128.cu deleted file mode 100644 index 0096a0a429..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/backward_f32_aligned_dropout_k128.cu +++ /dev/null @@ -1,6 +0,0 @@ -// This file is auto-generated. See "generate_kernels.sh" -#include "backward.h" -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM50(float, true, true, 128); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM70(float, true, true, 128); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM75(float, true, true, 128); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM80(float, true, true, 128); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/backward_f32_aligned_dropout_k64.cu b/xformers/csrc/attention/cuda/fmha/kernels/backward_f32_aligned_dropout_k64.cu deleted file mode 100644 index 9c5f8e051f..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/backward_f32_aligned_dropout_k64.cu +++ /dev/null @@ -1,6 +0,0 @@ -// This file is auto-generated. See "generate_kernels.sh" -#include "backward.h" -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM50(float, true, true, 64); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM70(float, true, true, 64); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM75(float, true, true, 64); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM80(float, true, true, 64); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/backward_f32_aligned_k128.cu b/xformers/csrc/attention/cuda/fmha/kernels/backward_f32_aligned_k128.cu deleted file mode 100644 index 8364faf2c2..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/backward_f32_aligned_k128.cu +++ /dev/null @@ -1,6 +0,0 @@ -// This file is auto-generated. See "generate_kernels.sh" -#include "backward.h" -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM50(float, true, false, 128); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM70(float, true, false, 128); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM75(float, true, false, 128); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM80(float, true, false, 128); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/backward_f32_aligned_k64.cu b/xformers/csrc/attention/cuda/fmha/kernels/backward_f32_aligned_k64.cu deleted file mode 100644 index 8b06e82557..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/backward_f32_aligned_k64.cu +++ /dev/null @@ -1,6 +0,0 @@ -// This file is auto-generated. See "generate_kernels.sh" -#include "backward.h" -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM50(float, true, false, 64); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM70(float, true, false, 64); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM75(float, true, false, 64); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM80(float, true, false, 64); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/backward_f32_dropout.cu b/xformers/csrc/attention/cuda/fmha/kernels/backward_f32_dropout.cu deleted file mode 100644 index e41474c765..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/backward_f32_dropout.cu +++ /dev/null @@ -1,6 +0,0 @@ -// This file is auto-generated. 
See "generate_kernels.sh" -#include "backward.h" -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM50(float, false, true); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM70(float, false, true); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM75(float, false, true); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM80(float, false, true); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/backward_f32_dropout_k128.cu b/xformers/csrc/attention/cuda/fmha/kernels/backward_f32_dropout_k128.cu deleted file mode 100644 index f98faec8ea..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/backward_f32_dropout_k128.cu +++ /dev/null @@ -1,6 +0,0 @@ -// This file is auto-generated. See "generate_kernels.sh" -#include "backward.h" -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM50(float, false, true, 128); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM70(float, false, true, 128); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM75(float, false, true, 128); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM80(float, false, true, 128); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/backward_f32_dropout_k64.cu b/xformers/csrc/attention/cuda/fmha/kernels/backward_f32_dropout_k64.cu deleted file mode 100644 index 5246f6240a..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/backward_f32_dropout_k64.cu +++ /dev/null @@ -1,6 +0,0 @@ -// This file is auto-generated. See "generate_kernels.sh" -#include "backward.h" -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM50(float, false, true, 64); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM70(float, false, true, 64); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM75(float, false, true, 64); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM80(float, false, true, 64); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/backward_f32_k128.cu b/xformers/csrc/attention/cuda/fmha/kernels/backward_f32_k128.cu deleted file mode 100644 index c743b1c1d0..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/backward_f32_k128.cu +++ /dev/null @@ -1,6 +0,0 @@ -// This file is auto-generated. See "generate_kernels.sh" -#include "backward.h" -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM50(float, false, false, 128); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM70(float, false, false, 128); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM75(float, false, false, 128); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM80(float, false, false, 128); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/backward_f32_k64.cu b/xformers/csrc/attention/cuda/fmha/kernels/backward_f32_k64.cu deleted file mode 100644 index f9e23eb47e..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/backward_f32_k64.cu +++ /dev/null @@ -1,6 +0,0 @@ -// This file is auto-generated. See "generate_kernels.sh" -#include "backward.h" -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM50(float, false, false, 64); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM70(float, false, false, 64); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM75(float, false, false, 64); -INSTANTIATE_ATTENTION_KERNEL_BACKWARD_SM80(float, false, false, 64); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB.h b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB.h new file mode 100644 index 0000000000..1818422fd4 --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB.h @@ -0,0 +1,803 @@ +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_backward.h" + +#pragma once +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// ======== f16 / sm50 ======== +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k32_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k64_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k128_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k65536_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k32_dropout_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k64_dropout_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k128_dropout_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k65536_dropout_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k32_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k64_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k128_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k65536_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k32_dropout_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k64_dropout_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k128_dropout_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k65536_dropout_sm50(typename AttentionBackwardKernel::Params p); + +template void dispatch_cutlassB_f16_sm50(T cb) { + 
cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_k32_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_k64_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_k128_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_k65536_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_k32_dropout_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_k64_dropout_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_k128_dropout_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_k65536_dropout_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_k32_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_k64_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_k128_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_k65536_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_k32_dropout_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_k64_dropout_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_k128_dropout_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_k65536_dropout_sm50); +} + +// ======== f32 / sm50 ======== +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k32_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k64_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k128_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k65536_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k32_dropout_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k64_dropout_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k128_dropout_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k65536_dropout_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k32_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k64_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k128_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + 
AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k65536_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k32_dropout_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k64_dropout_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k128_dropout_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k65536_dropout_sm50(typename AttentionBackwardKernel::Params p); + +template void dispatch_cutlassB_f32_sm50(T cb) { + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_k32_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_k64_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_k128_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_k65536_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_k32_dropout_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_k64_dropout_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_k128_dropout_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_k65536_dropout_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_k32_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_k64_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_k128_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_k65536_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_k32_dropout_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_k64_dropout_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_k128_dropout_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_k65536_dropout_sm50); +} + +// ======== f16 / sm70 ======== +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k32_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k64_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k128_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k65536_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k32_dropout_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k64_dropout_sm70(typename AttentionBackwardKernel::Params p); 
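Aside: each declaration here pairs with a definition in a generated cutlassB_*.cu file. Judging from attention_kernel_backward_batched_impl in kernel_backward.h above, such a definition is presumably a thin wrapper that advances to the block and runs the kernel. A hypothetical example; the names, the alias, and the template-argument order <ArchTag, scalar_t, kIsAligned, kApplyDropout, kMaxK> are assumptions reconstructed from the removed DISPATCH_KERNEL macro, not taken from the generated sources:

// Hypothetical sketch of one generated definition (not part of this diff).
#include "../kernel_backward.h"

using KernelSm70F16Aligned64 = AttentionBackwardKernel<
    cutlass::arch::Sm70, // ArchTag
    cutlass::half_t,     // scalar_t
    true,                // kIsAligned
    false,               // kApplyDropout
    64>;                 // kMaxK

__global__ void __launch_bounds__(
    KernelSm70F16Aligned64::kNumThreads,
    KernelSm70F16Aligned64::kMinBlocksPerSm)
fmha_cutlassB_f16_aligned_k64_sm70(typename KernelSm70F16Aligned64::Params p) {
  // Same body as attention_kernel_backward_batched_impl in kernel_backward.h;
  // the real generated file presumably also guards on __CUDA_ARCH__ the way
  // the deleted kernels/backward.h did.
  if (!p.advance_to_block()) {
    return;
  }
  KernelSm70F16Aligned64::attention_kernel(p);
}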
+__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k128_dropout_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k65536_dropout_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k32_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k64_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k128_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k65536_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k32_dropout_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k64_dropout_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k128_dropout_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k65536_dropout_sm70(typename AttentionBackwardKernel::Params p); + +template void dispatch_cutlassB_f16_sm70(T cb) { + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_k32_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_k64_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_k128_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_k65536_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_k32_dropout_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_k64_dropout_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_k128_dropout_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_k65536_dropout_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_k32_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_k64_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_k128_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_k65536_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_k32_dropout_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_k64_dropout_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_k128_dropout_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_k65536_dropout_sm70); +} + +// ======== f32 / sm70 ======== +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) 
+fmha_cutlassB_f32_aligned_k32_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k64_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k128_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k65536_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k32_dropout_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k64_dropout_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k128_dropout_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k65536_dropout_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k32_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k64_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k128_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k65536_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k32_dropout_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k64_dropout_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k128_dropout_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k65536_dropout_sm70(typename AttentionBackwardKernel::Params p); + +template void dispatch_cutlassB_f32_sm70(T cb) { + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_k32_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_k64_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_k128_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_k65536_sm70); + 
cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_k32_dropout_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_k64_dropout_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_k128_dropout_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_k65536_dropout_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_k32_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_k64_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_k128_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_k65536_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_k32_dropout_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_k64_dropout_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_k128_dropout_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_k65536_dropout_sm70); +} + +// ======== f16 / sm75 ======== +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k32_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k64_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k128_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k65536_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k32_dropout_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k64_dropout_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k128_dropout_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k65536_dropout_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k32_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k64_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k128_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k65536_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) 
+fmha_cutlassB_f16_notaligned_k32_dropout_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k64_dropout_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k128_dropout_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k65536_dropout_sm75(typename AttentionBackwardKernel::Params p); + +template void dispatch_cutlassB_f16_sm75(T cb) { + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_k32_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_k64_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_k128_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_k65536_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_k32_dropout_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_k64_dropout_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_k128_dropout_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_k65536_dropout_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_k32_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_k64_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_k128_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_k65536_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_k32_dropout_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_k64_dropout_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_k128_dropout_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_k65536_dropout_sm75); +} + +// ======== f32 / sm75 ======== +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k32_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k64_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k128_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k65536_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k32_dropout_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k64_dropout_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k128_dropout_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + 
AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k65536_dropout_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k32_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k64_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k128_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k65536_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k32_dropout_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k64_dropout_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k128_dropout_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k65536_dropout_sm75(typename AttentionBackwardKernel::Params p); + +template void dispatch_cutlassB_f32_sm75(T cb) { + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_k32_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_k64_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_k128_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_k65536_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_k32_dropout_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_k64_dropout_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_k128_dropout_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_k65536_dropout_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_k32_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_k64_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_k128_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_k65536_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_k32_dropout_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_k64_dropout_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_k128_dropout_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_k65536_dropout_sm75); +} + +// ======== bf16 / sm80 ======== +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_aligned_k32_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_aligned_k64_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + 
AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_aligned_k128_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_aligned_k65536_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_aligned_k32_dropout_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_aligned_k64_dropout_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_aligned_k128_dropout_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_aligned_k65536_dropout_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_notaligned_k32_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_notaligned_k64_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_notaligned_k128_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_notaligned_k65536_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_notaligned_k32_dropout_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_notaligned_k64_dropout_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_notaligned_k128_dropout_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_notaligned_k65536_dropout_sm80(typename AttentionBackwardKernel::Params p); + +template void dispatch_cutlassB_bf16_sm80(T cb) { + cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_aligned_k32_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_aligned_k64_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_aligned_k128_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_aligned_k65536_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_aligned_k32_dropout_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_aligned_k64_dropout_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_aligned_k128_dropout_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_aligned_k65536_dropout_sm80); + 
cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_notaligned_k32_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_notaligned_k64_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_notaligned_k128_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_notaligned_k65536_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_notaligned_k32_dropout_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_notaligned_k64_dropout_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_notaligned_k128_dropout_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_notaligned_k65536_dropout_sm80); +} + +// ======== f16 / sm80 ======== +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k32_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k64_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k128_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k65536_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k32_dropout_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k64_dropout_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k128_dropout_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k65536_dropout_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k32_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k64_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k128_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k65536_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k32_dropout_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k64_dropout_sm80(typename AttentionBackwardKernel::Params p); 
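+
+// NOTE (illustrative sketch, not produced by generate_kernels.py): each
+// dispatch_cutlassB_{dtype}_{arch} helper above simply enumerates every
+// compiled kernel variant for that dtype/arch pair, calling the callback with
+// a default-constructed kernel-traits object plus the matching __global__
+// function. Assuming the AttentionBackwardKernel specializations expose their
+// configuration as static constexpr members (e.g. kApplyDropout, alongside the
+// kNumThreads/kMinBlocksPerSm used in the declarations above), a host-side
+// callback can inspect the candidates before deciding which one to launch:
+inline int count_cutlassB_bf16_sm80_dropout_variants() {
+  int num_dropout_kernels = 0;
+  dispatch_cutlassB_bf16_sm80([&](auto kernel, auto kernel_fn) {
+    using Kernel = decltype(kernel);
+    (void)kernel_fn; // a real caller would hand this to a <<<...>>> launch
+    if (Kernel::kApplyDropout) { // compile-time trait of this specialization
+      ++num_dropout_kernels;
+    }
+  });
+  return num_dropout_kernels; // 8 of the 16 bf16/sm80 variants apply dropout
+}
+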
+__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k128_dropout_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k65536_dropout_sm80(typename AttentionBackwardKernel::Params p); + +template void dispatch_cutlassB_f16_sm80(T cb) { + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_k32_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_k64_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_k128_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_k65536_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_k32_dropout_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_k64_dropout_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_k128_dropout_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_k65536_dropout_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_k32_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_k64_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_k128_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_k65536_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_k32_dropout_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_k64_dropout_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_k128_dropout_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_k65536_dropout_sm80); +} + +// ======== f32 / sm80 ======== +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k32_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k64_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k128_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k65536_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k32_dropout_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k64_dropout_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k128_dropout_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k65536_dropout_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k32_sm80(typename 
AttentionBackwardKernel<cutlass::arch::Sm80, float, false, false, 32>::Params p);
+__global__ void __launch_bounds__(
+    AttentionBackwardKernel<cutlass::arch::Sm80, float, false, false, 64>::kNumThreads,
+    AttentionBackwardKernel<cutlass::arch::Sm80, float, false, false, 64>::kMinBlocksPerSm)
+fmha_cutlassB_f32_notaligned_k64_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, float, false, false, 64>::Params p);
+__global__ void __launch_bounds__(
+    AttentionBackwardKernel<cutlass::arch::Sm80, float, false, false, 128>::kNumThreads,
+    AttentionBackwardKernel<cutlass::arch::Sm80, float, false, false, 128>::kMinBlocksPerSm)
+fmha_cutlassB_f32_notaligned_k128_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, float, false, false, 128>::Params p);
+__global__ void __launch_bounds__(
+    AttentionBackwardKernel<cutlass::arch::Sm80, float, false, false, 65536>::kNumThreads,
+    AttentionBackwardKernel<cutlass::arch::Sm80, float, false, false, 65536>::kMinBlocksPerSm)
+fmha_cutlassB_f32_notaligned_k65536_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, float, false, false, 65536>::Params p);
+__global__ void __launch_bounds__(
+    AttentionBackwardKernel<cutlass::arch::Sm80, float, false, true, 32>::kNumThreads,
+    AttentionBackwardKernel<cutlass::arch::Sm80, float, false, true, 32>::kMinBlocksPerSm)
+fmha_cutlassB_f32_notaligned_k32_dropout_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, float, false, true, 32>::Params p);
+__global__ void __launch_bounds__(
+    AttentionBackwardKernel<cutlass::arch::Sm80, float, false, true, 64>::kNumThreads,
+    AttentionBackwardKernel<cutlass::arch::Sm80, float, false, true, 64>::kMinBlocksPerSm)
+fmha_cutlassB_f32_notaligned_k64_dropout_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, float, false, true, 64>::Params p);
+__global__ void __launch_bounds__(
+    AttentionBackwardKernel<cutlass::arch::Sm80, float, false, true, 128>::kNumThreads,
+    AttentionBackwardKernel<cutlass::arch::Sm80, float, false, true, 128>::kMinBlocksPerSm)
+fmha_cutlassB_f32_notaligned_k128_dropout_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, float, false, true, 128>::Params p);
+__global__ void __launch_bounds__(
+    AttentionBackwardKernel<cutlass::arch::Sm80, float, false, true, 65536>::kNumThreads,
+    AttentionBackwardKernel<cutlass::arch::Sm80, float, false, true, 65536>::kMinBlocksPerSm)
+fmha_cutlassB_f32_notaligned_k65536_dropout_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, float, false, true, 65536>::Params p);
+
+template <typename T> void dispatch_cutlassB_f32_sm80(T cb) {
+    cb(AttentionBackwardKernel<cutlass::arch::Sm80, float, true, false, 32>(), fmha_cutlassB_f32_aligned_k32_sm80);
+    cb(AttentionBackwardKernel<cutlass::arch::Sm80, float, true, false, 64>(), fmha_cutlassB_f32_aligned_k64_sm80);
+    cb(AttentionBackwardKernel<cutlass::arch::Sm80, float, true, false, 128>(), fmha_cutlassB_f32_aligned_k128_sm80);
+    cb(AttentionBackwardKernel<cutlass::arch::Sm80, float, true, false, 65536>(), fmha_cutlassB_f32_aligned_k65536_sm80);
+    cb(AttentionBackwardKernel<cutlass::arch::Sm80, float, true, true, 32>(), fmha_cutlassB_f32_aligned_k32_dropout_sm80);
+    cb(AttentionBackwardKernel<cutlass::arch::Sm80, float, true, true, 64>(), fmha_cutlassB_f32_aligned_k64_dropout_sm80);
+    cb(AttentionBackwardKernel<cutlass::arch::Sm80, float, true, true, 128>(), fmha_cutlassB_f32_aligned_k128_dropout_sm80);
+    cb(AttentionBackwardKernel<cutlass::arch::Sm80, float, true, true, 65536>(), fmha_cutlassB_f32_aligned_k65536_dropout_sm80);
+    cb(AttentionBackwardKernel<cutlass::arch::Sm80, float, false, false, 32>(), fmha_cutlassB_f32_notaligned_k32_sm80);
+    cb(AttentionBackwardKernel<cutlass::arch::Sm80, float, false, false, 64>(), fmha_cutlassB_f32_notaligned_k64_sm80);
+    cb(AttentionBackwardKernel<cutlass::arch::Sm80, float, false, false, 128>(), fmha_cutlassB_f32_notaligned_k128_sm80);
+    cb(AttentionBackwardKernel<cutlass::arch::Sm80, float, false, false, 65536>(), fmha_cutlassB_f32_notaligned_k65536_sm80);
+    cb(AttentionBackwardKernel<cutlass::arch::Sm80, float, false, true, 32>(), fmha_cutlassB_f32_notaligned_k32_dropout_sm80);
+    cb(AttentionBackwardKernel<cutlass::arch::Sm80, float, false, true, 64>(), fmha_cutlassB_f32_notaligned_k64_dropout_sm80);
+    cb(AttentionBackwardKernel<cutlass::arch::Sm80, float, false, true, 128>(), fmha_cutlassB_f32_notaligned_k128_dropout_sm80);
+    cb(AttentionBackwardKernel<cutlass::arch::Sm80, float, false, true, 65536>(), fmha_cutlassB_f32_notaligned_k65536_dropout_sm80);
+}
+
+
+template <typename DT, typename T>
+void dispatch_cutlassB(T cb, int cc = 0) {
+
+    if (std::is_same<DT, cutlass::half_t>::value && 50 <= cc && cc < 70) {
+        dispatch_cutlassB_f16_sm50(cb);
+    }
+    if (std::is_same<DT, float>::value && 50 <= cc && cc < 70) {
+        dispatch_cutlassB_f32_sm50(cb);
+    }
+    if (std::is_same<DT, cutlass::half_t>::value && 70 <= cc && cc < 75) {
+        dispatch_cutlassB_f16_sm70(cb);
+    }
+    if (std::is_same<DT, float>::value && 70 <= cc && cc < 75) {
+        dispatch_cutlassB_f32_sm70(cb);
+    }
+    if (std::is_same<DT, cutlass::half_t>::value && 75 <= cc && cc < 80) {
+        dispatch_cutlassB_f16_sm75(cb);
+    }
+    if (std::is_same<DT, float>::value && 75 <= cc && cc < 80) {
+        dispatch_cutlassB_f32_sm75(cb);
+    }
+    if (std::is_same<DT, cutlass::bfloat16_t>::value && 80 <= cc && cc < 90) {
+        dispatch_cutlassB_bf16_sm80(cb);
+    }
+    if (std::is_same<DT, cutlass::half_t>::value && 80 <= cc && cc < 90) {
+        dispatch_cutlassB_f16_sm80(cb);
+    }
+    if (std::is_same<DT, float>::value && 80 <= cc && cc < 90) {
+        dispatch_cutlassB_f32_sm80(cb);
+    }
+}
+#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD
diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_aligned_k128.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_aligned_k128.cu
new file mode 100644
index 0000000000..21240a54dd
--- /dev/null
+++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_aligned_k128.cu
@@ -0,0 +1,24 @@
+#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD
+// This file is auto-generated. See "generate_kernels.py"
+#include "../kernel_backward.h"
+
+__global__ void __launch_bounds__(
+    AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, true, false, 128>::kNumThreads,
+    AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, true, false, 128>::kMinBlocksPerSm)
+fmha_cutlassB_bf16_aligned_k128_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, true, false, 128>::Params p) {
+#ifdef __CUDA_ARCH__
+#if __CUDA_ARCH__ >= 800
+#if __CUDA_ARCH__ < 900
+  if (!p.advance_to_block()) {
+    return;
+  }
+  AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, true, false, 128>::attention_kernel(p);
+  return;
+#endif
+#endif
+  printf(
+      "FATAL: kernel `fmha_cutlassB_bf16_aligned_k128_sm80` is for sm80-sm90, but was built for sm%d\n",
+      int(__CUDA_ARCH__ + 0) / 10);
+#endif
+}
+#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD
diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_aligned_k128_dropout.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_aligned_k128_dropout.cu
new file mode 100644
index 0000000000..8ffe1267a3
--- /dev/null
+++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_aligned_k128_dropout.cu
@@ -0,0 +1,24 @@
+#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD
+// This file is auto-generated. See "generate_kernels.py"
+#include "../kernel_backward.h"
+
+__global__ void __launch_bounds__(
+    AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, true, true, 128>::kNumThreads,
+    AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, true, true, 128>::kMinBlocksPerSm)
+fmha_cutlassB_bf16_aligned_k128_dropout_sm80(typename AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, true, true, 128>::Params p) {
+#ifdef __CUDA_ARCH__
+#if __CUDA_ARCH__ >= 800
+#if __CUDA_ARCH__ < 900
+  if (!p.advance_to_block()) {
+    return;
+  }
+  AttentionBackwardKernel<cutlass::arch::Sm80, cutlass::bfloat16_t, true, true, 128>::attention_kernel(p);
+  return;
+#endif
+#endif
+  printf(
+      "FATAL: kernel `fmha_cutlassB_bf16_aligned_k128_dropout_sm80` is for sm80-sm90, but was built for sm%d\n",
+      int(__CUDA_ARCH__ + 0) / 10);
+#endif
+}
+#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD
diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_aligned_k32.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_aligned_k32.cu
new file mode 100644
index 0000000000..daba927738
--- /dev/null
+++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_aligned_k32.cu
@@ -0,0 +1,24 @@
+#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD
+// This file is auto-generated.
See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_aligned_k32_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_bf16_aligned_k32_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_aligned_k32_dropout.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_aligned_k32_dropout.cu new file mode 100644 index 0000000000..bb2146c353 --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_aligned_k32_dropout.cu @@ -0,0 +1,24 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_aligned_k32_dropout_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_bf16_aligned_k32_dropout_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_aligned_k64.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_aligned_k64.cu new file mode 100644 index 0000000000..743f270374 --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_aligned_k64.cu @@ -0,0 +1,24 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_aligned_k64_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_bf16_aligned_k64_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_aligned_k64_dropout.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_aligned_k64_dropout.cu new file mode 100644 index 0000000000..7585ba1548 --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_aligned_k64_dropout.cu @@ -0,0 +1,24 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_aligned_k64_dropout_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_bf16_aligned_k64_dropout_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_aligned_k65536.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_aligned_k65536.cu new file mode 100644 index 0000000000..91dd4f0350 --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_aligned_k65536.cu @@ -0,0 +1,24 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_aligned_k65536_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_bf16_aligned_k65536_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_aligned_k65536_dropout.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_aligned_k65536_dropout.cu new file mode 100644 index 0000000000..845775c5bb --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_aligned_k65536_dropout.cu @@ -0,0 +1,24 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_aligned_k65536_dropout_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_bf16_aligned_k65536_dropout_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_notaligned_k128.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_notaligned_k128.cu new file mode 100644 index 0000000000..fa7f6d864c --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_notaligned_k128.cu @@ -0,0 +1,24 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_notaligned_k128_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_bf16_notaligned_k128_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_notaligned_k128_dropout.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_notaligned_k128_dropout.cu new file mode 100644 index 0000000000..6654053be5 --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_notaligned_k128_dropout.cu @@ -0,0 +1,24 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_notaligned_k128_dropout_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_bf16_notaligned_k128_dropout_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_notaligned_k32.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_notaligned_k32.cu new file mode 100644 index 0000000000..a0eda01817 --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_notaligned_k32.cu @@ -0,0 +1,24 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_notaligned_k32_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_bf16_notaligned_k32_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_notaligned_k32_dropout.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_notaligned_k32_dropout.cu new file mode 100644 index 0000000000..f3df37d218 --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_notaligned_k32_dropout.cu @@ -0,0 +1,24 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_notaligned_k32_dropout_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_bf16_notaligned_k32_dropout_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_notaligned_k64.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_notaligned_k64.cu new file mode 100644 index 0000000000..6dd19a54ba --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_notaligned_k64.cu @@ -0,0 +1,24 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_notaligned_k64_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_bf16_notaligned_k64_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_notaligned_k64_dropout.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_notaligned_k64_dropout.cu new file mode 100644 index 0000000000..a144398ff9 --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_notaligned_k64_dropout.cu @@ -0,0 +1,24 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_notaligned_k64_dropout_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_bf16_notaligned_k64_dropout_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_notaligned_k65536.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_notaligned_k65536.cu new file mode 100644 index 0000000000..5e32555a62 --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_notaligned_k65536.cu @@ -0,0 +1,24 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_notaligned_k65536_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_bf16_notaligned_k65536_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_notaligned_k65536_dropout.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_notaligned_k65536_dropout.cu new file mode 100644 index 0000000000..939883fe60 --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_bf16_notaligned_k65536_dropout.cu @@ -0,0 +1,24 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_notaligned_k65536_dropout_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_bf16_notaligned_k65536_dropout_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_aligned_k128.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_aligned_k128.cu new file mode 100644 index 0000000000..3739af8454 --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_aligned_k128.cu @@ -0,0 +1,81 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k128_sm50(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_aligned_k128_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k128_sm70(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_aligned_k128_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k128_sm75(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_aligned_k128_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k128_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_aligned_k128_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_aligned_k128_dropout.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_aligned_k128_dropout.cu new file mode 100644 index 0000000000..ba5e4ab85e --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_aligned_k128_dropout.cu @@ -0,0 +1,81 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k128_dropout_sm50(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_aligned_k128_dropout_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k128_dropout_sm70(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_aligned_k128_dropout_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k128_dropout_sm75(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_aligned_k128_dropout_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k128_dropout_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_aligned_k128_dropout_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_aligned_k32.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_aligned_k32.cu new file mode 100644 index 0000000000..cd62cdc18d --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_aligned_k32.cu @@ -0,0 +1,81 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k32_sm50(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_aligned_k32_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k32_sm70(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_aligned_k32_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k32_sm75(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_aligned_k32_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k32_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_aligned_k32_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_aligned_k32_dropout.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_aligned_k32_dropout.cu new file mode 100644 index 0000000000..5d8e370585 --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_aligned_k32_dropout.cu @@ -0,0 +1,81 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k32_dropout_sm50(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_aligned_k32_dropout_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k32_dropout_sm70(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_aligned_k32_dropout_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k32_dropout_sm75(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_aligned_k32_dropout_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k32_dropout_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_aligned_k32_dropout_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_aligned_k64.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_aligned_k64.cu new file mode 100644 index 0000000000..394b7db645 --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_aligned_k64.cu @@ -0,0 +1,81 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k64_sm50(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_aligned_k64_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k64_sm70(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_aligned_k64_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k64_sm75(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_aligned_k64_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k64_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_aligned_k64_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_aligned_k64_dropout.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_aligned_k64_dropout.cu new file mode 100644 index 0000000000..bc3c922802 --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_aligned_k64_dropout.cu @@ -0,0 +1,81 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k64_dropout_sm50(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_aligned_k64_dropout_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k64_dropout_sm70(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_aligned_k64_dropout_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k64_dropout_sm75(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_aligned_k64_dropout_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k64_dropout_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_aligned_k64_dropout_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_aligned_k65536.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_aligned_k65536.cu new file mode 100644 index 0000000000..189ccc48a8 --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_aligned_k65536.cu @@ -0,0 +1,81 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k65536_sm50(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_aligned_k65536_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k65536_sm70(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_aligned_k65536_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k65536_sm75(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_aligned_k65536_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k65536_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_aligned_k65536_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_aligned_k65536_dropout.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_aligned_k65536_dropout.cu new file mode 100644 index 0000000000..de96301659 --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_aligned_k65536_dropout.cu @@ -0,0 +1,81 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k65536_dropout_sm50(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_aligned_k65536_dropout_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k65536_dropout_sm70(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_aligned_k65536_dropout_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k65536_dropout_sm75(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_aligned_k65536_dropout_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_k65536_dropout_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_aligned_k65536_dropout_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k128.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k128.cu new file mode 100644 index 0000000000..0972ee122c --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k128.cu @@ -0,0 +1,81 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k128_sm50(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_notaligned_k128_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k128_sm70(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_notaligned_k128_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k128_sm75(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_notaligned_k128_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k128_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_notaligned_k128_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k128_dropout.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k128_dropout.cu new file mode 100644 index 0000000000..5e2beadf07 --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k128_dropout.cu @@ -0,0 +1,81 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k128_dropout_sm50(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_notaligned_k128_dropout_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k128_dropout_sm70(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_notaligned_k128_dropout_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k128_dropout_sm75(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_notaligned_k128_dropout_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k128_dropout_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_notaligned_k128_dropout_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k32.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k32.cu new file mode 100644 index 0000000000..cfdad7bbc0 --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k32.cu @@ -0,0 +1,81 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k32_sm50(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_notaligned_k32_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k32_sm70(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_notaligned_k32_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k32_sm75(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_notaligned_k32_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k32_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_notaligned_k32_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k32_dropout.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k32_dropout.cu new file mode 100644 index 0000000000..6ac12e6523 --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k32_dropout.cu @@ -0,0 +1,81 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k32_dropout_sm50(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_notaligned_k32_dropout_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k32_dropout_sm70(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_notaligned_k32_dropout_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k32_dropout_sm75(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_notaligned_k32_dropout_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k32_dropout_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_notaligned_k32_dropout_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k64.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k64.cu new file mode 100644 index 0000000000..555de36efb --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k64.cu @@ -0,0 +1,81 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k64_sm50(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_notaligned_k64_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k64_sm70(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_notaligned_k64_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k64_sm75(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_notaligned_k64_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k64_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_notaligned_k64_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k64_dropout.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k64_dropout.cu new file mode 100644 index 0000000000..6475941590 --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k64_dropout.cu @@ -0,0 +1,81 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k64_dropout_sm50(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_notaligned_k64_dropout_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k64_dropout_sm70(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_notaligned_k64_dropout_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k64_dropout_sm75(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_notaligned_k64_dropout_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k64_dropout_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_notaligned_k64_dropout_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k65536.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k65536.cu new file mode 100644 index 0000000000..167e9ba03a --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k65536.cu @@ -0,0 +1,81 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k65536_sm50(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_notaligned_k65536_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k65536_sm70(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_notaligned_k65536_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k65536_sm75(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_notaligned_k65536_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k65536_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_notaligned_k65536_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k65536_dropout.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k65536_dropout.cu new file mode 100644 index 0000000000..6802f67bff --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f16_notaligned_k65536_dropout.cu @@ -0,0 +1,81 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k65536_dropout_sm50(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_notaligned_k65536_dropout_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k65536_dropout_sm70(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_notaligned_k65536_dropout_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k65536_dropout_sm75(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_notaligned_k65536_dropout_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_k65536_dropout_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f16_notaligned_k65536_dropout_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_aligned_k128.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_aligned_k128.cu new file mode 100644 index 0000000000..44a812094f --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_aligned_k128.cu @@ -0,0 +1,81 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k128_sm50(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_aligned_k128_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k128_sm70(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_aligned_k128_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k128_sm75(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_aligned_k128_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k128_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_aligned_k128_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_aligned_k128_dropout.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_aligned_k128_dropout.cu new file mode 100644 index 0000000000..6d26f9e8fa --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_aligned_k128_dropout.cu @@ -0,0 +1,81 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k128_dropout_sm50(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_aligned_k128_dropout_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k128_dropout_sm70(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_aligned_k128_dropout_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k128_dropout_sm75(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_aligned_k128_dropout_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k128_dropout_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_aligned_k128_dropout_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_aligned_k32.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_aligned_k32.cu new file mode 100644 index 0000000000..9f87540df8 --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_aligned_k32.cu @@ -0,0 +1,81 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k32_sm50(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_aligned_k32_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k32_sm70(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_aligned_k32_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k32_sm75(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_aligned_k32_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k32_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_aligned_k32_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_aligned_k32_dropout.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_aligned_k32_dropout.cu new file mode 100644 index 0000000000..5d29b17767 --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_aligned_k32_dropout.cu @@ -0,0 +1,81 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k32_dropout_sm50(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_aligned_k32_dropout_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k32_dropout_sm70(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_aligned_k32_dropout_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k32_dropout_sm75(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_aligned_k32_dropout_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k32_dropout_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_aligned_k32_dropout_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_aligned_k64.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_aligned_k64.cu new file mode 100644 index 0000000000..e4f2db96f8 --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_aligned_k64.cu @@ -0,0 +1,81 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k64_sm50(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_aligned_k64_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k64_sm70(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_aligned_k64_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k64_sm75(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_aligned_k64_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k64_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_aligned_k64_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_aligned_k64_dropout.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_aligned_k64_dropout.cu new file mode 100644 index 0000000000..300edaa447 --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_aligned_k64_dropout.cu @@ -0,0 +1,81 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k64_dropout_sm50(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_aligned_k64_dropout_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k64_dropout_sm70(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_aligned_k64_dropout_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k64_dropout_sm75(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_aligned_k64_dropout_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k64_dropout_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_aligned_k64_dropout_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_aligned_k65536.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_aligned_k65536.cu new file mode 100644 index 0000000000..6dcb961cfb --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_aligned_k65536.cu @@ -0,0 +1,81 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k65536_sm50(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_aligned_k65536_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k65536_sm70(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_aligned_k65536_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k65536_sm75(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_aligned_k65536_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k65536_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_aligned_k65536_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_aligned_k65536_dropout.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_aligned_k65536_dropout.cu new file mode 100644 index 0000000000..17907c6897 --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_aligned_k65536_dropout.cu @@ -0,0 +1,81 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k65536_dropout_sm50(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_aligned_k65536_dropout_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k65536_dropout_sm70(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_aligned_k65536_dropout_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k65536_dropout_sm75(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_aligned_k65536_dropout_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_k65536_dropout_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_aligned_k65536_dropout_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k128.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k128.cu new file mode 100644 index 0000000000..7bea2d16c5 --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k128.cu @@ -0,0 +1,81 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k128_sm50(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_notaligned_k128_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k128_sm70(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_notaligned_k128_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k128_sm75(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_notaligned_k128_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k128_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_notaligned_k128_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k128_dropout.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k128_dropout.cu new file mode 100644 index 0000000000..58a69bfa47 --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k128_dropout.cu @@ -0,0 +1,81 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k128_dropout_sm50(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_notaligned_k128_dropout_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k128_dropout_sm70(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_notaligned_k128_dropout_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k128_dropout_sm75(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_notaligned_k128_dropout_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k128_dropout_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_notaligned_k128_dropout_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k32.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k32.cu new file mode 100644 index 0000000000..d8651a07f0 --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k32.cu @@ -0,0 +1,81 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k32_sm50(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_notaligned_k32_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k32_sm70(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_notaligned_k32_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k32_sm75(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_notaligned_k32_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k32_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_notaligned_k32_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k32_dropout.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k32_dropout.cu new file mode 100644 index 0000000000..1e0f58b639 --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k32_dropout.cu @@ -0,0 +1,81 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k32_dropout_sm50(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_notaligned_k32_dropout_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k32_dropout_sm70(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_notaligned_k32_dropout_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k32_dropout_sm75(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_notaligned_k32_dropout_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k32_dropout_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_notaligned_k32_dropout_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k64.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k64.cu new file mode 100644 index 0000000000..0d66ac6deb --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k64.cu @@ -0,0 +1,81 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k64_sm50(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_notaligned_k64_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k64_sm70(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_notaligned_k64_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k64_sm75(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_notaligned_k64_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k64_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_notaligned_k64_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k64_dropout.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k64_dropout.cu new file mode 100644 index 0000000000..aeebeb5707 --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k64_dropout.cu @@ -0,0 +1,81 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k64_dropout_sm50(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_notaligned_k64_dropout_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k64_dropout_sm70(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_notaligned_k64_dropout_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k64_dropout_sm75(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_notaligned_k64_dropout_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k64_dropout_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_notaligned_k64_dropout_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k65536.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k65536.cu new file mode 100644 index 0000000000..bb6195c01f --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k65536.cu @@ -0,0 +1,81 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k65536_sm50(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_notaligned_k65536_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k65536_sm70(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_notaligned_k65536_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k65536_sm75(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_notaligned_k65536_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k65536_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_notaligned_k65536_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k65536_dropout.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k65536_dropout.cu new file mode 100644 index 0000000000..3eba4d492d --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassB_f32_notaligned_k65536_dropout.cu @@ -0,0 +1,81 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_backward.h" + +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k65536_dropout_sm50(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_notaligned_k65536_dropout_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k65536_dropout_sm70(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_notaligned_k65536_dropout_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k65536_dropout_sm75(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_notaligned_k65536_dropout_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_k65536_dropout_sm80(typename AttentionBackwardKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionBackwardKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassB_f32_notaligned_k65536_dropout_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassF.h b/xformers/csrc/attention/cuda/fmha/kernels/cutlassF.h new file mode 100644 index 0000000000..b2d086bf7c --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassF.h @@ -0,0 +1,353 @@ +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_forward.h" + +#pragma once +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_FORWARD +// ======== bf16 / sm80 ======== +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_bf16_aligned_64x64_rf_sm80(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_bf16_aligned_32x128_rf_sm80(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_bf16_aligned_32x128_gmem_sm80(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_bf16_notaligned_64x64_rf_sm80(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_bf16_notaligned_32x128_rf_sm80(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_bf16_notaligned_32x128_gmem_sm80(typename AttentionKernel::Params p); + +template void dispatch_cutlassF_bf16_sm80(T cb) { + cb(AttentionKernel(), fmha_cutlassF_bf16_aligned_64x64_rf_sm80); + cb(AttentionKernel(), fmha_cutlassF_bf16_aligned_32x128_rf_sm80); + cb(AttentionKernel(), fmha_cutlassF_bf16_aligned_32x128_gmem_sm80); + cb(AttentionKernel(), fmha_cutlassF_bf16_notaligned_64x64_rf_sm80); + cb(AttentionKernel(), fmha_cutlassF_bf16_notaligned_32x128_rf_sm80); + cb(AttentionKernel(), fmha_cutlassF_bf16_notaligned_32x128_gmem_sm80); +} + +// ======== f16 / sm50 ======== +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_aligned_64x64_rf_sm50(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_aligned_32x128_rf_sm50(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_aligned_32x128_gmem_sm50(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_notaligned_64x64_rf_sm50(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_notaligned_32x128_rf_sm50(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_notaligned_32x128_gmem_sm50(typename AttentionKernel::Params p); + +template void dispatch_cutlassF_f16_sm50(T cb) { + cb(AttentionKernel(), fmha_cutlassF_f16_aligned_64x64_rf_sm50); + cb(AttentionKernel(), fmha_cutlassF_f16_aligned_32x128_rf_sm50); + cb(AttentionKernel(), fmha_cutlassF_f16_aligned_32x128_gmem_sm50); + cb(AttentionKernel(), fmha_cutlassF_f16_notaligned_64x64_rf_sm50); + cb(AttentionKernel(), fmha_cutlassF_f16_notaligned_32x128_rf_sm50); + cb(AttentionKernel(), fmha_cutlassF_f16_notaligned_32x128_gmem_sm50); +} + +// ======== f16 / sm70 ======== +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_aligned_64x64_rf_sm70(typename 
AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_aligned_32x128_rf_sm70(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_aligned_32x128_gmem_sm70(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_notaligned_64x64_rf_sm70(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_notaligned_32x128_rf_sm70(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_notaligned_32x128_gmem_sm70(typename AttentionKernel::Params p); + +template void dispatch_cutlassF_f16_sm70(T cb) { + cb(AttentionKernel(), fmha_cutlassF_f16_aligned_64x64_rf_sm70); + cb(AttentionKernel(), fmha_cutlassF_f16_aligned_32x128_rf_sm70); + cb(AttentionKernel(), fmha_cutlassF_f16_aligned_32x128_gmem_sm70); + cb(AttentionKernel(), fmha_cutlassF_f16_notaligned_64x64_rf_sm70); + cb(AttentionKernel(), fmha_cutlassF_f16_notaligned_32x128_rf_sm70); + cb(AttentionKernel(), fmha_cutlassF_f16_notaligned_32x128_gmem_sm70); +} + +// ======== f16 / sm75 ======== +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_aligned_64x64_rf_sm75(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_aligned_32x128_rf_sm75(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_aligned_32x128_gmem_sm75(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_notaligned_64x64_rf_sm75(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_notaligned_32x128_rf_sm75(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_notaligned_32x128_gmem_sm75(typename AttentionKernel::Params p); + +template void dispatch_cutlassF_f16_sm75(T cb) { + cb(AttentionKernel(), fmha_cutlassF_f16_aligned_64x64_rf_sm75); + cb(AttentionKernel(), fmha_cutlassF_f16_aligned_32x128_rf_sm75); + cb(AttentionKernel(), fmha_cutlassF_f16_aligned_32x128_gmem_sm75); + cb(AttentionKernel(), fmha_cutlassF_f16_notaligned_64x64_rf_sm75); + cb(AttentionKernel(), fmha_cutlassF_f16_notaligned_32x128_rf_sm75); + cb(AttentionKernel(), fmha_cutlassF_f16_notaligned_32x128_gmem_sm75); +} + +// ======== f16 / sm80 ======== +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_aligned_64x64_rf_sm80(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_aligned_32x128_rf_sm80(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) 
+fmha_cutlassF_f16_aligned_32x128_gmem_sm80(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_notaligned_64x64_rf_sm80(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_notaligned_32x128_rf_sm80(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_notaligned_32x128_gmem_sm80(typename AttentionKernel::Params p); + +template void dispatch_cutlassF_f16_sm80(T cb) { + cb(AttentionKernel(), fmha_cutlassF_f16_aligned_64x64_rf_sm80); + cb(AttentionKernel(), fmha_cutlassF_f16_aligned_32x128_rf_sm80); + cb(AttentionKernel(), fmha_cutlassF_f16_aligned_32x128_gmem_sm80); + cb(AttentionKernel(), fmha_cutlassF_f16_notaligned_64x64_rf_sm80); + cb(AttentionKernel(), fmha_cutlassF_f16_notaligned_32x128_rf_sm80); + cb(AttentionKernel(), fmha_cutlassF_f16_notaligned_32x128_gmem_sm80); +} + +// ======== f32 / sm50 ======== +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_aligned_64x64_rf_sm50(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_aligned_32x128_rf_sm50(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_aligned_32x128_gmem_sm50(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_notaligned_64x64_rf_sm50(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_notaligned_32x128_rf_sm50(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_notaligned_32x128_gmem_sm50(typename AttentionKernel::Params p); + +template void dispatch_cutlassF_f32_sm50(T cb) { + cb(AttentionKernel(), fmha_cutlassF_f32_aligned_64x64_rf_sm50); + cb(AttentionKernel(), fmha_cutlassF_f32_aligned_32x128_rf_sm50); + cb(AttentionKernel(), fmha_cutlassF_f32_aligned_32x128_gmem_sm50); + cb(AttentionKernel(), fmha_cutlassF_f32_notaligned_64x64_rf_sm50); + cb(AttentionKernel(), fmha_cutlassF_f32_notaligned_32x128_rf_sm50); + cb(AttentionKernel(), fmha_cutlassF_f32_notaligned_32x128_gmem_sm50); +} + +// ======== f32 / sm70 ======== +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_aligned_64x64_rf_sm70(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_aligned_32x128_rf_sm70(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_aligned_32x128_gmem_sm70(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_notaligned_64x64_rf_sm70(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + 
AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_notaligned_32x128_rf_sm70(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_notaligned_32x128_gmem_sm70(typename AttentionKernel::Params p); + +template void dispatch_cutlassF_f32_sm70(T cb) { + cb(AttentionKernel(), fmha_cutlassF_f32_aligned_64x64_rf_sm70); + cb(AttentionKernel(), fmha_cutlassF_f32_aligned_32x128_rf_sm70); + cb(AttentionKernel(), fmha_cutlassF_f32_aligned_32x128_gmem_sm70); + cb(AttentionKernel(), fmha_cutlassF_f32_notaligned_64x64_rf_sm70); + cb(AttentionKernel(), fmha_cutlassF_f32_notaligned_32x128_rf_sm70); + cb(AttentionKernel(), fmha_cutlassF_f32_notaligned_32x128_gmem_sm70); +} + +// ======== f32 / sm75 ======== +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_aligned_64x64_rf_sm75(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_aligned_32x128_rf_sm75(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_aligned_32x128_gmem_sm75(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_notaligned_64x64_rf_sm75(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_notaligned_32x128_rf_sm75(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_notaligned_32x128_gmem_sm75(typename AttentionKernel::Params p); + +template void dispatch_cutlassF_f32_sm75(T cb) { + cb(AttentionKernel(), fmha_cutlassF_f32_aligned_64x64_rf_sm75); + cb(AttentionKernel(), fmha_cutlassF_f32_aligned_32x128_rf_sm75); + cb(AttentionKernel(), fmha_cutlassF_f32_aligned_32x128_gmem_sm75); + cb(AttentionKernel(), fmha_cutlassF_f32_notaligned_64x64_rf_sm75); + cb(AttentionKernel(), fmha_cutlassF_f32_notaligned_32x128_rf_sm75); + cb(AttentionKernel(), fmha_cutlassF_f32_notaligned_32x128_gmem_sm75); +} + +// ======== f32 / sm80 ======== +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_aligned_64x64_rf_sm80(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_aligned_32x128_rf_sm80(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_aligned_32x128_gmem_sm80(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_notaligned_64x64_rf_sm80(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_notaligned_32x128_rf_sm80(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_notaligned_32x128_gmem_sm80(typename 
AttentionKernel::Params p); + +template void dispatch_cutlassF_f32_sm80(T cb) { + cb(AttentionKernel(), fmha_cutlassF_f32_aligned_64x64_rf_sm80); + cb(AttentionKernel(), fmha_cutlassF_f32_aligned_32x128_rf_sm80); + cb(AttentionKernel(), fmha_cutlassF_f32_aligned_32x128_gmem_sm80); + cb(AttentionKernel(), fmha_cutlassF_f32_notaligned_64x64_rf_sm80); + cb(AttentionKernel(), fmha_cutlassF_f32_notaligned_32x128_rf_sm80); + cb(AttentionKernel(), fmha_cutlassF_f32_notaligned_32x128_gmem_sm80); +} + + +template +void dispatch_cutlassF(T cb, int cc = 0) { + + if (std::is_same::value && 80 <= cc && cc < 90) { + dispatch_cutlassF_bf16_sm80(cb); + } + if (std::is_same::value && 50 <= cc && cc < 70) { + dispatch_cutlassF_f16_sm50(cb); + } + if (std::is_same::value && 70 <= cc && cc < 75) { + dispatch_cutlassF_f16_sm70(cb); + } + if (std::is_same::value && 75 <= cc && cc < 80) { + dispatch_cutlassF_f16_sm75(cb); + } + if (std::is_same::value && 80 <= cc && cc < 90) { + dispatch_cutlassF_f16_sm80(cb); + } + if (std::is_same::value && 50 <= cc && cc < 70) { + dispatch_cutlassF_f32_sm50(cb); + } + if (std::is_same::value && 70 <= cc && cc < 75) { + dispatch_cutlassF_f32_sm70(cb); + } + if (std::is_same::value && 75 <= cc && cc < 80) { + dispatch_cutlassF_f32_sm75(cb); + } + if (std::is_same::value && 80 <= cc && cc < 90) { + dispatch_cutlassF_f32_sm80(cb); + } +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_FORWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassF_bf16_aligned.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassF_bf16_aligned.cu new file mode 100644 index 0000000000..b32e91dd5e --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassF_bf16_aligned.cu @@ -0,0 +1,62 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_FORWARD +// This file is auto-generated. 
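// dispatch_cutlassF fans out to one dispatch_cutlassF_<dtype>_smXX list based
// on the element type and the compute capability `cc`, and each list hands the
// callback every registered (kernel instance, kernel entry point) pair for
// that combination. A minimal usage sketch, assuming the dispatcher's leading
// template argument is the element type; real callers launch from inside the
// callback, since each invocation is instantiated with its own kernel type,
// so the sketch only inspects launch bounds and avoids problem-specific grid
// and shared-memory setup.
inline int count_f32_forward_kernels(int cc) {
  int candidates = 0;
  dispatch_cutlassF<float>(
      [&](auto kernel, auto kernel_fn) {
        using Kernel = decltype(kernel);
        (void)kernel_fn; // __global__ entry point taking Kernel::Params
        static_assert(
            Kernel::kNumThreads > 0,
            "launch bounds are compile-time constants of each instantiation");
        ++candidates;
      },
      cc);
  // Each matching <dtype, smXX> list above registers six variants
  // (aligned/notaligned x 64x64_rf / 32x128_rf / 32x128_gmem); zero means the
  // compute capability matched none of the ranges.
  return candidates;
}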
See "generate_kernels.py" +#include "../kernel_forward.h" + +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_bf16_aligned_64x64_rf_sm80(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_bf16_aligned_64x64_rf_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_bf16_aligned_32x128_rf_sm80(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_bf16_aligned_32x128_rf_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_bf16_aligned_32x128_gmem_sm80(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_bf16_aligned_32x128_gmem_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_FORWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassF_bf16_notaligned.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassF_bf16_notaligned.cu new file mode 100644 index 0000000000..70e14beee1 --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassF_bf16_notaligned.cu @@ -0,0 +1,62 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_FORWARD +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_forward.h" + +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_bf16_notaligned_64x64_rf_sm80(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_bf16_notaligned_64x64_rf_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_bf16_notaligned_32x128_rf_sm80(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_bf16_notaligned_32x128_rf_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_bf16_notaligned_32x128_gmem_sm80(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_bf16_notaligned_32x128_gmem_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_FORWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassF_f16_aligned.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassF_f16_aligned.cu new file mode 100644 index 0000000000..73f60913fb --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassF_f16_aligned.cu @@ -0,0 +1,233 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_FORWARD +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_forward.h" + +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_aligned_64x64_rf_sm50(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f16_aligned_64x64_rf_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_aligned_64x64_rf_sm70(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f16_aligned_64x64_rf_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_aligned_64x64_rf_sm75(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f16_aligned_64x64_rf_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_aligned_64x64_rf_sm80(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f16_aligned_64x64_rf_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_aligned_32x128_rf_sm50(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f16_aligned_32x128_rf_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_aligned_32x128_rf_sm70(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f16_aligned_32x128_rf_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_aligned_32x128_rf_sm75(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + 
return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f16_aligned_32x128_rf_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_aligned_32x128_rf_sm80(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f16_aligned_32x128_rf_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_aligned_32x128_gmem_sm50(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f16_aligned_32x128_gmem_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_aligned_32x128_gmem_sm70(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f16_aligned_32x128_gmem_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_aligned_32x128_gmem_sm75(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f16_aligned_32x128_gmem_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_aligned_32x128_gmem_sm80(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f16_aligned_32x128_gmem_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_FORWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassF_f16_notaligned.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassF_f16_notaligned.cu new file mode 100644 index 0000000000..27996604ce --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassF_f16_notaligned.cu @@ -0,0 +1,233 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_FORWARD +// This file is auto-generated. 
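// The forward symbols follow the same convention: element type, aligned vs
// notaligned, the query-by-key tile shape (64x64 or 32x128), an "rf"/"gmem"
// tag, and the sm range. A hedged sketch of the instantiation behind
// fmha_cutlassF_f16_aligned_32x128_gmem_sm80; the parameter order (scalar
// type, arch tag, alignment flag, queries per block, keys per block,
// single-value-iteration flag) and the reading of "rf" as "output kept in
// registers" vs "gmem" as "output buffered in global memory" are assumptions
// inferred from the naming scheme, not spelled out in this listing.
using CutlassF_f16_aligned_32x128_gmem_sm80 = AttentionKernel<
    cutlass::half_t,     // "f16"
    cutlass::arch::Sm80, // built for the sm80-sm90 range
    true,                // "aligned": strides satisfy the preferred alignment
    32,                  // queries per block ("32x128" tile)
    128,                 // keys per block
    false>;              // "gmem": several value iterations, buffered in global memory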
See "generate_kernels.py" +#include "../kernel_forward.h" + +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_notaligned_64x64_rf_sm50(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f16_notaligned_64x64_rf_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_notaligned_64x64_rf_sm70(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f16_notaligned_64x64_rf_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_notaligned_64x64_rf_sm75(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f16_notaligned_64x64_rf_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_notaligned_64x64_rf_sm80(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f16_notaligned_64x64_rf_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_notaligned_32x128_rf_sm50(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f16_notaligned_32x128_rf_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_notaligned_32x128_rf_sm70(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f16_notaligned_32x128_rf_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_notaligned_32x128_rf_sm75(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + 
AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f16_notaligned_32x128_rf_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_notaligned_32x128_rf_sm80(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f16_notaligned_32x128_rf_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_notaligned_32x128_gmem_sm50(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f16_notaligned_32x128_gmem_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_notaligned_32x128_gmem_sm70(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f16_notaligned_32x128_gmem_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_notaligned_32x128_gmem_sm75(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f16_notaligned_32x128_gmem_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_notaligned_32x128_gmem_sm80(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f16_notaligned_32x128_gmem_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_FORWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassF_f32_aligned.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassF_f32_aligned.cu new file mode 100644 index 0000000000..7df7ad5a02 --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassF_f32_aligned.cu @@ -0,0 +1,233 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_FORWARD +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_forward.h" + +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_aligned_64x64_rf_sm50(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f32_aligned_64x64_rf_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_aligned_64x64_rf_sm70(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f32_aligned_64x64_rf_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_aligned_64x64_rf_sm75(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f32_aligned_64x64_rf_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_aligned_64x64_rf_sm80(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f32_aligned_64x64_rf_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_aligned_32x128_rf_sm50(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f32_aligned_32x128_rf_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_aligned_32x128_rf_sm70(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f32_aligned_32x128_rf_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_aligned_32x128_rf_sm75(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + 
return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f32_aligned_32x128_rf_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_aligned_32x128_rf_sm80(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f32_aligned_32x128_rf_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_aligned_32x128_gmem_sm50(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f32_aligned_32x128_gmem_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_aligned_32x128_gmem_sm70(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f32_aligned_32x128_gmem_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_aligned_32x128_gmem_sm75(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f32_aligned_32x128_gmem_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_aligned_32x128_gmem_sm80(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f32_aligned_32x128_gmem_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_FORWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/cutlassF_f32_notaligned.cu b/xformers/csrc/attention/cuda/fmha/kernels/cutlassF_f32_notaligned.cu new file mode 100644 index 0000000000..f2f29b6d99 --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/cutlassF_f32_notaligned.cu @@ -0,0 +1,233 @@ +#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_FORWARD +// This file is auto-generated. 
See "generate_kernels.py" +#include "../kernel_forward.h" + +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_notaligned_64x64_rf_sm50(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f32_notaligned_64x64_rf_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_notaligned_64x64_rf_sm70(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f32_notaligned_64x64_rf_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_notaligned_64x64_rf_sm75(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f32_notaligned_64x64_rf_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_notaligned_64x64_rf_sm80(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f32_notaligned_64x64_rf_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_notaligned_32x128_rf_sm50(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f32_notaligned_32x128_rf_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_notaligned_32x128_rf_sm70(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f32_notaligned_32x128_rf_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_notaligned_32x128_rf_sm75(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + 
AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f32_notaligned_32x128_rf_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_notaligned_32x128_rf_sm80(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f32_notaligned_32x128_rf_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_notaligned_32x128_gmem_sm50(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 500 +#if __CUDA_ARCH__ < 700 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f32_notaligned_32x128_gmem_sm50` is for sm50-sm70, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_notaligned_32x128_gmem_sm70(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ < 750 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f32_notaligned_32x128_gmem_sm70` is for sm70-sm75, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_notaligned_32x128_gmem_sm75(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 750 +#if __CUDA_ARCH__ < 800 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f32_notaligned_32x128_gmem_sm75` is for sm75-sm80, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_notaligned_32x128_gmem_sm80(typename AttentionKernel::Params p) { +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= 800 +#if __CUDA_ARCH__ < 900 + if (!p.advance_to_block()) { + return; + } + AttentionKernel::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `fmha_cutlassF_f32_notaligned_32x128_gmem_sm80` is for sm80-sm90, but was built for sm%d\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +} +#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_FORWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/forward.h b/xformers/csrc/attention/cuda/fmha/kernels/forward.h deleted file mode 100644 index be33d82867..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/forward.h +++ /dev/null @@ -1,92 +0,0 @@ -#pragma once - -// All kernels are disabled by default -#define INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM50(...) \ - INSTANTIATE_ATTENTION_KERNEL_FORWARD_DISABLED(50, __VA_ARGS__) -#define INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM70(...) 
\ - INSTANTIATE_ATTENTION_KERNEL_FORWARD_DISABLED(70, __VA_ARGS__) -#define INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM75(...) \ - INSTANTIATE_ATTENTION_KERNEL_FORWARD_DISABLED(75, __VA_ARGS__) -#define INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM80(...) \ - INSTANTIATE_ATTENTION_KERNEL_FORWARD_DISABLED(80, __VA_ARGS__) - -#ifndef XFORMERS_MEM_EFF_ATTENTION_DISABLE_FORWARD -#include "../kernel_forward.h" - -#define _ATTENTION_KERNEL_FORWARD_BEGIN(...) \ - template <> \ - __global__ void __launch_bounds__( \ - __VA_ARGS__::kNumThreads, __VA_ARGS__::kMinBlocksPerSm) \ - attention_kernel_batched<__VA_ARGS__>(typename __VA_ARGS__::Params p) { \ - using Kernel = __VA_ARGS__; -#define _ATTENTION_KERNEL_FORWARD_END() } - -#ifdef __CUDA_ARCH__ -#define __CUDA_ARCH_OR_ZERO__ __CUDA_ARCH__ -#else -#define __CUDA_ARCH_OR_ZERO__ 0 -#endif - -#define INSTANTIATE_ATTENTION_KERNEL_FORWARD( \ - ARCH, \ - SCALAR_T, \ - IS_ALIGNED, \ - QUERIES_PER_BLOCK, \ - KEYS_PER_BLOCK, \ - SINGLE_VALUE_ITER) \ - _ATTENTION_KERNEL_FORWARD_BEGIN(AttentionKernel< \ - SCALAR_T, \ - cutlass::arch::Sm##ARCH, \ - IS_ALIGNED, \ - QUERIES_PER_BLOCK, \ - KEYS_PER_BLOCK, \ - SINGLE_VALUE_ITER>) \ - if (!p.advance_to_block()) { \ - return; \ - } \ - Kernel::attention_kernel(p); \ - _ATTENTION_KERNEL_FORWARD_END(); - -#define INSTANTIATE_ATTENTION_KERNEL_FORWARD_DISABLED( \ - ARCH, \ - SCALAR_T, \ - IS_ALIGNED, \ - QUERIES_PER_BLOCK, \ - KEYS_PER_BLOCK, \ - SINGLE_VALUE_ITER) \ - _ATTENTION_KERNEL_FORWARD_BEGIN(AttentionKernel< \ - SCALAR_T, \ - cutlass::arch::Sm##ARCH, \ - IS_ALIGNED, \ - QUERIES_PER_BLOCK, \ - KEYS_PER_BLOCK, \ - SINGLE_VALUE_ITER>) \ - printf( \ - "FATAL: this function is for sm%d, but was built for sm%d\n", \ - int(ARCH), \ - int(__CUDA_ARCH_OR_ZERO__)); \ - _ATTENTION_KERNEL_FORWARD_END(); - -// Enable the right one based on __CUDA_ARCH__ -#ifndef __CUDA_ARCH__ -#elif __CUDA_ARCH__ < 500 -#error "Need cuda arch at least 5.0" -#elif __CUDA_ARCH__ < 700 -#undef INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM50 -#define INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM50(...) \ - INSTANTIATE_ATTENTION_KERNEL_FORWARD(50, __VA_ARGS__) -#elif __CUDA_ARCH__ < 750 -#undef INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM70 -#define INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM70(...) \ - INSTANTIATE_ATTENTION_KERNEL_FORWARD(70, __VA_ARGS__) -#elif __CUDA_ARCH__ < 800 -#undef INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM75 -#define INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM75(...) \ - INSTANTIATE_ATTENTION_KERNEL_FORWARD(75, __VA_ARGS__) -#elif __CUDA_ARCH__ >= 800 -#undef INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM80 -#define INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM80(...) \ - INSTANTIATE_ATTENTION_KERNEL_FORWARD(80, __VA_ARGS__) -#endif - -#endif // XFORMERS_MEM_EFF_ATTENTION_DISABLE_FORWARD diff --git a/xformers/csrc/attention/cuda/fmha/kernels/forward_bf16.cu b/xformers/csrc/attention/cuda/fmha/kernels/forward_bf16.cu deleted file mode 100644 index b662137fc0..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/forward_bf16.cu +++ /dev/null @@ -1,74 +0,0 @@ -// This file is auto-generated. 
See "generate_kernels.sh" -#include "forward.h" -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM50( - cutlass::bfloat16_t, - false, - 32, - 128, - true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM50( - cutlass::bfloat16_t, - false, - 32, - 128, - false); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM50( - cutlass::bfloat16_t, - false, - 64, - 64, - true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM70( - cutlass::bfloat16_t, - false, - 32, - 128, - true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM70( - cutlass::bfloat16_t, - false, - 32, - 128, - false); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM70( - cutlass::bfloat16_t, - false, - 64, - 64, - true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM75( - cutlass::bfloat16_t, - false, - 32, - 128, - true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM75( - cutlass::bfloat16_t, - false, - 32, - 128, - false); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM75( - cutlass::bfloat16_t, - false, - 64, - 64, - true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM80( - cutlass::bfloat16_t, - false, - 32, - 128, - true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM80( - cutlass::bfloat16_t, - false, - 32, - 128, - false); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM80( - cutlass::bfloat16_t, - false, - 64, - 64, - true); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/forward_bf16_aligned.cu b/xformers/csrc/attention/cuda/fmha/kernels/forward_bf16_aligned.cu deleted file mode 100644 index 0d0d24d3ec..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/forward_bf16_aligned.cu +++ /dev/null @@ -1,74 +0,0 @@ -// This file is auto-generated. See "generate_kernels.sh" -#include "forward.h" -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM50( - cutlass::bfloat16_t, - true, - 32, - 128, - true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM50( - cutlass::bfloat16_t, - true, - 32, - 128, - false); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM50( - cutlass::bfloat16_t, - true, - 64, - 64, - true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM70( - cutlass::bfloat16_t, - true, - 32, - 128, - true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM70( - cutlass::bfloat16_t, - true, - 32, - 128, - false); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM70( - cutlass::bfloat16_t, - true, - 64, - 64, - true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM75( - cutlass::bfloat16_t, - true, - 32, - 128, - true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM75( - cutlass::bfloat16_t, - true, - 32, - 128, - false); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM75( - cutlass::bfloat16_t, - true, - 64, - 64, - true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM80( - cutlass::bfloat16_t, - true, - 32, - 128, - true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM80( - cutlass::bfloat16_t, - true, - 32, - 128, - false); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM80( - cutlass::bfloat16_t, - true, - 64, - 64, - true); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/forward_f16.cu b/xformers/csrc/attention/cuda/fmha/kernels/forward_f16.cu deleted file mode 100644 index 6059141d8a..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/forward_f16.cu +++ /dev/null @@ -1,54 +0,0 @@ -// This file is auto-generated. 
See "generate_kernels.sh" -#include "forward.h" -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM50( - cutlass::half_t, - false, - 32, - 128, - true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM50( - cutlass::half_t, - false, - 32, - 128, - false); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM50(cutlass::half_t, false, 64, 64, true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM70( - cutlass::half_t, - false, - 32, - 128, - true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM70( - cutlass::half_t, - false, - 32, - 128, - false); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM70(cutlass::half_t, false, 64, 64, true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM75( - cutlass::half_t, - false, - 32, - 128, - true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM75( - cutlass::half_t, - false, - 32, - 128, - false); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM75(cutlass::half_t, false, 64, 64, true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM80( - cutlass::half_t, - false, - 32, - 128, - true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM80( - cutlass::half_t, - false, - 32, - 128, - false); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM80(cutlass::half_t, false, 64, 64, true); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/forward_f16_aligned.cu b/xformers/csrc/attention/cuda/fmha/kernels/forward_f16_aligned.cu deleted file mode 100644 index 1f2f795a2d..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/forward_f16_aligned.cu +++ /dev/null @@ -1,34 +0,0 @@ -// This file is auto-generated. See "generate_kernels.sh" -#include "forward.h" -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM50(cutlass::half_t, true, 32, 128, true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM50( - cutlass::half_t, - true, - 32, - 128, - false); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM50(cutlass::half_t, true, 64, 64, true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM70(cutlass::half_t, true, 32, 128, true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM70( - cutlass::half_t, - true, - 32, - 128, - false); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM70(cutlass::half_t, true, 64, 64, true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM75(cutlass::half_t, true, 32, 128, true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM75( - cutlass::half_t, - true, - 32, - 128, - false); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM75(cutlass::half_t, true, 64, 64, true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM80(cutlass::half_t, true, 32, 128, true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM80( - cutlass::half_t, - true, - 32, - 128, - false); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM80(cutlass::half_t, true, 64, 64, true); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/forward_f32.cu b/xformers/csrc/attention/cuda/fmha/kernels/forward_f32.cu deleted file mode 100644 index 5d11448d9d..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/forward_f32.cu +++ /dev/null @@ -1,14 +0,0 @@ -// This file is auto-generated. 
See "generate_kernels.sh" -#include "forward.h" -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM50(float, false, 32, 128, true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM50(float, false, 32, 128, false); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM50(float, false, 64, 64, true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM70(float, false, 32, 128, true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM70(float, false, 32, 128, false); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM70(float, false, 64, 64, true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM75(float, false, 32, 128, true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM75(float, false, 32, 128, false); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM75(float, false, 64, 64, true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM80(float, false, 32, 128, true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM80(float, false, 32, 128, false); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM80(float, false, 64, 64, true); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/forward_f32_aligned.cu b/xformers/csrc/attention/cuda/fmha/kernels/forward_f32_aligned.cu deleted file mode 100644 index c97f125606..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/forward_f32_aligned.cu +++ /dev/null @@ -1,14 +0,0 @@ -// This file is auto-generated. See "generate_kernels.sh" -#include "forward.h" -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM50(float, true, 32, 128, true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM50(float, true, 32, 128, false); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM50(float, true, 64, 64, true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM70(float, true, 32, 128, true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM70(float, true, 32, 128, false); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM70(float, true, 64, 64, true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM75(float, true, 32, 128, true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM75(float, true, 32, 128, false); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM75(float, true, 64, 64, true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM80(float, true, 32, 128, true); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM80(float, true, 32, 128, false); -INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM80(float, true, 64, 64, true); diff --git a/xformers/csrc/attention/cuda/fmha/kernels/generate_kernels.py b/xformers/csrc/attention/cuda/fmha/kernels/generate_kernels.py new file mode 100644 index 0000000000..44358f7533 --- /dev/null +++ b/xformers/csrc/attention/cuda/fmha/kernels/generate_kernels.py @@ -0,0 +1,299 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. 
+ +# Generates combination of kernels - implementations and registry + +# Kernels are ordered (see `sort_index`), and when dispatching, +# we select the first kernel in the list that supports the inputs + +import collections +import itertools +from dataclasses import dataclass, field +from pathlib import Path +from typing import Dict, List, Tuple, TypeVar + +DTYPES = { + "f32": "float", + "f16": "cutlass::half_t", + "bf16": "cutlass::bfloat16_t", +} + +SM = [50, 70, 75, 80] + +KERNEL_IMPL_TEMPLATE = """__global__ void __launch_bounds__( + {CPP_CLASS}::kNumThreads, + {CPP_CLASS}::kMinBlocksPerSm) +{NAME}(typename {CPP_CLASS}::Params p) {{ +#ifdef __CUDA_ARCH__ +#if __CUDA_ARCH__ >= {SM}0 +#if __CUDA_ARCH__ < {SM_MAX}0 + if (!p.advance_to_block()) {{ + return; + }} + {CPP_CLASS}::attention_kernel(p); + return; +#endif +#endif + printf( + "FATAL: kernel `{NAME}` is for sm{SM}-sm{SM_MAX}, but was built for sm%d\\n", + int(__CUDA_ARCH__ + 0) / 10); +#endif +}} +""" + + +@dataclass(order=True) +class FwdKernel: + sort_index: Tuple[int, ...] = field(init=False, repr=False) + aligned: bool + dtype: str + sm: int + sm_max: int + q: int + k: int + single_value_iter: bool + supports_dropout: bool = True + supports_bias: bool = True + + def __post_init__(self) -> None: + # Set kernel selection priority + # The lowest value that matches inputs + # will be selected + self.sort_index = ( + # First select aligned kernel + 0 if self.aligned else 1, + # Then keep output in RF + 0 if self.single_value_iter else 1, + self.k, + # Prefer kernels without dropout/bias if available + 1 if self.supports_dropout else 0, + 1 if self.supports_bias else 0, + ) + + @property + def _aligned_suffix(self) -> str: + return "aligned" if self.aligned else "notaligned" + + @property + def name(self) -> str: + acc = "rf" if self.single_value_iter else "gmem" + return f"fmha_cutlassF_{self.dtype}_{self._aligned_suffix}_{self.q}x{self.k}_{acc}_sm{self.sm}" + + @property + def cpp_class(self) -> str: + template_args = ", ".join( + [ + DTYPES[self.dtype], + f"cutlass::arch::Sm{self.sm}", + "true" if self.aligned else "false", + str(self.q), + str(self.k), + "true" if self.single_value_iter else "false", + "true" if self.supports_dropout else "false", + "true" if self.supports_bias else "false", + ] + ) + return f"AttentionKernel<{template_args}>" + + @property + def impl_group(self) -> str: + # Maps to file which will contain the implementation + return f"{self.dtype}_{self._aligned_suffix}" + + @property + def cpp_impl(self) -> str: + return KERNEL_IMPL_TEMPLATE.format( + CPP_CLASS=self.cpp_class, + NAME=self.name, + SM=self.sm, + SM_MAX=self.sm_max, + ) + + @classmethod + def get_all(cls) -> List["FwdKernel"]: + kernels: List[FwdKernel] = [] + for aligned, dtype, (sm, sm_max) in itertools.product( + [True, False], DTYPES.keys(), zip(SM, SM[1:] + [90]) + ): + # Remove some kernels we don't use + if dtype == "bf16" and sm < 80: + continue + for q, k, single_value_iter in [ + (32, 128, True), + (32, 128, False), + (64, 64, True), + ]: + kernels.append( + cls( + aligned=aligned, + dtype=dtype, + sm=sm, + sm_max=sm_max, + q=q, + k=k, + single_value_iter=single_value_iter, + ) + ) + return kernels + + +@dataclass(order=True) +class BwdKernel: + sort_index: Tuple[int, ...] 
= field(init=False, repr=False) + sm: int + sm_max: int + dtype: str + aligned: bool + apply_dropout: bool + max_k: int + + def __post_init__(self) -> None: + # Set kernel selection priority + # The lowest value that matches inputs + # will be selected + self.sort_index = ( + # First select aligned kernel + 0 if self.aligned else 1, + # Take a kernel without dropout if possible + 1 if self.apply_dropout else 0, + # Then take the smallest maxK + self.max_k, + ) + + @property + def _aligned_suffix(self) -> str: + return "aligned" if self.aligned else "notaligned" + + @property + def name(self) -> str: + dropout_suffix = "_dropout" if self.apply_dropout else "" + return f"fmha_cutlassB_{self.dtype}_{self._aligned_suffix}_k{self.max_k}{dropout_suffix}_sm{self.sm}" + + @property + def cpp_class(self) -> str: + template_args = ", ".join( + [ + f"cutlass::arch::Sm{self.sm}", + DTYPES[self.dtype], + "true" if self.aligned else "false", + "true" if self.apply_dropout else "false", + str(self.max_k), + ] + ) + return f"AttentionBackwardKernel<{template_args}>" + + @property + def impl_group(self) -> str: + # Maps to file which will contain the implementation + dropout_suffix = "_dropout" if self.apply_dropout else "" + return f"{self.dtype}_{self._aligned_suffix}_k{self.max_k}{dropout_suffix}" + + @property + def cpp_impl(self) -> str: + return KERNEL_IMPL_TEMPLATE.format( + CPP_CLASS=self.cpp_class, + NAME=self.name, + SM=self.sm, + SM_MAX=self.sm_max, + ) + + @classmethod + def get_all(cls) -> List["BwdKernel"]: + kernels: List[BwdKernel] = [] + for aligned, dtype, (sm, sm_max), apply_dropout, max_k in itertools.product( + [True, False], + DTYPES.keys(), + zip(SM, SM[1:] + [90]), + [True, False], + [32, 64, 128, 2**16], + ): + if dtype == "bf16" and sm < 80: + continue + kernels.append( + cls( + aligned=aligned, + dtype=dtype, + sm=sm, + sm_max=sm_max, + apply_dropout=apply_dropout, + max_k=max_k, + ) + ) + return kernels + + +T = TypeVar("T", FwdKernel, BwdKernel) + + +def write_decl_impl( + kernels: List[T], family_name: str, impl_file: str, disable_def: str +) -> None: + cpp_file_header = f"""// This file is auto-generated. 
See "generate_kernels.py" +#include "{impl_file}" + +""" + + kernels.sort() + + implfile_to_kernels: Dict[str, List[T]] = collections.defaultdict(list) + cat_to_kernels: Dict[Tuple[str, int, int], List[T]] = collections.defaultdict(list) + + dispatch_all = "" + declarations = cpp_file_header + "#pragma once\n" + declarations += f"#ifndef {disable_def}\n" + + # Declaration of kernel functions + for k in kernels: + implfile_to_kernels[k.impl_group].append(k) + cat_to_kernels[(k.dtype, k.sm, k.sm_max)].append(k) + + for (cat_dt, cat_sm, cat_sm_max), kernels in cat_to_kernels.items(): + declarations += f"// ======== {cat_dt} / sm{cat_sm} ========\n" + declarations += "\n".join( + k.cpp_impl.split("{")[0].rstrip() + ";" for k in kernels + ) + dispatch_category_fn = f"dispatch_{family_name}_{cat_dt}_sm{cat_sm}" + declarations += ( + f"\n\ntemplate void {dispatch_category_fn}(T cb) {{\n" + ) + declarations += "\n".join( + f" cb({k.cpp_class}(), {k.name});" for k in kernels + ) + declarations += "\n}\n" + declarations += "\n" + dispatch_all += f""" + if (std::is_same::value && {cat_sm} <= cc && cc < {cat_sm_max}) {{ + {dispatch_category_fn}(cb); + }}""" + + declarations += f""" +template +void dispatch_{family_name}(T cb, int cc = 0) {{ +{dispatch_all} +}} +""" + declarations += f"#endif // {disable_def}\n" + Path(f"{family_name}.h").write_text(declarations) + + for f, f_kernels in implfile_to_kernels.items(): + impl_cu = f"#ifndef {disable_def}\n{cpp_file_header}" + for k in f_kernels: + impl_cu += k.cpp_impl + impl_cu += f"#endif // {disable_def}\n" + Path(f"{family_name}_{f}.cu").write_text(impl_cu) + + +write_decl_impl( + FwdKernel.get_all(), + "cutlassF", + impl_file="../kernel_forward.h", + disable_def="XFORMERS_MEM_EFF_ATTENTION_DISABLE_FORWARD", +) +write_decl_impl( + BwdKernel.get_all(), + "cutlassB", + impl_file="../kernel_backward.h", + disable_def="XFORMERS_MEM_EFF_ATTENTION_DISABLE_BACKWARD", +) diff --git a/xformers/csrc/attention/cuda/fmha/kernels/generate_kernels.sh b/xformers/csrc/attention/cuda/fmha/kernels/generate_kernels.sh deleted file mode 100755 index 3bffffc7de..0000000000 --- a/xformers/csrc/attention/cuda/fmha/kernels/generate_kernels.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash -set -ex -rm -f forward_*.cu backward_*.cu -IFS="," - -# BACKWARD -kernel="BACKWARD" -kernel_lower=`echo "\$kernel" | awk '{print tolower($0)}'` -for enable_dropout in "false" "true"; do - for aligned in "false" "true"; do - for maxk in 64 128 ""; do - for dtype_name in "f32" "f16" "bf16"; do - case "$dtype_name" in - "f32") dtype="float" ;; - "f16") dtype="cutlass::half_t" ;; - "bf16") dtype="cutlass::bfloat16_t" ;; - esac - [[ $aligned = "true" ]] && s="_aligned" || s="" - [[ $enable_dropout = "true" ]] && s="${s}_dropout" || s="${s}" - [[ $maxk = "" ]] && s="${s}" || s="${s}_k$maxk" - [[ $maxk = "" ]] && maxk_code="" || maxk_code=", $maxk" - FNAME="${kernel_lower}_${dtype_name}${s}.cu" - echo $FNAME - cat < $FNAME -// This file is auto-generated. 
See "generate_kernels.sh" -#include "backward.h" -EOF - for sm in 50 70 75 80; do - echo "INSTANTIATE_ATTENTION_KERNEL_${kernel}_SM${sm}($dtype, $aligned, $enable_dropout$maxk_code);" >> $FNAME - done; - done; - done; - done; -done; - -# FORWARD -kernel="FORWARD" -kernel_lower=`echo "\$kernel" | awk '{print tolower($0)}'` -for aligned in "false" "true"; do - [[ $aligned = "true" ]] && aligned_suffix="_aligned" || aligned_suffix="" - for dtype_name in "f32" "f16" "bf16"; do - case "$dtype_name" in - "f32") dtype="float" ;; - "f16") dtype="cutlass::half_t" ;; - "bf16") dtype="cutlass::bfloat16_t" ;; - esac - FNAME="${kernel_lower}_${dtype_name}${aligned_suffix}.cu" - echo $FNAME - cat < $FNAME -// This file is auto-generated. See "generate_kernels.sh" -#include "forward.h" -EOF - for sm in 50 70 75 80; do - echo "INSTANTIATE_ATTENTION_KERNEL_${kernel}_SM${sm}($dtype, $aligned, 32, 128, true);" >> $FNAME - echo "INSTANTIATE_ATTENTION_KERNEL_${kernel}_SM${sm}($dtype, $aligned, 32, 128, false);" >> $FNAME - echo "INSTANTIATE_ATTENTION_KERNEL_${kernel}_SM${sm}($dtype, $aligned, 64, 64, true);" >> $FNAME - done; - done; -done diff --git a/xformers/csrc/attention/cuda/fmha/pytorch_utils.h b/xformers/csrc/attention/cuda/fmha/pytorch_utils.h index b0ec5e9705..d3d9e9971b 100644 --- a/xformers/csrc/attention/cuda/fmha/pytorch_utils.h +++ b/xformers/csrc/attention/cuda/fmha/pytorch_utils.h @@ -1,3 +1,5 @@ +#pragma once + #include #include