fMHA dispatch: Fix dispatch to Flash when using BlockDiag attn mask
ghstack-source-id: f8a54005dc163203a6fa54495be02f6edcedfa3c
Pull Request resolved: https://github.com/fairinternal/xformers/pull/461

__original_commit__ = fairinternal/xformers@471967c9ffc6cfc3205c72237d3013453ace6c93
danthe3rd authored and xFormers Bot committed Feb 9, 2023
1 parent 1637b24 commit 44bc216
Showing 1 changed file with 3 additions and 0 deletions.
xformers/ops/fmha/dispatch.py (+3, -0)
@@ -8,12 +8,15 @@
 from typing import List, Type, TypeVar
 
 from . import cutlass, flash, small_k, triton
+from .attn_bias import BlockDiagonalMask
 from .common import AttentionBwOpBase, AttentionFwOpBase, Inputs
 
 
 def _is_cutlass_fwd_faster_than_flash(inp: Inputs) -> bool:
     # Very small batch sizes - if batch size specified
     batch_size, q_len, num_heads, k = inp.query.shape
     if isinstance(inp.attn_bias, BlockDiagonalMask):
+        batch_size *= len(inp.attn_bias.k_seqinfo.cu_seqlen_py)
     if batch_size > 0:
         threads_flash = batch_size * num_heads
         threads_cutlass = threads_flash * (q_len // 64)
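For context, a brief usage sketch of the workload this dispatch fix targets. It assumes the public xformers API referenced by the diff (BlockDiagonalMask.from_seqlens and xformers.ops.memory_efficient_attention); the shapes and sequence lengths below are illustrative and not taken from the commit.

# Hypothetical sketch (not part of the commit): several variable-length
# sequences packed into a single batch-1 tensor via BlockDiagonalMask.
import torch
from xformers.ops import memory_efficient_attention
from xformers.ops.fmha.attn_bias import BlockDiagonalMask

num_heads, head_dim = 8, 64
seqlens = [128, 256, 64, 512]  # four independent sequences

# Packed layout: all sequences are concatenated along the sequence axis,
# so the apparent batch size is 1 and the query length is sum(seqlens).
total = sum(seqlens)
q = torch.randn(1, total, num_heads, head_dim, device="cuda", dtype=torch.half)
k = torch.randn_like(q)
v = torch.randn_like(q)

attn_bias = BlockDiagonalMask.from_seqlens(seqlens)

# Before this commit the dispatcher saw batch_size == 1 here and could judge
# the CUTLASS forward faster; with the fix the effective batch size is scaled
# by the length of the mask's cumulative sequence-length list, so the Flash
# kernel is more likely to be selected for this kind of packed input.
out = memory_efficient_attention(q, k, v, attn_bias=attn_bias)

Because the mask's cumulative sequence-length list (k_seqinfo.cu_seqlen_py in the diff) grows with the number of packed sequences, the patched heuristic credits Flash with the parallelism of all packed sequences instead of treating the input as a single tiny batch.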
