FlashAttention is a fast, memory-efficient, exact attention algorithm that fuses the attention operations (score computation, softmax, and the weighted sum over values) into a single kernel, so the full N×N attention matrix is never materialized in global memory. Tiny Flash Attention is a minimal implementation that expresses the forward pass in roughly 20 lines of CUDA.
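The key idea behind the fusion is the online-softmax recurrence: as each key/value pair is processed, a running maximum and running normalizer are updated and the partial output is rescaled, so softmax and the output accumulation happen in the same pass as the dot products. The sketch below illustrates this under stated assumptions: the kernel name `attention_forward`, a row-major `[N, d]` layout, and one thread per query row are all choices made here for clarity, not the tiled shared-memory scheme of the full FlashAttention algorithm, but the fusion it demonstrates is the same.

```cuda
// Minimal fused attention forward sketch: one thread per query row.
// The online-softmax recurrence means the N x N score matrix is never
// written to global memory. Names and layout are illustrative assumptions.
#include <cuda_runtime.h>
#include <math.h>

// Q, K, V, O are row-major [N, d]; d is assumed to be at most 128.
__global__ void attention_forward(const float *Q, const float *K,
                                  const float *V, float *O,
                                  int N, int d) {
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= N) return;

    const float scale = rsqrtf((float)d);
    float m = -INFINITY;   // running max of scores seen so far
    float l = 0.0f;        // running sum of exp(score - m)

    // Running (unnormalized, rescaled) output accumulator.
    float acc[128];
    for (int k = 0; k < d; ++k) acc[k] = 0.0f;

    for (int j = 0; j < N; ++j) {
        // score = (q . k_j) / sqrt(d)
        float s = 0.0f;
        for (int k = 0; k < d; ++k)
            s += Q[row * d + k] * K[j * d + k];
        s *= scale;

        // Online softmax update: rescale previous state when the max grows.
        float m_new = fmaxf(m, s);
        float correction = expf(m - m_new);  // 0 on the first iteration
        float p = expf(s - m_new);
        l = l * correction + p;
        for (int k = 0; k < d; ++k)
            acc[k] = acc[k] * correction + p * V[j * d + k];
        m = m_new;
    }

    // Final normalization by the softmax denominator.
    for (int k = 0; k < d; ++k)
        O[row * d + k] = acc[k] / l;
}
```

A launch such as `attention_forward<<<(N + 127) / 128, 128>>>(Q, K, V, O, N, d)` would run one thread per query row; the production algorithm instead tiles K and V through shared memory and parallelizes across the head dimension, but the per-row state (`m`, `l`, `acc`) and its rescaling rule are identical.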
@misc{dao2022flashattentionfastmemoryefficientexact,
      title={FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness},
      author={Tri Dao and Daniel Y. Fu and Stefano Ermon and Atri Rudra and Christopher Ré},
      year={2022},
      eprint={2205.14135},
      archivePrefix={arXiv},
      primaryClass={cs.LG},
      url={https://arxiv.org/abs/2205.14135},
}