
Commit d882d18

[example] reuse flash attn patch (#5400)
1 parent 95c21e3 commit d882d18

4 files changed: +7, -93 lines


examples/language/llama2/attn.py (-84 lines)

This file was deleted.

examples/language/llama2/attn.py (+1 line)

@@ -0,0 +1 @@
+../../../applications/Colossal-LLaMA-2/colossal_llama2/utils/flash_attention_patch.py
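
The replacement attn.py contains nothing but a relative path, which is how a symlink is recorded in a diff: the llama2 example now reuses the flash attention patch shipped with Colossal-LLaMA-2 instead of carrying its own 84-line copy, so "from attn import replace_with_flash_attention" resolves to that shared module. For orientation, below is a minimal sketch of how such a monkey patch is typically structured; only the name replace_with_flash_attention comes from this commit, and the stubbed forward and the LlamaAttention target are assumptions, not the actual implementation.

# Hypothetical sketch only -- not the contents of flash_attention_patch.py.
import torch
from transformers.models.llama.modeling_llama import LlamaAttention


def _flash_attention_forward(self, hidden_states, *args, **kwargs):
    # A real patch would recompute attention with flash-attn kernels
    # (e.g. flash_attn.flash_attn_func) instead of the eager implementation.
    raise NotImplementedError("illustrative stub")


def replace_with_flash_attention(model: torch.nn.Module) -> None:
    # Walk the module tree and rebind forward on every LlamaAttention block.
    for module in model.modules():
        if isinstance(module, LlamaAttention):
            module.forward = _flash_attention_forward.__get__(module, LlamaAttention)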

examples/language/llama2/benchmark.py (+2 -3)

@@ -3,7 +3,7 @@
 from contextlib import nullcontext
 
 import torch
-from attn import SUPPORT_FLASH, replace_xformers
+from attn import replace_with_flash_attention
 from data_utils import RandomDataset
 from model_utils import format_numel_str, get_model_numel
 from performance_evaluator import PerformanceEvaluator
@@ -188,8 +188,7 @@ def empty_init():
         model.gradient_checkpointing_enable()
 
     if args.xformers:
-        assert SUPPORT_FLASH, "Use flash attention while xfomers is not installed"
-        replace_xformers(model)
+        replace_with_flash_attention(model)
 
     model_numel = get_model_numel(model)
     coordinator.print_on_master(f"Model params: {format_numel_str(model_numel)}")

examples/language/llama2/finetune.py (+2 -3)

@@ -9,7 +9,7 @@
 import torch
 import torch.distributed as dist
 import torch.nn as nn
-from attn import SUPPORT_XFORMERS, replace_xformers
+from attn import replace_with_flash_attention
 from data_utils import load_json, prepare_dataloader, save_json
 from datasets import load_dataset
 from torch.optim import Optimizer
@@ -219,8 +219,7 @@ def main():
     if args.grad_checkpoint:
         model.gradient_checkpointing_enable()
     if args.flash_attention:
-        assert SUPPORT_XFORMERS, "Use flash attention while xfomers is not installed"
-        replace_xformers(model)
+        replace_with_flash_attention(model)
 
     model_numel = get_model_numel(model)
     coordinator.print_on_master(f"Model params: {format_numel_str(model_numel)}")

examples/language/llama2/pretrain.py (+2 -3)

@@ -8,7 +8,7 @@
 import torch
 import torch.distributed as dist
 import torch.nn as nn
-from attn import SUPPORT_XFORMERS, replace_xformers
+from attn import replace_with_flash_attention
 from data_utils import load_json, prepare_dataloader, save_json
 from datasets import load_dataset
 from torch.optim import Optimizer
@@ -238,8 +238,7 @@ def main():
     if args.grad_checkpoint:
         model.gradient_checkpointing_enable()
     if args.flash_attention:
-        assert SUPPORT_XFORMERS, "Use flash attention while xfomers is not installed"
-        replace_xformers(model)
+        replace_with_flash_attention(model)
 
     model_numel = get_model_numel(model)
     coordinator.print_on_master(f"Model params: {format_numel_str(model_numel)}")
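
With the shared patch in place, benchmark.py, finetune.py and pretrain.py all follow the same call pattern: import the helper through the symlinked attn.py and apply it behind a CLI switch, with the old SUPPORT_FLASH / SUPPORT_XFORMERS assert dropped from the call sites. A condensed sketch of that pattern follows, with the argument parsing simplified and the model construction elided; whether the flags are plain store_true switches is an assumption.

import argparse

import torch
from attn import replace_with_flash_attention  # resolves to flash_attention_patch.py via the symlinked attn.py

parser = argparse.ArgumentParser()
parser.add_argument("--flash_attention", action="store_true")  # benchmark.py names this switch --xformers
args = parser.parse_args()

model: torch.nn.Module = ...  # each script builds its LlamaForCausalLM here (elided)

if args.flash_attention:
    replace_with_flash_attention(model)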
