[test] enable flash attention for benchmark by default
botbw committed Jun 25, 2024
1 parent eecf9a0 commit a6b602e
Showing 1 changed file with 7 additions and 2 deletions.
examples/language/llama/benchmark.py (7 additions, 2 deletions)
@@ -229,8 +229,13 @@ def empty_init():
         init_kwargs["empty_init"] = False
 
     with init_ctx:
-        model = AutoModelForCausalLM.from_config(config, trust_remote_code=True, **init_kwargs)
-
+        model = AutoModelForCausalLM.from_config(
+            config,
+            trust_remote_code=True,
+            **init_kwargs,
+            attn_implementation="flash_attention_2",
+            torch_dtype=torch.float16,
+        )
     if args.grad_checkpoint:
         model.gradient_checkpointing_enable()
     if config.model_type == "chatglm":
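
Note (not part of the commit): the new keyword arguments are passed unconditionally, so the benchmark now assumes flash-attn is installed and fp16-capable CUDA hardware is available. Below is a minimal, hypothetical sketch of how a caller could keep these defaults while falling back to the stock attention implementation when flash-attn is missing; the model id, guard logic, and variable names are illustrative assumptions, not taken from benchmark.py.

    # Hypothetical sketch: prefer flash_attention_2 when flash-attn is importable
    # and a GPU is present, otherwise let transformers pick its default attention.
    import importlib.util

    import torch
    from transformers import AutoConfig, AutoModelForCausalLM

    config = AutoConfig.from_pretrained("meta-llama/Llama-2-7b-hf")  # assumed model id

    init_kwargs = {}
    if importlib.util.find_spec("flash_attn") is not None and torch.cuda.is_available():
        # FlashAttention-2 requires a CUDA build of flash-attn and fp16/bf16 weights.
        init_kwargs["attn_implementation"] = "flash_attention_2"
        init_kwargs["torch_dtype"] = torch.float16

    model = AutoModelForCausalLM.from_config(config, trust_remote_code=True, **init_kwargs)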

0 comments on commit a6b602e
