
Commit

Less clutter.
Narsil committed Jul 22, 2024
1 parent 620416f commit 5829b78
Showing 2 changed files with 3 additions and 5 deletions.
2 changes: 2 additions & 0 deletions server/text_generation_server/models/__init__.py
@@ -757,6 +757,8 @@ def get_model(
                 default_dtype=torch.bfloat16,
                 trust_remote_code=trust_remote_code,
                 lora_adapter_ids=lora_adapter_ids,
+                # hidden_size / num_attention_heads is wrong in `google/gemma-2-9b-it`
+                head_size=config.head_dim,
             )
         elif sharded:
             raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Gemma2"))
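For context (an illustration, not part of the commit): the generic fallback is wrong here because `google/gemma-2-9b-it` ships an explicit `head_dim` that does not equal `hidden_size / num_attention_heads`. A minimal sketch, assuming the values from that model's published config.json:

# Sketch only; hidden_size/num_attention_heads/head_dim values are assumed
# from google/gemma-2-9b-it's config.json.
hidden_size = 3584
num_attention_heads = 16
head_dim = 256  # the head size the checkpoint's attention weights actually use

fallback = hidden_size // num_attention_heads
print(fallback, head_dim)  # 224 256 -- the naive derivation disagrees
assert fallback != head_dim  # hence head_size=config.head_dim is passed explicitly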
6 changes: 1 addition & 5 deletions server/text_generation_server/models/flash_causal_lm.py
@@ -925,11 +925,7 @@ def __init__(
         assert self.num_kv_heads > 0
 
         if head_size is None:
-            if getattr(config, "head_dim", None):
-                # hidden_size / num_attention_heads is wrong in `google/gemma-2-9b-it`
-                self.head_size = config.head_dim
-            else:
-                self.head_size = config.hidden_size // config.num_attention_heads
+            self.head_size = config.hidden_size // config.num_attention_heads
         else:
             self.head_size = head_size
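Net effect: the Gemma 2 special case moves out of FlashCausalLM.__init__ and into the model loader, which now passes head_size=config.head_dim explicitly. A minimal sketch of the resulting resolution logic (resolve_head_size is a hypothetical name for illustration; the real code runs inline in __init__):

# Hypothetical helper illustrating head-size resolution after this commit.
def resolve_head_size(config, head_size=None):
    if head_size is not None:
        # Model-specific callers (e.g. the Gemma 2 branch of get_model)
        # supply the correct value, such as config.head_dim.
        return head_size
    # Generic fallback for models where this derivation is correct.
    return config.hidden_size // config.num_attention_heads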

