Adding OLMo #1827

Merged · 12 commits · Nov 13, 2024
1 change: 1 addition & 0 deletions README.md
@@ -132,6 +132,7 @@ Every model is written from scratch to maximize performance and remove layers of
| Mixtral MoE | 8x7B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/mixtral-of-experts/) |
| Mistral | 7B, 123B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/announcing-mistral-7b/) |
| Nous-Hermes | 7B, 13B, 70B | NousResearch | [Org page](https://huggingface.co/NousResearch) |
| OLMo | 1B, 7B | Allen Institute for AI (AI2) | [OLMo project page](https://allenai.org/olmo) |
| OpenLLaMA | 3B, 7B, 13B | OpenLM Research | [Geng & Liu 2023](https://github.com/openlm-research/open_llama) |
| Phi 1.5 & 2 | 1.3B, 2.7B | Microsoft Research | [Li et al. 2023](https://arxiv.org/abs/2309.05463) |
| Phi 3 | 3.8B | Microsoft Research | [Abdin et al. 2024](https://arxiv.org/abs/2404.14219) |
71 changes: 69 additions & 2 deletions litgpt/config.py
@@ -149,12 +149,21 @@ def mlp_class(self) -> Type:
    @property
    def norm_class(self) -> Type:
        # `self.norm_class_name` cannot be the type to keep the config serializable
        from functools import partial

        if self.norm_class_name == "RMSNorm":
            from litgpt.model import RMSNorm

            return partial(RMSNorm, add_unit_offset="Gemma" in self.name)

        if self.norm_class_name == "LayerNorm" and "OLMo" in self.name:
            # this makes it equivalent to `torch.nn.functional.layer_norm`,
            # which is what OLMo uses; see the Table 5 caption in the OLMo paper:
            # https://aclanthology.org/2024.acl-long.841
            return partial(torch.nn.LayerNorm, elementwise_affine=False)

        return getattr(torch.nn, self.norm_class_name)
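As a side note (not part of the diff), a minimal sketch of why the non-affine `torch.nn.LayerNorm` matches the parameter-free `torch.nn.functional.layer_norm` that OLMo uses; the tensor shape here is a hypothetical example:

```python
import torch
import torch.nn.functional as F

# With elementwise_affine=False there is no learnable weight/bias,
# so the module reduces to plain functional layer norm over n_embd.
x = torch.randn(2, 8, 2048)  # hypothetical (batch, seq, n_embd)
ln = torch.nn.LayerNorm(2048, elementwise_affine=False)

assert torch.allclose(ln(x), F.layer_norm(x, (2048,)))
```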


@@ -1027,6 +1036,64 @@ def norm_class(self) -> Type:
rope_adjustments=dict(factor=8.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_seq_len=8192)
),
)

#################
# Allen AI OLMo
#################
olmo = [
    # https://huggingface.co/allenai/OLMo-1B-hf/blob/main/config.json
    dict(
        name="OLMo-1b-hf",
        hf_config=dict(org="allenai", name="OLMo-1b-hf"),
        vocab_size=50280,
        padded_vocab_size=50304,
        block_size=2048,
        n_embd=2048,
        n_layer=16,
        n_head=16,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        norm_class_name="LayerNorm",
        mlp_class_name="LLaMAMLP",
        intermediate_size=8192,
    ),
    # https://huggingface.co/allenai/OLMo-7B-hf/blob/main/config.json
    dict(
        name="OLMo-7b-hf",
        hf_config=dict(org="allenai", name="OLMo-7b-hf"),
        vocab_size=50280,
        padded_vocab_size=50304,
        block_size=2048,
        n_layer=32,
        n_head=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        norm_class_name="LayerNorm",
        mlp_class_name="LLaMAMLP",
        intermediate_size=11008,
    ),
    # https://huggingface.co/allenai/OLMo-7B-Instruct-hf/blob/main/config.json
    dict(
        name="OLMo-7b-Instruct-hf",
        hf_config=dict(org="allenai", name="OLMo-7b-Instruct-hf"),
        vocab_size=50280,
        padded_vocab_size=50304,
        block_size=2048,
        n_layer=32,
        n_head=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        norm_class_name="LayerNorm",
        mlp_class_name="LLaMAMLP",
        intermediate_size=11008,
    ),
]

configs.extend(olmo)
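A quick sanity check one might run against the new entries, assuming the existing `Config.from_name` helper in `litgpt.config` (this snippet is illustrative, not part of the diff):

```python
from litgpt.config import Config

cfg = Config.from_name("OLMo-7b-hf")
# Values taken from the dict above: 32 layers/heads, LLaMAMLP with 11008 hidden units,
# and the non-affine LayerNorm branch added in this PR.
print(cfg.n_layer, cfg.n_head, cfg.intermediate_size)  # 32 32 11008
print(cfg.norm_class_name, cfg.mlp_class_name)         # LayerNorm LLaMAMLP
```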

###############
# Google Gemma
###############
8 changes: 8 additions & 0 deletions litgpt/prompts.py
@@ -327,6 +327,11 @@ def apply(self, prompt: str, **kwargs: str) -> str:
return f"<|prompt|>{prompt}</s><|answer|>"


class OLMo(PromptStyle):
    def apply(self, prompt: str, **kwargs: str) -> str:
        return f"<|endoftext|><|user|>\n{prompt}\n<|assistant|>\n"


# Maps prompt style names to PromptStyle classes
prompt_styles: Dict[str, Type[PromptStyle]] = {
# Dataset-specific prompt styles
@@ -354,6 +359,7 @@ def apply(self, prompt: str, **kwargs: str) -> str:
"gemma": Gemma,
"h2oai": H2Oai,
"llama3": Llama3,
"olmo": OLMo,
}


@@ -400,6 +406,8 @@ def model_name_to_prompt_style(model_name: str) -> PromptStyle:
        return Gemma()
    if re.search("Danube2.*-chat", model_name):
        return H2Oai()
    if re.search(r"OLMo.*-hf", model_name):
        return OLMo()
    return Default()
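An illustrative usage sketch (the prompt text is hypothetical) showing how the new style is selected and applied:

```python
from litgpt.prompts import OLMo, model_name_to_prompt_style

style = model_name_to_prompt_style("OLMo-7b-Instruct-hf")
assert isinstance(style, OLMo)

# Wraps the user turn in OLMo's chat markers and leaves the assistant turn open.
print(style.apply("What is a language model?"))
# <|endoftext|><|user|>
# What is a language model?
# <|assistant|>
```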


5 changes: 4 additions & 1 deletion litgpt/tokenizer.py
@@ -130,7 +130,10 @@ def encode(

        if eos and (not tokens or tokens[-1] != self.eos_id):
            tokens = tokens + [self.eos_id]
        # if the processor misbehaves and adds `eos` token no matter what
        elif tokens and tokens[-1] == self.eos_id:
            tokens = tokens[:-1]

        if max_length > 0:
            tokens = tokens[:max_length]
        return torch.tensor(tokens, dtype=torch.int, device=device)
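A minimal, self-contained illustration of the new branch (the token ids and `EOS_ID` value are hypothetical stand-ins for real tokenizer output):

```python
EOS_ID = 0  # hypothetical stand-in for the tokenizer's eos id

def fix_eos(tokens: list[int], eos: bool) -> list[int]:
    # Mirrors the logic above: append eos when requested and missing;
    # otherwise, if the last token is already eos, drop it (guards against
    # a processor that appends eos unasked).
    if eos and (not tokens or tokens[-1] != EOS_ID):
        tokens = tokens + [EOS_ID]
    elif tokens and tokens[-1] == EOS_ID:
        tokens = tokens[:-1]
    return tokens

assert fix_eos([5, 7], eos=True) == [5, 7, EOS_ID]   # eos requested: appended once
assert fix_eos([5, 7, EOS_ID], eos=False) == [5, 7]  # unrequested eos: stripped
```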
4 changes: 4 additions & 0 deletions tutorials/download_model_weights.md
@@ -27,6 +27,7 @@ LitGPT supports a variety of LLM architectures with publicly available weights.
| Mixtral MoE | 8x7B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/mixtral-of-experts/) |
| Mistral | 7B, 123B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/announcing-mistral-7b/) |
| Nous-Hermes | 7B, 13B, 70B | NousResearch | [Org page](https://huggingface.co/NousResearch) |
| OLMo | 1B, 7B | Allen Institute for AI (AI2) | [OLMo project page](https://allenai.org/olmo) |
| OpenLLaMA | 3B, 7B, 13B | OpenLM Research | [Geng & Liu 2023](https://github.com/openlm-research/open_llama) |
| Phi 1.5 & 2 | 1.3B, 2.7B | Microsoft Research | [Li et al. 2023](https://arxiv.org/abs/2309.05463) |
| Phi 3 & 3.5 | 3.8B | Microsoft Research | [Abdin et al. 2024](https://arxiv.org/abs/2404.14219) |
@@ -54,6 +55,9 @@ litgpt download list
The output is shown below:

```
allenai/OLMo-1b-hf
allenai/OLMo-7b-hf
allenai/OLMo-7b-Instruct-hf
codellama/CodeLlama-13b-hf
codellama/CodeLlama-13b-Instruct-hf
codellama/CodeLlama-13b-Python-hf