Adding OLMo #1827

Merged · 12 commits · Nov 13, 2024
1 change: 1 addition & 0 deletions README.md
@@ -132,6 +132,7 @@ Every model is written from scratch to maximize performance and remove layers of
| Mixtral MoE | 8x7B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/mixtral-of-experts/) |
| Mistral | 7B, 123B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/announcing-mistral-7b/) |
| Nous-Hermes | 7B, 13B, 70B | NousResearch | [Org page](https://huggingface.co/NousResearch) |
| OLMo | 1B, 7B | Allen Institute for AI (AI2) | [OLMo project page](https://allenai.org/olmo) |
| OpenLLaMA | 3B, 7B, 13B | OpenLM Research | [Geng & Liu 2023](https://github.com/openlm-research/open_llama) |
| Phi 1.5 & 2 | 1.3B, 2.7B | Microsoft Research | [Li et al. 2023](https://arxiv.org/abs/2309.05463) |
| Phi 3 | 3.8B | Microsoft Research | [Abdin et al. 2024](https://arxiv.org/abs/2404.14219) |
71 changes: 69 additions & 2 deletions litgpt/config.py
@@ -149,12 +149,21 @@ def mlp_class(self) -> Type:
    @property
    def norm_class(self) -> Type:
        # `self.norm_class_name` cannot be the type to keep the config serializable
        from functools import partial

        if self.norm_class_name == "RMSNorm":
            from litgpt.model import RMSNorm

            return partial(RMSNorm, add_unit_offset="Gemma" in self.name)

        if self.norm_class_name == "LayerNorm" and "OLMo" in self.name:
            # this makes it equivalent to `torch.nn.functional.layer_norm`,
            # which is what OLMo uses; see the Table 5 caption in the OLMo paper:
            # https://aclanthology.org/2024.acl-long.841
            return partial(torch.nn.LayerNorm, elementwise_affine=False)

        return getattr(torch.nn, self.norm_class_name)
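As a side note (not part of the diff), a minimal sketch of why the non-affine `torch.nn.LayerNorm` matches the parameter-free `torch.nn.functional.layer_norm` that OLMo uses; the tensor shape here is a hypothetical example:

```python
import torch
import torch.nn.functional as F

# With elementwise_affine=False there is no learnable weight/bias,
# so the module reduces to plain functional layer norm over n_embd.
x = torch.randn(2, 8, 2048)  # hypothetical (batch, seq, n_embd)
ln = torch.nn.LayerNorm(2048, elementwise_affine=False)

assert torch.allclose(ln(x), F.layer_norm(x, (2048,)))
```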


@@ -1027,6 +1036,64 @@ def norm_class(self) -> Type:
rope_adjustments=dict(factor=8.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_seq_len=8192)
),
)

#################
# Allen AI OLMo
#################
olmo = [
    # https://huggingface.co/allenai/OLMo-1B-hf/blob/main/config.json
    dict(
        name="OLMo-1b-hf",
        hf_config=dict(org="allenai", name="OLMo-1b-hf"),
        vocab_size=50280,
        padded_vocab_size=50304,
        block_size=2048,
        n_embd=2048,
        n_layer=16,
        n_head=16,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        norm_class_name="LayerNorm",
        mlp_class_name="LLaMAMLP",
        intermediate_size=8192,
    ),
    # https://huggingface.co/allenai/OLMo-7B-hf/blob/main/config.json
    dict(
        name="OLMo-7b-hf",
        hf_config=dict(org="allenai", name="OLMo-7b-hf"),
        vocab_size=50280,
        padded_vocab_size=50304,
        block_size=2048,
        n_layer=32,
        n_head=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        norm_class_name="LayerNorm",
        mlp_class_name="LLaMAMLP",
        intermediate_size=11008,
    ),
    # https://huggingface.co/allenai/OLMo-7B-Instruct-hf/blob/main/config.json
    dict(
        name="OLMo-7b-Instruct-hf",
        hf_config=dict(org="allenai", name="OLMo-7b-Instruct-hf"),
        vocab_size=50280,
        padded_vocab_size=50304,
        block_size=2048,
        n_layer=32,
        n_head=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        norm_class_name="LayerNorm",
        mlp_class_name="LLaMAMLP",
        intermediate_size=11008,
    ),
]

configs.extend(olmo)
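A quick sanity check one might run against the new entries, assuming the existing `Config.from_name` helper in `litgpt.config` (this snippet is illustrative, not part of the diff):

```python
from litgpt.config import Config

cfg = Config.from_name("OLMo-7b-hf")
# Values taken from the dict above: 32 layers/heads, LLaMAMLP with 11008 hidden units,
# and the non-affine LayerNorm branch added in this PR.
print(cfg.n_layer, cfg.n_head, cfg.intermediate_size)  # 32 32 11008
print(cfg.norm_class_name, cfg.mlp_class_name)         # LayerNorm LLaMAMLP
```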

###############
# Google Gemma
###############
8 changes: 8 additions & 0 deletions litgpt/prompts.py
@@ -327,6 +327,11 @@ def apply(self, prompt: str, **kwargs: str) -> str:
return f"<|prompt|>{prompt}</s><|answer|>"


class OLMo(PromptStyle):
    def apply(self, prompt: str, **kwargs: str) -> str:
        return f"<|endoftext|><|user|>\n{prompt}\n<|assistant|>\n"


# Maps prompt style names to PromptStyle classes
prompt_styles: Dict[str, Type[PromptStyle]] = {
# Dataset-specific prompt styles
@@ -354,6 +359,7 @@ def apply(self, prompt: str, **kwargs: str) -> str:
"gemma": Gemma,
"h2oai": H2Oai,
"llama3": Llama3,
"olmo": OLMo,
}


@@ -400,6 +406,8 @@ def model_name_to_prompt_style(model_name: str) -> PromptStyle:
        return Gemma()
    if re.search("Danube2.*-chat", model_name):
        return H2Oai()
    if re.search(r"OLMo.*-hf", model_name):
        return OLMo()
    return Default()
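An illustrative usage sketch (the prompt text is hypothetical) showing how the new style is selected and applied:

```python
from litgpt.prompts import OLMo, model_name_to_prompt_style

style = model_name_to_prompt_style("OLMo-7b-Instruct-hf")
assert isinstance(style, OLMo)

# Wraps the user turn in OLMo's chat markers and leaves the assistant turn open.
print(style.apply("What is a language model?"))
# <|endoftext|><|user|>
# What is a language model?
# <|assistant|>
```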


5 changes: 4 additions & 1 deletion litgpt/tokenizer.py
@@ -130,7 +130,10 @@ def encode(

        if eos and (not tokens or tokens[-1] != self.eos_id):
            tokens = tokens + [self.eos_id]
        # if the processor misbehaves and adds `eos` token no matter what
        elif tokens and tokens[-1] == self.eos_id:
            tokens = tokens[:-1]

        if max_length > 0:
            tokens = tokens[:max_length]
        return torch.tensor(tokens, dtype=torch.int, device=device)
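A minimal, self-contained illustration of the new branch (the token ids and `EOS_ID` value are hypothetical stand-ins for real tokenizer output):

```python
EOS_ID = 0  # hypothetical stand-in for the tokenizer's eos id

def fix_eos(tokens: list[int], eos: bool) -> list[int]:
    # Mirrors the logic above: append eos when requested and missing;
    # otherwise, if the last token is already eos, drop it (guards against
    # a processor that appends eos unasked).
    if eos and (not tokens or tokens[-1] != EOS_ID):
        tokens = tokens + [EOS_ID]
    elif tokens and tokens[-1] == EOS_ID:
        tokens = tokens[:-1]
    return tokens

assert fix_eos([5, 7], eos=True) == [5, 7, EOS_ID]   # eos requested: appended once
assert fix_eos([5, 7, EOS_ID], eos=False) == [5, 7]  # unrequested eos: stripped
```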
4 changes: 4 additions & 0 deletions tutorials/download_model_weights.md
@@ -27,6 +27,7 @@ LitGPT supports a variety of LLM architectures with publicly available weights.
| Mixtral MoE | 8x7B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/mixtral-of-experts/) |
| Mistral | 7B, 123B | Mistral AI | [Mistral AI 2023](https://mistral.ai/news/announcing-mistral-7b/) |
| Nous-Hermes | 7B, 13B, 70B | NousResearch | [Org page](https://huggingface.co/NousResearch) |
| OLMo | 1B, 7B | Allen Institute for AI (AI2) | [OLMo project page](https://allenai.org/olmo) |
| OpenLLaMA | 3B, 7B, 13B | OpenLM Research | [Geng & Liu 2023](https://github.com/openlm-research/open_llama) |
| Phi 1.5 & 2 | 1.3B, 2.7B | Microsoft Research | [Li et al. 2023](https://arxiv.org/abs/2309.05463) |
| Phi 3 & 3.5 | 3.8B | Microsoft Research | [Abdin et al. 2024](https://arxiv.org/abs/2404.14219) |
@@ -54,6 +55,9 @@ litgpt download list
The output is shown below:

```
allenai/OLMo-1b-hf
allenai/OLMo-7b-hf
allenai/OLMo-7b-Instruct-hf
codellama/CodeLlama-13b-hf
codellama/CodeLlama-13b-Instruct-hf
codellama/CodeLlama-13b-Python-hf