This repository has been archived by the owner on Oct 16, 2023. It is now read-only.

match checkpoint for opt #95

Merged: 1 commit, Jul 13, 2022
energonai/model/__init__.py (5 changes: 4 additions & 1 deletion)
@@ -1 +1,4 @@
-from .model_factory import gpt2_8B, gpt3, hf_gpt2
+from .model_factory import gpt2_small, gpt2_large, gpt2_8B, gpt3
+from .model_factory import hf_gpt2
+from .model_factory import bert_small, bert_large, bert_8B, bert_175B
+from .model_factory import opt_125M, opt_30B, opt_66B
energonai/model/attention.py (3 changes: 2 additions & 1 deletion)
@@ -47,7 +47,8 @@ def _split_heads(self, tensor, num_heads, attn_head_size):

     def forward(self,
                 hidden_states,
-                attention_mask=None):
+                attention_mask=None,
+                seq_lens=None):

         if self.fused_qkv:
             qkv = self.query_key_value(hidden_states)
energonai/model/endecoder.py (2 changes: 1 addition & 1 deletion)
@@ -42,7 +42,7 @@ def __init__(self,
                             activation = activation,
                             dtype = dtype,
                             bias = bias)
-    def forward(self, hidden_states, attention_mask=None):
+    def forward(self, hidden_states, attention_mask=None, seq_lens=None):

         if not self.apply_post_layernorm:
             residual = hidden_states
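Note on the seq_lens additions in attention.py and endecoder.py: the parameter is only threaded through the forward signatures for now (the block call in model_factory.py below still leaves it commented out as `# seq_lens`). A toy, self-contained sketch of why this stays backward compatible; the function and the truncation behaviour here are purely illustrative and are not the energonai modules:

```python
def forward(hidden_states, attention_mask=None, seq_lens=None):
    # seq_lens defaults to None, so every existing call site keeps working unchanged.
    if seq_lens is None:
        return hidden_states                                  # previous behaviour
    return [h[:n] for h, n in zip(hidden_states, seq_lens)]   # hypothetical use of per-sample lengths

print(forward([[1, 2, 3, 0]]))                 # old-style call, still valid
print(forward([[1, 2, 3, 0]], seq_lens=[3]))   # new-style call passing sequence lengths
```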
energonai/model/model_factory.py (50 changes: 43 additions & 7 deletions)
@@ -65,7 +65,7 @@ def __init__(self,
         self.blocks = nn.ModuleList()
         self.pp_rank = gpc.get_local_rank(ParallelMode.PIPELINE) if is_using_pp() else 0
         for id_ in range(depth):
-            self.blocks.add_module(f'blk_{id_ + self.pp_rank * depth}',
+            self.blocks.add_module(f'{id_ + self.pp_rank * depth}',
                                    Block1D(hidden_size=hidden_size,
                                            num_heads=num_heads,
                                            mlp_ratio=mlp_ratio,
@@ -94,7 +94,7 @@ def forward(self, hidden_states=None, input_ids=None, attention_mask=None, seq_l
             attention_mask = (1.0 - attention_mask) * -10000.0

         for block in self.blocks:
-            hidden_states = block(hidden_states, attention_mask)
+            hidden_states = block(hidden_states, attention_mask) # seq_lens

         if self.last:
             hidden_states = self.head(self.norm(hidden_states))
@@ -157,6 +157,9 @@ def create_pipeline_model(depth:int = 48,
     if model_kwargs["model_name"] == "hf_gpt2":
         from energonai.utils.checkpointing_hf_gpt2 import load_checkpoint
         load_checkpoint(model_kwargs["checkpoint"], model, **model_kwargs)
+    if model_kwargs["model_name"] == "opt":
+        from energonai.utils.checkpointing_opt import load_checkpoint
+        load_checkpoint(model_kwargs["checkpoint"], model, **model_kwargs)

     return model

@@ -205,14 +208,47 @@ def bert_175B(**kwargs):
     model_kwargs = dict(hidden_size=12288, depth=96, num_heads=96, is_decoder = False, **kwargs)
     return create_pipeline_model(**model_kwargs)

+def opt_125M(**kwargs):
+    model_kwargs = dict(vocab_size=50272,
+                        hidden_size=768,
+                        depth=12,
+                        max_seq_len=2050,
+                        num_heads=12,
+                        activation=nn.functional.relu,
+                        is_decoder = True,
+                        fused_qkv=False,
+                        model_name = "opt",
+                        **kwargs)
+    return create_pipeline_model(**model_kwargs)
+
 def opt_30B(**kwargs):
-    model_kwargs = dict(hidden_size=12288, depth=96, num_heads=96, is_decoder = True, **kwargs)
+    model_kwargs = dict(vocab_size=50272,
+                        hidden_size=7168,
+                        depth=48,
+                        max_seq_len=2050,
+                        num_heads=56,
+                        activation=nn.functional.relu,
+                        is_decoder = True,
+                        fused_qkv=False,
+                        model_name = "opt",
+                        **kwargs)
     return create_pipeline_model(**model_kwargs)

 def opt_66B(**kwargs):
-    model_kwargs = dict(hidden_size=12288, depth=96, num_heads=96, is_decoder = True, **kwargs)
+    model_kwargs = dict(vocab_size=50272,
+                        hidden_size=9216,
+                        depth=64,
+                        max_seq_len=2050,
+                        num_heads=72,
+                        activation=nn.functional.relu,
+                        is_decoder = True,
+                        fused_qkv=False,
+                        model_name = "opt",
+                        **kwargs)
     return create_pipeline_model(**model_kwargs)

-def opt_175B(**kwargs):
-    model_kwargs = dict(hidden_size=12288, depth=96, num_heads=96, is_decoder = True, **kwargs)
-    return create_pipeline_model(**model_kwargs)
+
+
+# def opt_175B(**kwargs):
+#     model_kwargs = dict(hidden_size=12288, depth=96, num_heads=96, activation=nn.functional.relu, is_decoder = True, **kwargs)
+#     return create_pipeline_model(**model_kwargs)
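For readers checking the new OPT configurations against the published model sizes, a quick back-of-the-envelope parameter count (illustrative only; `approx_params` is local to this sketch and not part of energonai, and the formula ignores biases and layer norms):

```python
# Rough sanity check that the configs added above land near their advertised sizes.
def approx_params(hidden_size, depth, vocab_size=50272, max_seq_len=2050):
    transformer = 12 * depth * hidden_size ** 2           # ~4h^2 attention + ~8h^2 MLP per layer
    embeddings = (vocab_size + max_seq_len) * hidden_size  # token + position tables
    return transformer + embeddings

for name, h, d in [("opt_125M", 768, 12), ("opt_30B", 7168, 48), ("opt_66B", 9216, 64)]:
    print(f"{name}: ~{approx_params(h, d) / 1e9:.2f}B parameters")
```

The printed totals come out to roughly 0.13B, 30B, and 66B, matching the function names.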
energonai/utils/checkpointing_hf_gpt2.py (6 changes: 3 additions & 3 deletions)
@@ -300,7 +300,7 @@ def processing_HF_GPT(state_dict: OrderedDict):
         if judge_t(new_k):
             new_v = torch.transpose(new_v, 0, 1)
         if "attn.query_key_value.weight" in new_k:
-            num_ = re.search(r"blocks\.blk_\d+?\.", new_k)
+            num_ = re.search(r"blocks\.\d+?\.", new_k)
             if num_:
                 prefix = num_.group()
             else:
@@ -314,7 +314,7 @@ def processing_HF_GPT(state_dict: OrderedDict):
             new_dict[prefix + "attn.key_.weight"] = k_
             new_dict[prefix + "attn.value_.weight"] = v_
         elif "attn.query_key_value.bias" in new_k:
-            num_ = re.search(r"blocks\.blk_\d+?\.", new_k)
+            num_ = re.search(r"blocks\.\d+?\.", new_k)
             if num_:
                 prefix = num_.group()
             else:
@@ -334,7 +334,7 @@ def processing_HF_GPT(state_dict: OrderedDict):

 def id_map(matched):
     value = matched.group('value')
-    return "blocks.blk_{}.".format(value)
+    return "blocks.{}.".format(value)


 def module_name_mapping(ori_name: str):
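These regex edits mirror the module-naming change in model_factory.py: blocks are now registered as f'{id_}' rather than f'blk_{id_}', so state-dict keys lose the blk_ prefix. A standalone illustration of the updated prefix match, using a made-up key:

```python
import re

# Illustrative only: with blocks registered as "blocks.<index>." instead of
# "blocks.blk_<index>.", the prefix regex must drop the "blk_" literal.
name = "blocks.3.attn.query_key_value.weight"           # hypothetical state-dict key
print(re.search(r"blocks\.\d+?\.", name).group())       # -> "blocks.3."
assert re.search(r"blocks\.blk_\d+?\.", name) is None   # old pattern no longer matches
```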