From 1bf0eef3c08ffbbafd8e6a7e363969eb6623dfca Mon Sep 17 00:00:00 2001 From: jordiclive Date: Sat, 6 May 2023 09:00:25 +0100 Subject: [PATCH 1/8] no message --- model/model_eval/manual/sampling_report.py | 10 +- model/model_training/configs/config.yaml | 26 + model/model_training/models/peft_modeling.py | 119 +++ model/model_training/models/prefix_llama.py | 958 +++++++++++++++++++ model/model_training/trainer_sft.py | 7 + model/model_training/utils/utils.py | 22 +- model/pyproject.toml | 1 + 7 files changed, 1132 insertions(+), 11 deletions(-) create mode 100644 model/model_training/models/peft_modeling.py create mode 100644 model/model_training/models/prefix_llama.py diff --git a/model/model_eval/manual/sampling_report.py b/model/model_eval/manual/sampling_report.py index 4d96585bd8..fa53f149d7 100644 --- a/model/model_eval/manual/sampling_report.py +++ b/model/model_eval/manual/sampling_report.py @@ -9,6 +9,7 @@ import pydantic import torch +from model_training.models.peft_modeling import load_peft_model from tqdm import tqdm from transformers import AutoTokenizer, PreTrainedTokenizer @@ -115,9 +116,9 @@ def sample( ).to(device) input_ids = inputs.input_ids outputs = model.generate( - input_ids, - **sampling_params, + input_ids=input_ids, pad_token_id=tokenizer.eos_token_id, + **sampling_params, ) if skip_input_tokens: output_tokens = outputs[0, input_ids.size(1) :] @@ -232,6 +233,7 @@ def parse_args(): parser.add_argument("--max-input-len", type=int, help="max token counts for input") parser.add_argument("--auth-token", type=str) parser.add_argument("--num-threads", type=int, default=8) + parser.add_argument("--peft_model", type=str, default=None) return parser.parse_args() @@ -291,6 +293,10 @@ def main(): else: raise RuntimeError("Invalid model_type specified") + if args.peft_model is not None: + tokenizer = AutoTokenizer.from_pretrained(args.peft_model) + model = load_peft_model(model, args.peft_model, tokenizer) + print("special_tokens_map:", tokenizer.special_tokens_map) print(f"eos_token='{tokenizer.eos_token}', eos_token_id={tokenizer.eos_token_id}") diff --git a/model/model_training/configs/config.yaml b/model/model_training/configs/config.yaml index 5be455551c..e545f07444 100644 --- a/model/model_training/configs/config.yaml +++ b/model/model_training/configs/config.yaml @@ -81,6 +81,8 @@ defaults: per_digit_tokens: false is_reward_model: false deepspeed_config: configs/zero_config.json + peft_model: false + peft_type: "lora" webgpt_dataset_only: datasets: @@ -354,6 +356,30 @@ llama-30b-pretrain: num_train_epochs: 1 save_total_limit: 2 +lora-llama-13b: + dtype: fp16 + log_dir: "llama_log_7b" + learning_rate: 5e-5 + model_name: "/admin/home-jordiclive/llama/7B" + output_dir: llama_model + weight_decay: 0.0 + max_length: 2048 + warmup_steps: 300 + gradient_checkpointing: true + gradient_accumulation_steps: 1 + per_device_train_batch_size: 24 + per_device_eval_batch_size: 5 + eval_steps: 5 + num_train_epochs: 12 + save_total_limit: 2 + save_strategy: epoch + use_flash_attention: True + residual_dropout: 0.0 + deepspeed_config: configs/zero_config.json + peft_model: true + peft_type: "lora" + use_custom_sampler: true + pythia-70m-deduped: learning_rate: 8e-6 # model_name: EleutherAI/pythia-1b-deduped diff --git a/model/model_training/models/peft_modeling.py b/model/model_training/models/peft_modeling.py new file mode 100644 index 0000000000..fa368a45a0 --- /dev/null +++ b/model/model_training/models/peft_modeling.py @@ -0,0 +1,119 @@ +from dataclasses import dataclass +from pathlib 
import Path + +import torch +from huggingface_hub import hf_hub_download +from model_training.utils.utils import get_model, get_tokenizer +from peft import LoraConfig, PeftModel, PrefixTuningConfig, get_peft_model, prepare_model_for_int8_training + + +def load_peft_model(model, peft_model_path, tokenizer): + model.resize_token_embeddings(len(tokenizer)) + model.config.eos_token_id = tokenizer.eos_token_id + model.config.bos_token_id = tokenizer.bos_token_id + model.config.pad_token_id = tokenizer.pad_token_id + model = PeftModel.from_pretrained( + model, + peft_model_path, + torch_dtype=model.dtype, + ) + model.eos_token_id = tokenizer.eos_token_id + extra_embeds = hf_hub_download(peft_model_path, "extra_embeddings.pt") + embed_weights = torch.load(extra_embeds, map_location=model.device) + model.base_model.model.model.embed_tokens.weight[len(tokenizer) - embed_weights.shape[0] :, :] = embed_weights.to( + model.base_model.model.model.embed_tokens.weight.dtype + ) + return model + + +def prepare_model_for_gradient_checkpointing(model): + r""" + Prepares the model for gradient checkpointing if necessary + """ + if not getattr(model, "is_loaded_in_8bit", False): + if hasattr(model, "enable_input_require_grads"): + model.enable_input_require_grads() + else: + + def make_inputs_require_grad(module, input, output): + output.requires_grad_(True) + + model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + return model + + +def peft_model(model, peft_type="lora", int8_training=False, gradient_checkpointing=False): + if peft_type == "lora": + config = LoraConfig( + r=16, + lora_alpha=32, + target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + ) + elif peft_type == "prefix-tuning": + config = PrefixTuningConfig( + num_virtual_tokens=30, prefix_projection=True, encoder_hidden_size=1024, task_type="CAUSAL_LM" + ) + else: + raise ValueError("peft_method config is lora or prefix-tuning") + model = get_peft_model(model, config) + if int8_training: + model = prepare_model_for_int8_training(model) + + if gradient_checkpointing: + model = prepare_model_for_gradient_checkpointing(model) + model.print_trainable_parameters() + return model + + +@dataclass +class SaveLoraConfig: + dtype: torch.dtype = torch.float16 + is_reward_model: bool = False + quantization: bool = False + seq2seqmodel: bool = False + freeze_layer: bool = False + residual_dropout: float = 0 + use_flash_attention: bool = False + adapter_save_path: str = "adapter_13B_new" + cache_dir: str = "" + model_name: str = "" + torch_ckpt_path: str = "" + peft_type: str = "lora" + + +def save_adapter_model_from_ckpt(save_config: SaveLoraConfig): + tokenizer = get_tokenizer(save_config) + model = get_model(save_config, tokenizer) + model = peft_model(model) + model.load_state_dict(torch.load(save_config.torch_ckpt_path)) + vocab_size = tokenizer.vocab_size + num_special_tokens = len(tokenizer.additional_special_tokens) + + new_embs = model.state_dict()["base_model.model.model.embed_tokens.weight"][ + vocab_size : vocab_size + num_special_tokens, : + ].clone() + new_embs = new_embs.to(save_config.dtype) + model.save_pretrained(save_config.adapter_save_path, torch_dtype=save_config.dtype) + tokenizer.save_pretrained(save_config.adapter_save_path) + torch.save(new_embs, Path(save_config.adapter_save_path).joinpath("extra_embeddings.pt")) + + +def save_adapter_model_from_ckpt(save_config: SaveLoraConfig): + tokenizer = get_tokenizer(save_config) + save_config.model_name 
= "decapoda-research/llama-13b-hf" + model = get_model(save_config, tokenizer) + model = peft_model(model) + model.load_state_dict(torch.load(save_config.torch_ckpt_path)) + vocab_size = tokenizer.vocab_size + num_special_tokens = len(tokenizer.additional_special_tokens) + + new_embs = model.state_dict()["base_model.model.model.embed_tokens.weight"][ + vocab_size : vocab_size + num_special_tokens, : + ].clone() + new_embs = new_embs.to(save_config.dtype) + model.save_pretrained(save_config.adapter_save_path, torch_dtype=save_config.dtype) + tokenizer.save_pretrained(save_config.adapter_save_path) + torch.save(new_embs, Path(save_config.adapter_save_path).joinpath("extra_embeddings.pt")) diff --git a/model/model_training/models/prefix_llama.py b/model/model_training/models/prefix_llama.py new file mode 100644 index 0000000000..954db998f9 --- /dev/null +++ b/model/model_training/models/prefix_llama.py @@ -0,0 +1,958 @@ +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch LLaMA model.""" +import math +from typing import List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +from transformers import LlamaConfig +from transformers.activations import ACT2FN +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + SequenceClassifierOutputWithPast, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "LlamaConfig" + + +def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0): + """ + Make causal mask used for bi-directional self-attention. + """ + bsz, tgt_len = input_ids_shape + mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min)) + mask_cond = torch.arange(mask.size(-1)) + mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) + mask = mask.to(dtype) + + if past_key_values_length > 0: + mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1) + return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) + + +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
+ """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) + + +class LlamaRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + LlamaRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + + # convert into half-precision if necessary + if self.weight.dtype in [torch.float16, torch.bfloat16]: + hidden_states = hidden_states.to(self.weight.dtype) + + return self.weight * hidden_states + + +class LlamaRotaryEmbedding(torch.nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim)) + self.register_buffer("inv_freq", inv_freq) + + # Build here to make `torch.jit.trace` work. + self.max_seq_len_cached = max_position_embeddings + t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=self.inv_freq.dtype) + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False) + self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case. 
+ if seq_len > self.max_seq_len_cached: + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype) + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1).to(x.device) + self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False) + self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False) + return ( + self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype), + self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype), + ) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, offset: int = 0): + cos = cos[..., offset : q.shape[-2] + offset, :] + sin = sin[..., offset : q.shape[-2] + offset, :] + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class LlamaMLP(nn.Module): + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + ): + super().__init__() + self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False) + self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False) + self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False) + self.act_fn = ACT2FN[hidden_act] + + def forward(self, x): + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + +class LlamaAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + hidden_size: int, + num_heads: int, + ): + super().__init__() + self.hidden_size = hidden_size + self.num_heads = num_heads + self.head_dim = hidden_size // num_heads + + if (self.head_dim * num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {num_heads})." 
+ ) + self.q_proj = nn.Linear( + hidden_size, + num_heads * self.head_dim, + bias=False, + ) + self.k_proj = nn.Linear( + hidden_size, + num_heads * self.head_dim, + bias=False, + ) + self.v_proj = nn.Linear( + hidden_size, + num_heads * self.head_dim, + bias=False, + ) + self.o_proj = nn.Linear( + num_heads * self.head_dim, + hidden_size, + bias=False, + ) + self.rotary_emb = LlamaRotaryEmbedding(self.head_dim) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + offset = 0 + if past_key_value is not None: + offset = past_key_value[0].shape[-2] + kv_seq_len += offset + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, offset=offset) + # [bsz, nh, t, hd] + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights + attention_mask + attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min)) + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class LlamaDecoderLayer(nn.Module): + def __init__(self, config: LlamaConfig): + super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = LlamaAttention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + ) + self.mlp = LlamaMLP( + hidden_size=self.hidden_size, + 
intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + ) + self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=past_key_value, + attention_mask=attention_mask, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +LLAMA_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`LlamaConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+""" + + +@add_start_docstrings( + "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", + LLAMA_START_DOCSTRING, +) +class LlamaPreTrainedModel(PreTrainedModel): + config_class = LlamaConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["LlamaDecoderLayer"] + _keys_to_ignore_on_load_unexpected = [r"decoder\.version"] + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, LlamaModel): + module.gradient_checkpointing = value + + +LLAMA_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. 
This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", + LLAMA_START_DOCSTRING, +) +class LlamaModel(LlamaPreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`] + + Args: + config: LlamaConfig + """ + + def __init__(self, config: LlamaConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask + def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length + ).to(inputs_embeds.device) + + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + tgt_len = input_shape[-1] + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=tgt_len).to( + inputs_embeds.device + ) + combined_attention_mask = ( + expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask + ) + + return combined_attention_mask + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. 
+ + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the + cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those + that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of + all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + seq_length_with_past = seq_length + past_key_values_length = 0 + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + seq_length_with_past = seq_length_with_past + past_key_values_length + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + # embed positions + if attention_mask is None: + attention_mask = torch.ones( + (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device + ) + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length + ) + + hidden_states = inputs_embeds + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + # None for past_key_value + return module(*inputs) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + attention_mask, + output_attentions, + False, + past_key_value, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class LlamaForCausalLM(LlamaPreTrainedModel): + _keys_to_ignore_on_load_missing = [r"lm_head.weight"] + + def __init__(self, config): + 
super().__init__(config) + self.model = LlamaModel(config) + + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of + shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional + tensors are only required when the model is used as a decoder in a Sequence to Sequence model. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the + cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those + that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of + all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). 
Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, LlamaForCausalLM + + >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) + >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) + + >>> prompt = "Hey, are you consciours? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model/pipeline parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + if past_key_values: + input_ids = input_ids[:, -1:] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": 
attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + return reordered_past + + +@add_start_docstrings( + """ + The LLaMa Model transformer with a sequence classification head on top (linear layer). + + [`LlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """, + LLAMA_START_DOCSTRING, +) +class LlamaForSequenceClassification(LlamaPreTrainedModel): + _keys_to_ignore_on_load_missing = [r"lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = LlamaModel(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + sequence_lengths = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/model/model_training/trainer_sft.py b/model/model_training/trainer_sft.py index 94b2265bf6..6572bcf0e6 100755 --- a/model/model_training/trainer_sft.py +++ b/model/model_training/trainer_sft.py @@ -9,6 +9,7 @@ import torch from model_training.custom_datasets.dialogue_collator import DialogueDataCollator from model_training.efficiency_utils import fuse_gelu +from model_training.models.peft_modeling import peft_model from model_training.utils.utils import ( PerDatasetSampler, _strtobool, @@ -396,6 +397,12 @@ def main(): model = get_model(training_conf, tokenizer) + if training_conf.peft_model: + print("Using PEFT model") + model = peft_model( + model, peft_type=training_conf.peft_type, gradient_checkpointing=training_conf.gradient_checkpointing + ) + if training_conf.quantization: import bitsandbytes # This is noisy, so delay importing until after argument parsing so it doesn't make --help noisy diff --git a/model/model_training/utils/utils.py b/model/model_training/utils/utils.py index 6acd8356e1..6c15573e8f 100644 --- a/model/model_training/utils/utils.py +++ b/model/model_training/utils/utils.py @@ -15,6 +15,7 @@ from model_training.custom_datasets.formatting import QA_SPECIAL_TOKENS from model_training.models import 
freeze_top_n_layers, get_specific_model from model_training.models.patching import patch_model +from model_training.models.prefix_llama import LlamaForCausalLM from model_training.models.reward_model import GPTNeoXRewardModel from sklearn.model_selection import train_test_split from tokenizers import pre_tokenizers @@ -317,15 +318,18 @@ def get_model(conf, tokenizer, pad_vocab_size_to_multiple_of=16, check_freeze_la model = transformers.AutoModelForSequenceClassification.from_pretrained( conf.model_name, cache_dir=conf.cache_dir, num_labels=1, torch_dtype=dtype ) - else: - model = get_specific_model( - conf.model_name, - cache_dir=conf.cache_dir, - quantization=conf.quantization, - seq2seqmodel=conf.seq2seqmodel, - without_head=conf.is_reward_model, - torch_dtype=dtype, - ) + if not conf.is_reward_model: + if conf.peft_type is not None and conf.peft_type == "prefix-tuning" and "llama" in conf.model_name: + model = LlamaForCausalLM.from_pretrained(conf.model_name, cache_dir=conf.cache_dir, torch_dtype=dtype) + else: + model = get_specific_model( + conf.model_name, + cache_dir=conf.cache_dir, + quantization=conf.quantization, + seq2seqmodel=conf.seq2seqmodel, + without_head=conf.is_reward_model, + torch_dtype=dtype, + ) n_embs = model.get_input_embeddings().num_embeddings if len(tokenizer) != n_embs and check_freeze_layer: diff --git a/model/pyproject.toml b/model/pyproject.toml index 0676f30f40..ee3d6532be 100644 --- a/model/pyproject.toml +++ b/model/pyproject.toml @@ -22,6 +22,7 @@ dependencies = [ "ninja>=1.11.1", "nltk>=3.8.1", "numpy>=1.22.4", + "peft==0.2.0", "py7zr", "PyYAML>=6.0", "sentencepiece>=0.1.97", From 11a8971b56dd55819cd8f22146e8a3e86621f783 Mon Sep 17 00:00:00 2001 From: jordiclive Date: Sat, 6 May 2023 11:17:24 +0100 Subject: [PATCH 2/8] fix configs. 
--- model/model_training/configs/config.yaml | 50 +++++++++++++++- .../configs/zero_config_sft_65b.json | 58 +++++++++++++++++++ model/model_training/models/peft_modeling.py | 32 +++++----- 3 files changed, 123 insertions(+), 17 deletions(-) create mode 100644 model/model_training/configs/zero_config_sft_65b.json diff --git a/model/model_training/configs/config.yaml b/model/model_training/configs/config.yaml index e545f07444..cbec457e2b 100644 --- a/model/model_training/configs/config.yaml +++ b/model/model_training/configs/config.yaml @@ -369,7 +369,7 @@ lora-llama-13b: gradient_accumulation_steps: 1 per_device_train_batch_size: 24 per_device_eval_batch_size: 5 - eval_steps: 5 + eval_steps: 500 num_train_epochs: 12 save_total_limit: 2 save_strategy: epoch @@ -380,6 +380,54 @@ lora-llama-13b: peft_type: "lora" use_custom_sampler: true +lora-llama-30b: + dtype: fp16 + log_dir: "llama_log_7b" + learning_rate: 5e-5 + model_name: "/admin/home-jordiclive/llama/30B" + output_dir: llama_model + weight_decay: 0.0 + max_length: 2048 + warmup_steps: 300 + gradient_checkpointing: true + gradient_accumulation_steps: 1 + per_device_train_batch_size: 4 + per_device_eval_batch_size: 2 + eval_steps: 500 + num_train_epochs: 12 + save_total_limit: 2 + save_strategy: epoch + use_flash_attention: True + residual_dropout: 0.0 + deepspeed_config: configs/zero_config.json + peft_model: true + peft_type: "lora" + use_custom_sampler: true + +lora-llama-65b: + dtype: fp16 + log_dir: "llama_log_7b" + learning_rate: 5e-5 + model_name: "/admin/home-jordiclive/llama/65B" + output_dir: llama_model + weight_decay: 0.0 + max_length: 2048 + warmup_steps: 300 + gradient_checkpointing: true + gradient_accumulation_steps: 1 + per_device_train_batch_size: 16 + per_device_eval_batch_size: 5 + eval_steps: 250 + num_train_epochs: 12 + save_total_limit: 2 + save_strategy: epoch + use_flash_attention: True + residual_dropout: 0.0 + deepspeed_config: "zero_config_sft_65b.json" + peft_model: true + peft_type: "lora" + use_custom_sampler: true + pythia-70m-deduped: learning_rate: 8e-6 # model_name: EleutherAI/pythia-1b-deduped diff --git a/model/model_training/configs/zero_config_sft_65b.json b/model/model_training/configs/zero_config_sft_65b.json new file mode 100644 index 0000000000..cf1c1c4c3a --- /dev/null +++ b/model/model_training/configs/zero_config_sft_65b.json @@ -0,0 +1,58 @@ +{ + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + }, + "scheduler": { + "type": "WarmupDecayLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto", + "warmup_type": "linear", + "total_num_steps": "auto" + } + }, + "zero_optimization": { + "stage": 3, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 2e9, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 2e9, + "stage3_max_reuse_distance": 2e9, + "stage3_gather_16bit_weights_on_model_save": true, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + } + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": 
"auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false +} diff --git a/model/model_training/models/peft_modeling.py b/model/model_training/models/peft_modeling.py index fa368a45a0..e4c9f417e8 100644 --- a/model/model_training/models/peft_modeling.py +++ b/model/model_training/models/peft_modeling.py @@ -101,19 +101,19 @@ def save_adapter_model_from_ckpt(save_config: SaveLoraConfig): torch.save(new_embs, Path(save_config.adapter_save_path).joinpath("extra_embeddings.pt")) -def save_adapter_model_from_ckpt(save_config: SaveLoraConfig): - tokenizer = get_tokenizer(save_config) - save_config.model_name = "decapoda-research/llama-13b-hf" - model = get_model(save_config, tokenizer) - model = peft_model(model) - model.load_state_dict(torch.load(save_config.torch_ckpt_path)) - vocab_size = tokenizer.vocab_size - num_special_tokens = len(tokenizer.additional_special_tokens) - - new_embs = model.state_dict()["base_model.model.model.embed_tokens.weight"][ - vocab_size : vocab_size + num_special_tokens, : - ].clone() - new_embs = new_embs.to(save_config.dtype) - model.save_pretrained(save_config.adapter_save_path, torch_dtype=save_config.dtype) - tokenizer.save_pretrained(save_config.adapter_save_path) - torch.save(new_embs, Path(save_config.adapter_save_path).joinpath("extra_embeddings.pt")) +# def save_adapter_model_from_ckpt(save_config: SaveLoraConfig): +# tokenizer = get_tokenizer(save_config) +# save_config.model_name = "decapoda-research/llama-13b-hf" +# model = get_model(save_config, tokenizer) +# model = peft_model(model) +# model.load_state_dict(torch.load(save_config.torch_ckpt_path)) +# vocab_size = tokenizer.vocab_size +# num_special_tokens = len(tokenizer.additional_special_tokens) +# +# new_embs = model.state_dict()["base_model.model.model.embed_tokens.weight"][ +# vocab_size : vocab_size + num_special_tokens, : +# ].clone() +# new_embs = new_embs.to(save_config.dtype) +# model.save_pretrained(save_config.adapter_save_path, torch_dtype=save_config.dtype) +# tokenizer.save_pretrained(save_config.adapter_save_path) +# torch.save(new_embs, Path(save_config.adapter_save_path).joinpath("extra_embeddings.pt")) From d7efdad2def228833fcc185d527f9d2513453c08 Mon Sep 17 00:00:00 2001 From: jordiclive Date: Sat, 6 May 2023 11:27:28 +0100 Subject: [PATCH 3/8] no message --- model/model_training/configs/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/model/model_training/configs/config.yaml b/model/model_training/configs/config.yaml index cbec457e2b..bd0510064a 100644 --- a/model/model_training/configs/config.yaml +++ b/model/model_training/configs/config.yaml @@ -415,7 +415,7 @@ lora-llama-65b: warmup_steps: 300 gradient_checkpointing: true gradient_accumulation_steps: 1 - per_device_train_batch_size: 16 + per_device_train_batch_size: 12 per_device_eval_batch_size: 5 eval_steps: 250 num_train_epochs: 12 From 51b83097d9afc0b5fb9acddd4433a768c47d8a73 Mon Sep 17 00:00:00 2001 From: jordiclive Date: Sat, 6 May 2023 11:28:35 +0100 Subject: [PATCH 4/8] no message --- model/model_training/models/peft_modeling.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/model/model_training/models/peft_modeling.py b/model/model_training/models/peft_modeling.py index e4c9f417e8..28f3c264a7 100644 --- a/model/model_training/models/peft_modeling.py +++ b/model/model_training/models/peft_modeling.py @@ -100,20 +100,3 @@ def save_adapter_model_from_ckpt(save_config: SaveLoraConfig): tokenizer.save_pretrained(save_config.adapter_save_path) 
torch.save(new_embs, Path(save_config.adapter_save_path).joinpath("extra_embeddings.pt")) - -# def save_adapter_model_from_ckpt(save_config: SaveLoraConfig): -# tokenizer = get_tokenizer(save_config) -# save_config.model_name = "decapoda-research/llama-13b-hf" -# model = get_model(save_config, tokenizer) -# model = peft_model(model) -# model.load_state_dict(torch.load(save_config.torch_ckpt_path)) -# vocab_size = tokenizer.vocab_size -# num_special_tokens = len(tokenizer.additional_special_tokens) -# -# new_embs = model.state_dict()["base_model.model.model.embed_tokens.weight"][ -# vocab_size : vocab_size + num_special_tokens, : -# ].clone() -# new_embs = new_embs.to(save_config.dtype) -# model.save_pretrained(save_config.adapter_save_path, torch_dtype=save_config.dtype) -# tokenizer.save_pretrained(save_config.adapter_save_path) -# torch.save(new_embs, Path(save_config.adapter_save_path).joinpath("extra_embeddings.pt")) From 2becaf42962c267246ea2ac93b2061477f43c7ee Mon Sep 17 00:00:00 2001 From: jordiclive Date: Sat, 6 May 2023 11:38:24 +0100 Subject: [PATCH 5/8] no message --- model/model_training/models/peft_modeling.py | 1 - 1 file changed, 1 deletion(-) diff --git a/model/model_training/models/peft_modeling.py b/model/model_training/models/peft_modeling.py index 28f3c264a7..23b8af35f5 100644 --- a/model/model_training/models/peft_modeling.py +++ b/model/model_training/models/peft_modeling.py @@ -99,4 +99,3 @@ def save_adapter_model_from_ckpt(save_config: SaveLoraConfig): model.save_pretrained(save_config.adapter_save_path, torch_dtype=save_config.dtype) tokenizer.save_pretrained(save_config.adapter_save_path) torch.save(new_embs, Path(save_config.adapter_save_path).joinpath("extra_embeddings.pt")) - From ad589b31d1355c7fb3dc343e07c537102718f5b6 Mon Sep 17 00:00:00 2001 From: Jordan Clive Date: Sat, 6 May 2023 18:10:41 +0100 Subject: [PATCH 6/8] Update config.yaml change log dir names and model name for lora 13B in config --- model/model_training/configs/config.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/model/model_training/configs/config.yaml b/model/model_training/configs/config.yaml index bd0510064a..eb7c25d6ce 100644 --- a/model/model_training/configs/config.yaml +++ b/model/model_training/configs/config.yaml @@ -358,9 +358,9 @@ llama-30b-pretrain: lora-llama-13b: dtype: fp16 - log_dir: "llama_log_7b" + log_dir: "llama_lora_log_13b" learning_rate: 5e-5 - model_name: "/admin/home-jordiclive/llama/7B" + model_name: "/admin/home-jordiclive/llama/13B" output_dir: llama_model weight_decay: 0.0 max_length: 2048 @@ -382,7 +382,7 @@ lora-llama-13b: lora-llama-30b: dtype: fp16 - log_dir: "llama_log_7b" + log_dir: "llama_lora_log_30b" learning_rate: 5e-5 model_name: "/admin/home-jordiclive/llama/30B" output_dir: llama_model @@ -406,7 +406,7 @@ lora-llama-30b: lora-llama-65b: dtype: fp16 - log_dir: "llama_log_7b" + log_dir: "llama_lora_log_65b" learning_rate: 5e-5 model_name: "/admin/home-jordiclive/llama/65B" output_dir: llama_model From 607dead4f4aa09f407e84eb1fd3ee5d620949b3b Mon Sep 17 00:00:00 2001 From: Jordan Clive Date: Sat, 6 May 2023 18:14:53 +0100 Subject: [PATCH 7/8] Update peft_modeling.py change output path for adapter --- model/model_training/models/peft_modeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/model/model_training/models/peft_modeling.py b/model/model_training/models/peft_modeling.py index 23b8af35f5..54d51caa06 100644 --- a/model/model_training/models/peft_modeling.py +++ 
b/model/model_training/models/peft_modeling.py @@ -77,7 +77,7 @@ class SaveLoraConfig: freeze_layer: bool = False residual_dropout: float = 0 use_flash_attention: bool = False - adapter_save_path: str = "adapter_13B_new" + adapter_save_path: str = "adapter" cache_dir: str = "" model_name: str = "" torch_ckpt_path: str = "" From 82da4ba13922f84b579dbfeef40ac6929878650c Mon Sep 17 00:00:00 2001 From: jordiclive Date: Tue, 9 May 2023 10:12:23 +0100 Subject: [PATCH 8/8] update config output dirs --- model/model_training/configs/config.yaml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/model/model_training/configs/config.yaml b/model/model_training/configs/config.yaml index eb7c25d6ce..294c855534 100644 --- a/model/model_training/configs/config.yaml +++ b/model/model_training/configs/config.yaml @@ -360,8 +360,8 @@ lora-llama-13b: dtype: fp16 log_dir: "llama_lora_log_13b" learning_rate: 5e-5 - model_name: "/admin/home-jordiclive/llama/13B" - output_dir: llama_model + model_name: /home/ubuntu/llama_hf/13B + output_dir: llama_model_13b_lora weight_decay: 0.0 max_length: 2048 warmup_steps: 300 @@ -384,8 +384,8 @@ lora-llama-30b: dtype: fp16 log_dir: "llama_lora_log_30b" learning_rate: 5e-5 - model_name: "/admin/home-jordiclive/llama/30B" - output_dir: llama_model + model_name: /home/ubuntu/llama_hf/30B + output_dir: llama_model_30b_lora weight_decay: 0.0 max_length: 2048 warmup_steps: 300 @@ -408,8 +408,8 @@ lora-llama-65b: dtype: fp16 log_dir: "llama_lora_log_65b" learning_rate: 5e-5 - model_name: "/admin/home-jordiclive/llama/65B" - output_dir: llama_model + model_name: /home/ubuntu/llama_hf/65B + output_dir: llama_model_65b_lora weight_decay: 0.0 max_length: 2048 warmup_steps: 300 @@ -423,7 +423,7 @@ lora-llama-65b: save_strategy: epoch use_flash_attention: True residual_dropout: 0.0 - deepspeed_config: "zero_config_sft_65b.json" + deepspeed_config: configs/zero_config_sft_65b.json peft_model: true peft_type: "lora" use_custom_sampler: true
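
A usage sketch of the helpers this series introduces, for context only: exporting a LoRA adapter from a trainer checkpoint with `save_adapter_model_from_ckpt`, then loading it for sampling with `load_peft_model`, mirroring the new `--peft_model` branch in `sampling_report.py`. All paths and the Hub repo id below are placeholders, not values defined by this PR; note that `load_peft_model` fetches `extra_embeddings.pt` via `hf_hub_download`, so it assumes the adapter has been pushed to the Hugging Face Hub.

# Hedged sketch -- every path and repo id here is a placeholder, not something defined by this PR.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from model_training.models.peft_modeling import SaveLoraConfig, load_peft_model, save_adapter_model_from_ckpt

# 1) Export the LoRA adapter plus the extra (special-token) embedding rows from a
#    consolidated trainer checkpoint; get_tokenizer/get_model are driven by the config fields.
save_adapter_model_from_ckpt(
    SaveLoraConfig(
        model_name="/path/to/llama_hf/13B",            # placeholder base-model path
        torch_ckpt_path="/path/to/pytorch_model.bin",  # placeholder consolidated state_dict
        adapter_save_path="adapter",
        dtype=torch.float16,
    )
)

# 2) Load a published adapter for inference. The adapter repo must also contain the
#    tokenizer with the OA special tokens: load_peft_model resizes the embeddings to
#    len(tokenizer) before patching in extra_embeddings.pt.
adapter_repo = "your-org/llama-13b-oasst-lora"          # placeholder Hub repo id
tokenizer = AutoTokenizer.from_pretrained(adapter_repo)
base = AutoModelForCausalLM.from_pretrained("/path/to/llama_hf/13B", torch_dtype=torch.float16)
model = load_peft_model(base, adapter_repo, tokenizer)

# OA dialogue format used during SFT (assumes the adapter's tokenizer defines these tokens).
prompt = "<|prompter|>Hello, who are you?<|endoftext|><|assistant|>"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(
    input_ids=inputs.input_ids,
    max_new_tokens=64,
    pad_token_id=tokenizer.eos_token_id,
)
# Decode only the newly generated tokens, as sampling_report.py does with skip_input_tokens.
print(tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))

The adapter artifact carries `extra_embeddings.pt` because LoRA only trains the injected low-rank matrices; the embedding rows added for the new special tokens are saved separately and copied back into the resized embedding matrix at load time.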