From 10aa63f0e112eac79e5fcafc6dc75961c5b76403 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Sat, 7 Oct 2023 14:03:47 +0800 Subject: [PATCH 001/153] support optimized sp --- configs/7B_sft.py | 6 +- internlm/model/linear.py | 219 ++++++++++++++++++++++++- internlm/model/modeling_internlm.py | 22 ++- internlm/model/multi_head_attention.py | 140 +++++++++++++++- train.py | 21 +-- 5 files changed, 378 insertions(+), 30 deletions(-) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 25a98bf8..a23edcec 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -146,10 +146,10 @@ tensor parallel: tensor parallel size, usually the number of GPUs per node. """ parallel = dict( - zero1=8, - tensor=1, + zero1=-1, + tensor=2, pipeline=dict(size=1, interleaved_overlap=True), - sequence_parallel=False, + sequence_parallel=True, ) cudnn_deterministic = False diff --git a/internlm/model/linear.py b/internlm/model/linear.py index d18308a8..5ee1af9d 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -5,13 +5,32 @@ import torch from flash_attn.ops.fused_dense import ColumnParallelLinear, RowParallelLinear -from flash_attn.utils.distributed import all_reduce, reduce_scatter +from flash_attn.utils.distributed import all_reduce, reduce_scatter, all_gather_raw, reduce_scatter_raw +from torch import Tensor from torch import nn +from torch.cuda.amp import custom_bwd, custom_fwd from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc from internlm.model.utils import Silu, fused_dense_func_torch +from typing import Optional +from functools import partial + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor +from torch.distributed import ProcessGroup +from torch.cuda.amp import custom_bwd, custom_fwd + +# import fused_dense_cuda # from apex +import fused_dense_lib as fused_dense_cuda + +from flash_attn.ops.activations import gelu_bwd, relu_bwd, sqrelu_fwd, sqrelu_bwd +from flash_attn.utils.distributed import all_gather_raw, reduce_scatter_raw, all_reduce_raw +from flash_attn.utils.distributed import reduce_scatter, all_reduce + class ScaleColumnParallelLinear(nn.Linear): """ @@ -200,3 +219,201 @@ def forward(self, x): w2_o = self.w2(x) out = self.w3(Silu(w1_o, w2_o)) return out + +class FusedDenseFunc_fsdp(torch.autograd.Function): + + @staticmethod + @custom_fwd + def forward(ctx, x, weight, bias, return_residual=False, process_group=None): + + ctx.compute_weight_gradient = weight.requires_grad + ctx.return_residual = return_residual + ctx.process_group = process_group + + if torch.is_autocast_enabled(): + x = x.to(dtype=torch.get_autocast_gpu_dtype()) + x = x.contiguous() + total_x = x + + # do all_gather for weight and bias before actual computation + total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) + if bias is not None: + total_bias, handle_bias = all_gather_raw(bias, process_group, async_op=True) + handle_bias.wait() + else: + total_bias = bias + + if torch.is_autocast_enabled(): + total_weight = total_weight.to(dtype=torch.get_autocast_gpu_dtype()) + total_bias = total_bias.to(dtype=torch.get_autocast_gpu_dtype()) if bias is not None else None + handle_weight.wait() + total_weight = total_weight.contiguous() + batch_shape, n = total_x.shape[:-1], total_x.shape[-1] + batch_dim = batch_shape.numel() + # https://github.com/pytorch/pytorch/blob/5b51849b48a7dbccd297286cc0110def4706f9e7/aten/src/ATen/native/cuda/Blas.cpp#L174 + if 
min(batch_dim, n, *total_weight.shape) > 65535 * 32: + raise RuntimeError('fused_dense only supports matrix dims <= 2M') + output = F.linear(total_x, total_weight, total_bias) + if ctx.compute_weight_gradient: + ctx.save_for_backward(x, weight) + else: + ctx.save_for_backward(weight) + return output if not return_residual else (output, x) + + @staticmethod + @custom_bwd + def backward(ctx, grad_output, *args): + grad_output = grad_output.contiguous() + if ctx.return_residual: + grad_input, = args + grad_input = grad_input.contiguous() + process_group = ctx.process_group + if ctx.compute_weight_gradient: + x, weight = ctx.saved_tensors + total_x = x + else: + weight, = ctx.saved_tensors + total_x = None + batch_shape = grad_output.shape[:-1] + batch_dim = batch_shape.numel() + grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1]) + + # do all-gather for weight before backward + weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) + handle_weight.wait() + + if ctx.needs_input_grad[0]: + if not ctx.return_residual: + grad_input = F.linear(grad_output, weight.t()) + else: + grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), + grad_output, weight) + grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) + # if process_group is not None: + # import pdb; pdb.set_trace() + # grad_input, handle_grad_input = reduce_scatter_raw(grad_input, process_group, async_op=True) + # grad_input, handle_grad_input = all_reduce_raw(grad_input, process_group, async_op=True) + + else: + grad_input = None + # import pdb; pdb.set_trace() + if ctx.needs_input_grad[1]: + assert ctx.compute_weight_gradient + + grad_weight, grad_bias = fused_dense_cuda.linear_bias_wgrad( + total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2] + ) + grad_weight, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) + if grad_bias is not None: + grad_bias, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) + handle_grad_bias.wait() + handle_grad_weight.wait() + + else: + grad_weight = None + grad_bias = grad_output if ctx.needs_input_grad[2] else None + # if process_group is not None and ctx.needs_input_grad[0]: + # handle_grad_input.wait() + # import pdb; pdb.set_trace() + return grad_input, grad_weight, grad_bias, None, None, None + + +def fsdp_fused_dense_func(x: Tensor, weight: Tensor, bias: Optional[Tensor] = None, + return_residual: bool = False, process_group = None): + dtype_eligible = (x.dtype in [torch.float16, torch.bfloat16] + or (x.dtype == torch.float32 and torch.is_autocast_enabled())) + if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible: + return FusedDenseFunc_fsdp.apply(x, weight, bias, return_residual, process_group) + else: + assert process_group is None + out = F.linear(x, weight, bias) + return out if not return_residual else (out, x) + +class FSDPLinear(ColumnParallelLinear): + + def forward(self, x): + return fsdp_fused_dense_func(x, self.weight, self.bias, process_group=self.process_group) + + +class FSDPScaleLinear(ScaleColumnParallelLinear): + + def forward(self, input): # pylint: disable=W0622 + # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism: + # we do an all_gather of x before doing the matmul. + # If not, then the input is already gathered. 
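        # Scale trick: weight_scale * w + (1 - weight_scale) * w.detach() equals w
        # in value, but only the first term carries gradient, so the weight
        # gradient is scaled by weight_scale without changing the forward output.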
+ if self.weight_scale != 1: + weight = self.weight * self.weight_scale + (1 - self.weight_scale) * self.weight.detach() + else: + weight = self.weight + return fsdp_fused_dense_func( + input, + weight, + self.bias, + process_group=self.process_group, + ) + + +class FSDPFeedForward(nn.Module): + """ + FeedForward. + + Args: + in_features (int): size of each input sample + hidden_features (int): size of hidden state of FFN + out_features (int): size of each output sample + process_group (Optional[torch.distributed.ProcessGroup]): The group of the current device for `parallel_mode`. + bias (bool): Whether the bias is needed for linears. True by default. But it is typically set to False + in the config. + device (Optional[Union[str, torch.device]]): The device will be used. + dtype (Optional[torch.dtype]): The type of data. + multiple_of (int): For efficient training. Reset the size of hidden feature. 256 by default. + """ + + def __init__( + self, + in_features: int, + hidden_features: int, + out_features: int = None, + process_group: Optional[torch.distributed.ProcessGroup] = None, + bias: bool = True, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + multiple_of: int = 256, + ): + super().__init__() + + hidden_features = multiple_of * ((hidden_features + multiple_of - 1) // multiple_of) + + self.w1 = FSDPLinear( + in_features, + hidden_features, + process_group, + bias, + sequence_parallel=gpc.config.parallel.sequence_parallel, + device=device, + dtype=dtype, + ) + self.w2 = FSDPLinear( + in_features, + hidden_features, + process_group, + bias, + sequence_parallel=gpc.config.parallel.sequence_parallel, + device=device, + dtype=dtype, + ) + self.w3 = FSDPLinear( + hidden_features, + out_features, + process_group, + bias=bias, + sequence_parallel=gpc.config.parallel.sequence_parallel, + device=device, + dtype=dtype, + ) + + def forward(self, x): + w1_o = self.w1(x) + w2_o = self.w2(x) + out = self.w3(Silu(w1_o, w2_o)) + return out diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index 2856a782..8ac8c58d 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -17,9 +17,11 @@ FeedForward, RewardModelLinear, ScaleColumnParallelLinear, + FSDPScaleLinear, + FSDPFeedForward, ) from internlm.model.multi_head_attention import MHA -from internlm.model.utils import gather_forward_split_backward, try_import_RMSNorm +from internlm.model.utils import gather_forward_split_backward, try_import_RMSNorm, split_forward_gather_backward from internlm.solver.pipeline_utils import partition_uniform from internlm.utils.checkpoint import activation_checkpoint from internlm.utils.common import filter_kwargs @@ -107,7 +109,16 @@ def __init__( self.norm2 = nn.LayerNorm(hidden_size, eps=layer_norm_epsilon) if use_swiglu: - self.mlp = FeedForward( + # self.mlp = FeedForward( + # hidden_size, + # int(hidden_size * mlp_ratio), + # out_features=hidden_size, + # process_group=gpc.get_group(ParallelMode.TENSOR), + # bias=False, + # device=device, + # dtype=dtype, + # ) + self.mlp = FSDPFeedForward( hidden_size, int(hidden_size * mlp_ratio), out_features=hidden_size, @@ -293,7 +304,8 @@ def __init__( if is_reward: head_cls = RewardModelLinear else: - head_cls = ScaleColumnParallelLinear + # head_cls = ScaleColumnParallelLinear + head_cls = FSDPScaleLinear if first: if embed_split_hidden: self.embedding = Embedding1D(num_embeddings=vocab_size, embedding_dim=hidden_size) @@ -379,6 +391,9 @@ def forward(self, 
hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N assert len(indexes) == 1 # The indexes are used to indicate the actual position IDs of each token in the packed input. indexes = indexes[0] + if gpc.config.parallel.sequence_parallel: + indexes = split_forward_gather_backward(indexes, ParallelMode.TENSOR, dim=0) + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() if cu_seqlens is not None else None for _, block in enumerate(self.blocks): @@ -394,6 +409,7 @@ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N hidden_states = self.norm(hidden_states.float()) if hasattr(self, "head"): hidden_states = self.head(hidden_states) + hidden_states = gather_forward_split_backward(hidden_states, ParallelMode.TENSOR, dim=0) if not self.parallel_output: hidden_states = gather_forward_split_backward(hidden_states, ParallelMode.TENSOR, dim=-1) diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index e4008e15..abb9f19c 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -18,7 +18,114 @@ from internlm.core.context import IS_TENSOR_PARALLEL, ParallelMode from internlm.core.context import global_context as gpc from internlm.model.embedding import DynamicNTKScalingRotaryEmbedding, RotaryEmbedding -from internlm.model.linear import ColumnParallelLinearTorch, RowParallelLinearTorch +from internlm.model.linear import ColumnParallelLinearTorch, RowParallelLinearTorch, FSDPLinear + +import torch + +from typing import Any, Tuple +from torch import Tensor +from torch.nn import Module + +import torch.distributed as dist + + +class _SeqAllToAll(torch.autograd.Function): + + @staticmethod + def forward(ctx: Any, group: dist.ProcessGroup, input: Tensor, scatter_idx: int, gather_idx: int) -> Tensor: + + ctx.group = group + ctx.scatter_idx = scatter_idx + ctx.gather_idx = gather_idx + + seq_world_size = dist.get_world_size(group) + + input_list = [t.contiguous() for t in torch.tensor_split(input, seq_world_size, scatter_idx)] + output_list = [torch.empty_like(input_list[0]) for _ in range(seq_world_size)] + # TODO Use all_to_all_single instead + dist.all_to_all(output_list, input_list, group=group) + return torch.cat(output_list, dim=gather_idx).contiguous() + + @staticmethod + def backward(ctx: Any, *grad_output: Tensor) -> Tuple[None, Tensor, None, None]: + return (None, _SeqAllToAll.apply(ctx.group, *grad_output, ctx.gather_idx, ctx.scatter_idx), None, None) + + +class DistributedAttention(torch.nn.Module): + """Initialization. 
+ + Arguments: + local_attention (Module): local attention with q,k,v + sequence_process_group (ProcessGroup): sequence parallel process group + scatter_idx (int): scatter_idx for all2all comm + gather_idx (int): gather_idx for all2all comm + """ + + def __init__( + self, + local_attention: Module, + sequence_process_group: dist.ProcessGroup, + scatter_idx: int = 2, + gather_idx: int = 0, + ) -> None: + + super(DistributedAttention, self).__init__() + self.local_attn = local_attention + self.spg = sequence_process_group + self.scatter_idx = scatter_idx + self.gather_idx = gather_idx + + # def forward(self, query: Tensor, key: Tensor, value: Tensor, *args: Any) -> Tensor: + # """ forward + + # Arguments: + # query (Tensor): query input to the layer + # key (Tensor): key input to the layer + # value (Tensor): value input to the layer + # args: other args + + # Returns: + # * output (Tensor): context output + # """ + # # TODO Merge three alltoall calls into one + # #in shape : e.g., [s/p:h:] + # query_layer = _SeqAllToAll.apply(self.spg, query, self.scatter_idx, self.gather_idx) + # key_layer = _SeqAllToAll.apply(self.spg, key, self.scatter_idx, self.gather_idx) + # value_layer = _SeqAllToAll.apply(self.spg, value, self.scatter_idx, self.gather_idx) + + # #out shape : e.g., [s:h/p:] + # context_layer = self.local_attn(query_layer, key_layer, value_layer, *args) + + # output = _SeqAllToAll.apply(self.spg, context_layer, self.gather_idx, self.scatter_idx) + + # #out e.g., [s/p::h] + # return output + + def forward(self, qkv: Tensor, **kwargs: Any) -> Tensor: + """ forward + + Arguments: + query (Tensor): query input to the layer + key (Tensor): key input to the layer + value (Tensor): value input to the layer + args: other args + + Returns: + * output (Tensor): context output + """ + # TODO Merge three alltoall calls into one + #in shape : e.g., [s/p:h:] + qkv = _SeqAllToAll.apply(self.spg, qkv, self.scatter_idx, self.gather_idx) + # key_layer = _SeqAllToAll.apply(self.spg, key, self.scatter_idx, self.gather_idx) + # value_layer = _SeqAllToAll.apply(self.spg, value, self.scatter_idx, self.gather_idx) + + #out shape : e.g., [s:h/p:] + context_layer = self.local_attn(qkv, **kwargs) + + output = _SeqAllToAll.apply(self.spg, context_layer, 0, 2) + + #out e.g., [s/p::h] + return output class MHA(nn.Module): @@ -91,7 +198,16 @@ def __init__( self.rotary_emb = RotaryEmbedding(self.rotary_emb_dim, scale_base=rotary_emb_scale_base, device=device) # notice here should change bias=True - self.Wqkv = ColumnParallelLinearTorch( + # self.Wqkv = ColumnParallelLinearTorch( + # embed_dim, + # 3 * embed_dim, + # process_group, + # bias=True, + # sequence_parallel=gpc.config.parallel.sequence_parallel, + # **factory_kwargs, + # ) # according to https://spaces.ac.cn/archives/9577 + + self.Wqkv = FSDPLinear( embed_dim, 3 * embed_dim, process_group, @@ -106,9 +222,19 @@ def __init__( self.inner_cross_attn = inner_cross_attn_cls( causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout ) + + self.inner_attn_sp = DistributedAttention(self.inner_attn, sequence_process_group=process_group, scatter_idx=3, gather_idx=0) + self.inner_cross_attn_sp = DistributedAttention(self.inner_cross_attn, sequence_process_group=process_group, scatter_idx=3, gather_idx=0) # output projection always have the bias (for now) - self.out_proj = RowParallelLinearTorch( + # self.out_proj = RowParallelLinearTorch( + # embed_dim, + # embed_dim, + # process_group, + # sequence_parallel=gpc.config.parallel.sequence_parallel, + # 
**factory_kwargs, + # ) + self.out_proj = FSDPLinear( embed_dim, embed_dim, process_group, @@ -211,15 +337,17 @@ def _packed_forward(self, x, inference_params=None, **kwargs): qkv = rearrange(qkv, "t (three h d) -> t three h d", three=3, d=self.head_dim) # total x 3 x n_head x d qkv = self.rotary_emb(qkv, **kwargs) kwargs.pop("indexes") - + if inference_params is None: if gpc.config.model.dtype is torch.float32 and gpc.config.model.use_flash_attn: with torch.cuda.amp.autocast(dtype=torch.bfloat16): if qkv.dtype not in [torch.float16, torch.bfloat16]: qkv = qkv.to(torch.bfloat16) - context = self.inner_attn(qkv, **kwargs).to(x.dtype) + # context = self.inner_attn(qkv, **kwargs).to(x.dtype) + context = self.inner_attn_sp(qkv, **kwargs).to(x.dtype) else: - context = self.inner_attn(qkv, **kwargs) + # context = self.inner_attn(qkv, **kwargs) + context = self.inner_attn_sp(qkv, **kwargs) else: raise RuntimeError("Not support this right now") diff --git a/train.py b/train.py index 139bac1f..9bc4bd7f 100644 --- a/train.py +++ b/train.py @@ -110,7 +110,6 @@ def main(args): # initialize and resume train state train_state = TrainState(gpc.config, train_dl.batch_sampler) - optimizer, beta2_scheduler, lr_scheduler = initialize_optimizer(model=model) ckpt_manager = CheckpointManager( @@ -170,6 +169,7 @@ def main(args): beta2_scheduler=beta2_scheduler, scheduler_hooks=scheduler_hooks, ) + # initialize simple memory profiler if args.profiling: @@ -219,21 +219,9 @@ def main(args): # do forward and backward timer("fwd-bwd").start() - moe_loss = None - if hasattr(gpc.config.model, "num_experts"): - _, _, loss, moe_loss = trainer.execute_schedule( - batch, - forward_only=False, - return_loss=True, - return_output_label=False, - ) - else: - _, _, loss = trainer.execute_schedule( - batch, - forward_only=False, - return_loss=True, - return_output_label=False, - ) + _, _, loss = trainer.execute_schedule( + batch, forward_only=False, return_loss=True, return_output_label=False + ) timer("fwd-bwd").stop() # update parameters, and returns (success_update, grad_norm) @@ -266,7 +254,6 @@ def main(args): trainer=trainer, start_time=start_time, loss=loss, - moe_loss=moe_loss, grad_norm=grad_norm_groups, metric=metric, update_panel=uniscale_logger is not None, From bf475b694014b77159240feabba310a704cdbfdd Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Sun, 8 Oct 2023 13:20:29 +0800 Subject: [PATCH 002/153] debug --- configs/7B_sft.py | 4 ++-- .../core/scheduler/no_pipeline_scheduler.py | 4 ++-- internlm/model/linear.py | 19 +++++++++++++++---- internlm/model/modeling_internlm.py | 1 + train.py | 1 + 5 files changed, 21 insertions(+), 8 deletions(-) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 20119343..51d2e9c4 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -5,7 +5,7 @@ HIDDEN_SIZE = 4096 NUM_ATTENTION_HEAD = 32 MLP_RATIO = 8 / 3 -NUM_LAYER = 32 +NUM_LAYER = 4 VOCAB_SIZE = 103168 MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" @@ -55,7 +55,7 @@ # defaults to the value of micro_num valid_micro_num=4, # defaults to 0, means disable evaluate - valid_every=50, + valid_every=1000, pack_sample_into_one=False, total_steps=50000, skip_batches="", diff --git a/internlm/core/scheduler/no_pipeline_scheduler.py b/internlm/core/scheduler/no_pipeline_scheduler.py index 56661d8c..97687904 100644 --- a/internlm/core/scheduler/no_pipeline_scheduler.py +++ b/internlm/core/scheduler/no_pipeline_scheduler.py @@ -202,10 +202,10 @@ def forward_backward_step( if return_output_label: 
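                # collect per-microbatch outputs and labels so metrics can be
                # computed over the full global batch afterwards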
outputs.append(_output) labels.append(_label) - + if not return_output_label: outputs, labels = None, None - + # Compatible for non-moe if hasattr(gpc.config.model, "num_experts"): return outputs, labels, loss, moe_loss diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 5ee1af9d..5ea0e80b 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -28,9 +28,20 @@ import fused_dense_lib as fused_dense_cuda from flash_attn.ops.activations import gelu_bwd, relu_bwd, sqrelu_fwd, sqrelu_bwd -from flash_attn.utils.distributed import all_gather_raw, reduce_scatter_raw, all_reduce_raw +from flash_attn.utils.distributed import all_gather_raw, all_reduce_raw +# reduce_scatter_raw from flash_attn.utils.distributed import reduce_scatter, all_reduce +def reduce_scatter_raw(input_: Tensor, process_group: ProcessGroup, async_op: bool = False, op=torch.distributed.ReduceOp.SUM): + world_size = torch.distributed.get_world_size(process_group) + assert input_.shape[0] % world_size == 0 + output = torch.empty( + input_.shape[0] // world_size, *input_.shape[1:], dtype=input_.dtype, device=input_.device + ) + handle = torch.distributed.reduce_scatter_tensor( + output, input_.contiguous(), op=op, group=process_group, async_op=async_op + ) + return output, handle class ScaleColumnParallelLinear(nn.Linear): """ @@ -279,15 +290,15 @@ def backward(ctx, grad_output, *args): grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1]) # do all-gather for weight before backward - weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) + total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) handle_weight.wait() if ctx.needs_input_grad[0]: if not ctx.return_residual: - grad_input = F.linear(grad_output, weight.t()) + grad_input = F.linear(grad_output, total_weight.t()) else: grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), - grad_output, weight) + grad_output, total_weight) grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) # if process_group is not None: # import pdb; pdb.set_trace() diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index 8ac8c58d..0db99ad0 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -372,6 +372,7 @@ def __init__( def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=None, inference_params=None): # attention_mask: compute attention on the places where the value is 1 + import pdb; pdb.set_trace() if hasattr(self, "embedding"): hidden_states = self.embedding(input_ids) if self.embed_grad_scale != 1: diff --git a/train.py b/train.py index 9bc4bd7f..1adcc22a 100644 --- a/train.py +++ b/train.py @@ -254,6 +254,7 @@ def main(args): trainer=trainer, start_time=start_time, loss=loss, + moe_loss=None, grad_norm=grad_norm_groups, metric=metric, update_panel=uniscale_logger is not None, From bd4af3a31f595ed6e587e5dccefca14535d9b8dd Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Sun, 8 Oct 2023 17:21:17 +0800 Subject: [PATCH 003/153] modify the all2all --- configs/7B_sft.py | 2 +- internlm/model/modeling_internlm.py | 1 - internlm/model/multi_head_attention.py | 4 ++-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 51d2e9c4..5e3e0c93 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -5,7 +5,7 @@ HIDDEN_SIZE = 4096 NUM_ATTENTION_HEAD = 32 MLP_RATIO = 8 / 3 -NUM_LAYER = 4 +NUM_LAYER = 
32 VOCAB_SIZE = 103168 MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index 0db99ad0..8ac8c58d 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -372,7 +372,6 @@ def __init__( def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=None, inference_params=None): # attention_mask: compute attention on the places where the value is 1 - import pdb; pdb.set_trace() if hasattr(self, "embedding"): hidden_states = self.embedding(input_ids) if self.embed_grad_scale != 1: diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index abb9f19c..e6d0a297 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -115,14 +115,14 @@ def forward(self, qkv: Tensor, **kwargs: Any) -> Tensor: """ # TODO Merge three alltoall calls into one #in shape : e.g., [s/p:h:] - qkv = _SeqAllToAll.apply(self.spg, qkv, self.scatter_idx, self.gather_idx) + qkv = _SeqAllToAll.apply(self.spg, qkv, 2, 0) # key_layer = _SeqAllToAll.apply(self.spg, key, self.scatter_idx, self.gather_idx) # value_layer = _SeqAllToAll.apply(self.spg, value, self.scatter_idx, self.gather_idx) #out shape : e.g., [s:h/p:] context_layer = self.local_attn(qkv, **kwargs) - output = _SeqAllToAll.apply(self.spg, context_layer, 0, 2) + output = _SeqAllToAll.apply(self.spg, context_layer, 0, 1) #out e.g., [s/p::h] return output From 189a313da6a6b6710f07f7e5e13cacb56eeb7256 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Mon, 9 Oct 2023 17:26:20 +0800 Subject: [PATCH 004/153] support fstp and refactor code --- configs/7B_sft.py | 10 +-- internlm/core/context/parallel_context.py | 3 +- internlm/initialize/launch.py | 6 ++ internlm/model/linear.py | 91 +++++++------------ internlm/model/modeling_internlm.py | 29 +++--- internlm/model/multi_head_attention.py | 104 ++++++++-------------- internlm/utils/evaluation.py | 5 +- 7 files changed, 104 insertions(+), 144 deletions(-) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 5e3e0c93..6758167a 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -5,7 +5,7 @@ HIDDEN_SIZE = 4096 NUM_ATTENTION_HEAD = 32 MLP_RATIO = 8 / 3 -NUM_LAYER = 32 +NUM_LAYER = 4 VOCAB_SIZE = 103168 MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" @@ -55,7 +55,7 @@ # defaults to the value of micro_num valid_micro_num=4, # defaults to 0, means disable evaluate - valid_every=1000, + valid_every=10, pack_sample_into_one=False, total_steps=50000, skip_batches="", @@ -64,7 +64,7 @@ min_length=50, # train_folder=TRAIN_FOLDER, # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, + empty_cache_and_diag_interval=100, diag_outlier_ratio=1.1, ) @@ -135,7 +135,7 @@ num_layers=NUM_LAYER, mlp_ratio=MLP_RATIO, apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + dtype="torch.float16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" norm_type="rmsnorm", layer_norm_epsilon=1e-5, use_flash_attn=True, @@ -155,7 +155,7 @@ """ parallel = dict( zero1=-1, - tensor=2, + tensor=dict(size=2, mode='fstp'), # the mode should be 'origin_tp' or 'fstp' pipeline=dict(size=1, interleaved_overlap=True), sequence_parallel=True, ) diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py index 7f3e415a..da6a0d7e 100644 --- 
a/internlm/core/context/parallel_context.py +++ b/internlm/core/context/parallel_context.py @@ -568,7 +568,8 @@ def set_seed(self, seed: int, dpseed_with_tpoffset: bool = False): # during model construction), this is because the random state will be different in different tensor parallel # device of the same data parallel group. The underlying reason is that the device of tp_rank = 0 will perform # additional random operations during the RowParallelLinear module building process. - set_mode(ParallelMode.DUMMY) + # set_mode(ParallelMode.DUMMY) + set_mode(ParallelMode.TENSOR) seeds = get_seeds() seed_str = ", ".join([f"{k}: {v}" for k, v in seeds.items()]) diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py index 660cc559..895779e3 100644 --- a/internlm/initialize/launch.py +++ b/internlm/initialize/launch.py @@ -279,6 +279,12 @@ def args_sanity_check(): assert not ( gpc.config.parallel.sequence_parallel is True and gpc.config.model.use_flash_attn is False ), "sequence parallel does not support use_flash_attn=False" + + if gpc.config.parallel["tensor"].get("mode", None) is None: + gpc.config.parallel["tensor"]["mode"] = "origin_tp" + + if gpc.config.parallel["tensor"].get("mode", None) is 'fstp': + assert gpc.config.parallel.sequence_parallel is True, "when the tp_mode is fstp, the sequence_parallel should be True." # currently only interleaved pipeline scheduler with overlap can guarantee loss accuracy if hasattr(gpc.config.model, "num_chunks") and gpc.config.model.num_chunks > 1: diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 5ea0e80b..60a3d272 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -4,44 +4,20 @@ from typing import Optional import torch +import torch.nn.functional as F from flash_attn.ops.fused_dense import ColumnParallelLinear, RowParallelLinear from flash_attn.utils.distributed import all_reduce, reduce_scatter, all_gather_raw, reduce_scatter_raw from torch import Tensor from torch import nn from torch.cuda.amp import custom_bwd, custom_fwd +# import fused_dense_cuda # from apex +import fused_dense_lib as fused_dense_cuda + from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc from internlm.model.utils import Silu, fused_dense_func_torch -from typing import Optional -from functools import partial - -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch import Tensor -from torch.distributed import ProcessGroup -from torch.cuda.amp import custom_bwd, custom_fwd - -# import fused_dense_cuda # from apex -import fused_dense_lib as fused_dense_cuda - -from flash_attn.ops.activations import gelu_bwd, relu_bwd, sqrelu_fwd, sqrelu_bwd -from flash_attn.utils.distributed import all_gather_raw, all_reduce_raw -# reduce_scatter_raw -from flash_attn.utils.distributed import reduce_scatter, all_reduce - -def reduce_scatter_raw(input_: Tensor, process_group: ProcessGroup, async_op: bool = False, op=torch.distributed.ReduceOp.SUM): - world_size = torch.distributed.get_world_size(process_group) - assert input_.shape[0] % world_size == 0 - output = torch.empty( - input_.shape[0] // world_size, *input_.shape[1:], dtype=input_.dtype, device=input_.device - ) - handle = torch.distributed.reduce_scatter_tensor( - output, input_.contiguous(), op=op, group=process_group, async_op=async_op - ) - return output, handle class ScaleColumnParallelLinear(nn.Linear): """ @@ -231,7 +207,7 @@ def forward(self, x): out = self.w3(Silu(w1_o, w2_o)) return out -class 
FusedDenseFunc_fsdp(torch.autograd.Function): +class FSDPFusedDenseFunc(torch.autograd.Function): @staticmethod @custom_fwd @@ -243,21 +219,26 @@ def forward(ctx, x, weight, bias, return_residual=False, process_group=None): if torch.is_autocast_enabled(): x = x.to(dtype=torch.get_autocast_gpu_dtype()) - x = x.contiguous() - total_x = x + total_x = x.contiguous() - # do all_gather for weight and bias before actual computation - total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) - if bias is not None: - total_bias, handle_bias = all_gather_raw(bias, process_group, async_op=True) - handle_bias.wait() + world_size = gpc.get_world_size(ParallelMode.TENSOR) + if world_size > 1: + # do all_gather for weight and bias before actual computation + total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) + if bias is not None: + total_bias, handle_bias = all_gather_raw(bias, process_group, async_op=True) + handle_bias.wait() + else: + total_bias = bias + handle_weight.wait() else: + total_weight = weight total_bias = bias if torch.is_autocast_enabled(): total_weight = total_weight.to(dtype=torch.get_autocast_gpu_dtype()) total_bias = total_bias.to(dtype=torch.get_autocast_gpu_dtype()) if bias is not None else None - handle_weight.wait() + total_weight = total_weight.contiguous() batch_shape, n = total_x.shape[:-1], total_x.shape[-1] batch_dim = batch_shape.numel() @@ -289,9 +270,13 @@ def backward(ctx, grad_output, *args): batch_dim = batch_shape.numel() grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1]) - # do all-gather for weight before backward - total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) - handle_weight.wait() + world_size = gpc.get_world_size(ParallelMode.TENSOR) + if world_size > 1: + # do all-gather for weight before backward + total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) + handle_weight.wait() + else: + total_weight = weight if ctx.needs_input_grad[0]: if not ctx.return_residual: @@ -300,32 +285,24 @@ def backward(ctx, grad_output, *args): grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), grad_output, total_weight) grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) - # if process_group is not None: - # import pdb; pdb.set_trace() - # grad_input, handle_grad_input = reduce_scatter_raw(grad_input, process_group, async_op=True) - # grad_input, handle_grad_input = all_reduce_raw(grad_input, process_group, async_op=True) - else: grad_input = None - # import pdb; pdb.set_trace() + if ctx.needs_input_grad[1]: assert ctx.compute_weight_gradient grad_weight, grad_bias = fused_dense_cuda.linear_bias_wgrad( total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2] ) - grad_weight, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) - if grad_bias is not None: - grad_bias, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) - handle_grad_bias.wait() - handle_grad_weight.wait() - + if world_size > 1: + grad_weight, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) + if grad_bias is not None: + grad_bias, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) + handle_grad_bias.wait() + handle_grad_weight.wait() else: grad_weight = None grad_bias = grad_output if ctx.needs_input_grad[2] else None - # if process_group is not None and ctx.needs_input_grad[0]: - # 
handle_grad_input.wait() - # import pdb; pdb.set_trace() return grad_input, grad_weight, grad_bias, None, None, None @@ -334,7 +311,7 @@ def fsdp_fused_dense_func(x: Tensor, weight: Tensor, bias: Optional[Tensor] = No dtype_eligible = (x.dtype in [torch.float16, torch.bfloat16] or (x.dtype == torch.float32 and torch.is_autocast_enabled())) if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible: - return FusedDenseFunc_fsdp.apply(x, weight, bias, return_residual, process_group) + return FSDPFusedDenseFunc.apply(x, weight, bias, return_residual, process_group) else: assert process_group is None out = F.linear(x, weight, bias) @@ -426,5 +403,5 @@ def __init__( def forward(self, x): w1_o = self.w1(x) w2_o = self.w2(x) - out = self.w3(Silu(w1_o, w2_o)) + out = self.w3(F.silu(w1_o) * w2_o) return out diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index 8ac8c58d..47d706f6 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -74,6 +74,7 @@ def __init__( use_scaled_init: bool = True, use_swiglu: bool = True, use_flash_attn: bool = True, + tp_mode: str = 'origin_tp', ): super().__init__() self.checkpoint = checkpoint @@ -98,6 +99,7 @@ def __init__( use_flash_attn=use_flash_attn, device=device, dtype=dtype, + tp_mode=tp_mode, ) self.dropout1 = nn.Dropout(drop_rate) @@ -109,16 +111,8 @@ def __init__( self.norm2 = nn.LayerNorm(hidden_size, eps=layer_norm_epsilon) if use_swiglu: - # self.mlp = FeedForward( - # hidden_size, - # int(hidden_size * mlp_ratio), - # out_features=hidden_size, - # process_group=gpc.get_group(ParallelMode.TENSOR), - # bias=False, - # device=device, - # dtype=dtype, - # ) - self.mlp = FSDPFeedForward( + mlp_cls = FeedForward if tp_mode == 'origin_tp' else FSDPFeedForward + self.mlp = mlp_cls( hidden_size, int(hidden_size * mlp_ratio), out_features=hidden_size, @@ -179,6 +173,7 @@ def reset_parameters(self): else: normal_(std=0.006 if "fc1" in name else 0.0015)(param.data) + def forward(self, hidden_states, cu_seqlens=None, indexes=None, inference_params=None, max_seqlen=None): if self.checkpoint and self.training: return activation_checkpoint( @@ -300,12 +295,12 @@ def __init__( super().__init__() checkpoint_layer_num = int(num_layers * checkpoint) + self.tp_mode = gpc.config.parallel["tensor"]["mode"] if is_reward: head_cls = RewardModelLinear else: - # head_cls = ScaleColumnParallelLinear - head_cls = FSDPScaleLinear + head_cls = ScaleColumnParallelLinear if first: if embed_split_hidden: self.embedding = Embedding1D(num_embeddings=vocab_size, embedding_dim=hidden_size) @@ -346,6 +341,7 @@ def __init__( use_scaled_init=use_scaled_init, use_swiglu=use_swiglu, use_flash_attn=use_flash_attn, + tp_mode = self.tp_mode, ) for lid in range(num_layers) ] @@ -391,7 +387,8 @@ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N assert len(indexes) == 1 # The indexes are used to indicate the actual position IDs of each token in the packed input. indexes = indexes[0] - if gpc.config.parallel.sequence_parallel: + # if the tensor parallel mode is 'fstp', the indexes should also be split in sequence dimension. 
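            # e.g. with tp size 2 and a packed sequence of 8 tokens, rank 0 keeps
            # indexes[:4] and rank 1 keeps indexes[4:], mirroring the sequence
            # split the fstp linears apply to hidden_states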
+ if gpc.config.parallel.sequence_parallel and self.tp_mode == 'fstp': indexes = split_forward_gather_backward(indexes, ParallelMode.TENSOR, dim=0) max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() if cu_seqlens is not None else None @@ -408,8 +405,12 @@ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N if hasattr(self, "norm"): hidden_states = self.norm(hidden_states.float()) if hasattr(self, "head"): + # if hidden_states.ndim == 3: + # import pdb; pdb.set_trace() + # hidden_states = self.head(hidden_states, dim=1) + # else: + # hidden_states = self.head(hidden_states) hidden_states = self.head(hidden_states) - hidden_states = gather_forward_split_backward(hidden_states, ParallelMode.TENSOR, dim=0) if not self.parallel_output: hidden_states = gather_forward_split_backward(hidden_states, ParallelMode.TENSOR, dim=-1) diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index e6d0a297..8f7a064d 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -57,49 +57,29 @@ class DistributedAttention(torch.nn.Module): Arguments: local_attention (Module): local attention with q,k,v sequence_process_group (ProcessGroup): sequence parallel process group - scatter_idx (int): scatter_idx for all2all comm - gather_idx (int): gather_idx for all2all comm + first_scatter_idx (int): scatter_idx for the first all2all comm + first_gather_idx (int): gather_idx for the first all2all comm + second_scatter_idx (int): scatter_idx for the second all2all comm + second_gather_idx (int): gather_idx for the second all2all comm """ def __init__( self, local_attention: Module, sequence_process_group: dist.ProcessGroup, - scatter_idx: int = 2, - gather_idx: int = 0, + first_scatter_idx: int = 2, + first_gather_idx: int = 0, + second_scatter_idx: int = 0, + second_gather_idx: int = 1, ) -> None: super(DistributedAttention, self).__init__() self.local_attn = local_attention self.spg = sequence_process_group - self.scatter_idx = scatter_idx - self.gather_idx = gather_idx - - # def forward(self, query: Tensor, key: Tensor, value: Tensor, *args: Any) -> Tensor: - # """ forward - - # Arguments: - # query (Tensor): query input to the layer - # key (Tensor): key input to the layer - # value (Tensor): value input to the layer - # args: other args - - # Returns: - # * output (Tensor): context output - # """ - # # TODO Merge three alltoall calls into one - # #in shape : e.g., [s/p:h:] - # query_layer = _SeqAllToAll.apply(self.spg, query, self.scatter_idx, self.gather_idx) - # key_layer = _SeqAllToAll.apply(self.spg, key, self.scatter_idx, self.gather_idx) - # value_layer = _SeqAllToAll.apply(self.spg, value, self.scatter_idx, self.gather_idx) - - # #out shape : e.g., [s:h/p:] - # context_layer = self.local_attn(query_layer, key_layer, value_layer, *args) - - # output = _SeqAllToAll.apply(self.spg, context_layer, self.gather_idx, self.scatter_idx) - - # #out e.g., [s/p::h] - # return output + self.first_scatter_idx = first_scatter_idx + self.first_gather_idx = first_gather_idx + self.second_scatter_idx = second_scatter_idx + self.second_gather_idx = second_gather_idx def forward(self, qkv: Tensor, **kwargs: Any) -> Tensor: """ forward @@ -114,15 +94,21 @@ def forward(self, qkv: Tensor, **kwargs: Any) -> Tensor: * output (Tensor): context output """ # TODO Merge three alltoall calls into one - #in shape : e.g., [s/p:h:] - qkv = _SeqAllToAll.apply(self.spg, qkv, 2, 0) - # key_layer = _SeqAllToAll.apply(self.spg, 
key, self.scatter_idx, self.gather_idx) - # value_layer = _SeqAllToAll.apply(self.spg, value, self.scatter_idx, self.gather_idx) - - #out shape : e.g., [s:h/p:] - context_layer = self.local_attn(qkv, **kwargs) - - output = _SeqAllToAll.apply(self.spg, context_layer, 0, 1) + if qkv.ndim == 5: + # in shape: [seq/tp_size, 3, head, head_dim] + qkv = _SeqAllToAll.apply(self.spg, qkv, self.first_scatter_idx + 1, self.first_gather_idx + 1) + #out shape : [seq, head/tp_size, head_dim] + context_layer = self.local_attn(qkv, **kwargs) + # in shape: [seq, head/tp_size, head_dim] + output = _SeqAllToAll.apply(self.spg, context_layer, self.second_scatter_idx + 1, self.second_gather_idx + 1) + else: + + # in shape: [seq/tp_size, 3, head, head_dim] + qkv = _SeqAllToAll.apply(self.spg, qkv, self.first_scatter_idx, self.first_gather_idx) + #out shape : [seq, head/tp_size, head_dim] + context_layer = self.local_attn(qkv, **kwargs) + # in shape: [seq, head/tp_size, head_dim] + output = _SeqAllToAll.apply(self.spg, context_layer, self.second_scatter_idx, self.second_gather_idx) #out e.g., [s/p::h] return output @@ -171,6 +157,7 @@ def __init__( use_flash_attn: bool = True, device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, + tp_mode: str = 'origin_tp', ) -> None: factory_kwargs = {"device": device, "dtype": dtype} super().__init__() @@ -198,16 +185,8 @@ def __init__( self.rotary_emb = RotaryEmbedding(self.rotary_emb_dim, scale_base=rotary_emb_scale_base, device=device) # notice here should change bias=True - # self.Wqkv = ColumnParallelLinearTorch( - # embed_dim, - # 3 * embed_dim, - # process_group, - # bias=True, - # sequence_parallel=gpc.config.parallel.sequence_parallel, - # **factory_kwargs, - # ) # according to https://spaces.ac.cn/archives/9577 - - self.Wqkv = FSDPLinear( + Wqkv_cls = ColumnParallelLinearTorch if tp_mode == 'origin_tp' else FSDPLinear + self.Wqkv = Wqkv_cls( embed_dim, 3 * embed_dim, process_group, @@ -222,25 +201,20 @@ def __init__( self.inner_cross_attn = inner_cross_attn_cls( causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout ) - - self.inner_attn_sp = DistributedAttention(self.inner_attn, sequence_process_group=process_group, scatter_idx=3, gather_idx=0) - self.inner_cross_attn_sp = DistributedAttention(self.inner_cross_attn, sequence_process_group=process_group, scatter_idx=3, gather_idx=0) + if tp_mode == 'fstp': + self.inner_attn = DistributedAttention(self.inner_attn, sequence_process_group=process_group) + self.inner_cross_attn = DistributedAttention(self.inner_cross_attn, sequence_process_group=process_group) # output projection always have the bias (for now) - # self.out_proj = RowParallelLinearTorch( - # embed_dim, - # embed_dim, - # process_group, - # sequence_parallel=gpc.config.parallel.sequence_parallel, - # **factory_kwargs, - # ) - self.out_proj = FSDPLinear( + out_proj_cls = RowParallelLinearTorch if tp_mode == 'origin_tp' else FSDPLinear + self.out_proj = out_proj_cls( embed_dim, embed_dim, process_group, sequence_parallel=gpc.config.parallel.sequence_parallel, **factory_kwargs, ) + # need to assign tp attribute so that internlm know it is tensor parallel module if gpc.get_world_size(ParallelMode.TENSOR) > 1: for name in ["out_proj", "Wqkv"]: @@ -343,11 +317,9 @@ def _packed_forward(self, x, inference_params=None, **kwargs): with torch.cuda.amp.autocast(dtype=torch.bfloat16): if qkv.dtype not in [torch.float16, torch.bfloat16]: qkv = qkv.to(torch.bfloat16) - # context = self.inner_attn(qkv, **kwargs).to(x.dtype) - context 
= self.inner_attn_sp(qkv, **kwargs).to(x.dtype) + context = self.inner_attn(qkv, **kwargs).to(x.dtype) else: - # context = self.inner_attn(qkv, **kwargs) - context = self.inner_attn_sp(qkv, **kwargs) + context = self.inner_attn(qkv, **kwargs) else: raise RuntimeError("Not support this right now") diff --git a/internlm/utils/evaluation.py b/internlm/utils/evaluation.py index 6a55fa56..2a11a478 100644 --- a/internlm/utils/evaluation.py +++ b/internlm/utils/evaluation.py @@ -54,7 +54,10 @@ def switch_evaluation_pipeline_scheduler(trainer, num_microbatches, tensor_shape def switch_sequence_parallel_mode(): prev_mode = gpc.config.parallel.sequence_parallel try: - gpc.config.parallel.sequence_parallel = False + if gpc.config.parallel["tensor"]["mode"] == 'fstp': + gpc.config.parallel.sequence_parallel = True + else: + gpc.config.parallel.sequence_parallel = False yield finally: gpc.config.parallel.sequence_parallel = prev_mode From 21c1a7fa47bc49eca26dc63d33a1f57d855e15dd Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Mon, 9 Oct 2023 18:01:06 +0800 Subject: [PATCH 005/153] support evaluation with fstp --- configs/7B_sft.py | 4 +- internlm/model/linear.py | 7 +- internlm/model/modeling_internlm.py | 10 +-- internlm/model/utils.py | 121 ++++++++++++++++++++++++++-- 4 files changed, 124 insertions(+), 18 deletions(-) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 6758167a..3e1d0780 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -5,7 +5,7 @@ HIDDEN_SIZE = 4096 NUM_ATTENTION_HEAD = 32 MLP_RATIO = 8 / 3 -NUM_LAYER = 4 +NUM_LAYER = 32 VOCAB_SIZE = 103168 MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" @@ -155,7 +155,7 @@ """ parallel = dict( zero1=-1, - tensor=dict(size=2, mode='fstp'), # the mode should be 'origin_tp' or 'fstp' + tensor=dict(size=2, mode='origin_tp'), # the mode should be 'origin_tp' or 'fstp' pipeline=dict(size=1, interleaved_overlap=True), sequence_parallel=True, ) diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 60a3d272..fbe6f141 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -54,7 +54,7 @@ def __init__( self.process_group = process_group self.weight_scale = weight_scale - def forward(self, input): # pylint: disable=W0622 + def forward(self, input, gather_dim=0): # pylint: disable=W0622 # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism: # we do an all_gather of x before doing the matmul. # If not, then the input is already gathered. @@ -68,6 +68,7 @@ def forward(self, input): # pylint: disable=W0622 self.bias, process_group=self.process_group, sequence_parallel=gpc.config.parallel.sequence_parallel, + gather_dim=gather_dim, ) @@ -121,13 +122,13 @@ def forward(self, input): # pylint: disable=W0622 class ColumnParallelLinearTorch(ColumnParallelLinear): - def forward(self, x): + def forward(self, x, gather_dim=0): # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism: # we do an all_gather of x before doing the matmul. # If not, then the input is already gathered. 
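        # gather_dim picks the all-gather axis for the input: dim 0 for packed
        # (total_tokens, hidden) training inputs, dim 1 for (batch, seq, hidden)
        # inputs as passed by the head during evaluation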
return fused_dense_func_torch( - x, self.weight, self.bias, process_group=self.process_group, sequence_parallel=self.sequence_parallel + x, self.weight, self.bias, process_group=self.process_group, sequence_parallel=self.sequence_parallel, gather_dim=gather_dim, ) diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index 47d706f6..56a8efac 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -405,12 +405,10 @@ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N if hasattr(self, "norm"): hidden_states = self.norm(hidden_states.float()) if hasattr(self, "head"): - # if hidden_states.ndim == 3: - # import pdb; pdb.set_trace() - # hidden_states = self.head(hidden_states, dim=1) - # else: - # hidden_states = self.head(hidden_states) - hidden_states = self.head(hidden_states) + if hidden_states.ndim == 3: + hidden_states = self.head(hidden_states, gather_dim=1) + else: + hidden_states = self.head(hidden_states) if not self.parallel_output: hidden_states = gather_forward_split_backward(hidden_states, ParallelMode.TENSOR, dim=-1) diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 570a86f0..33c8c46e 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -5,16 +5,18 @@ import torch import torch.nn.functional as F -from flash_attn.ops.fused_dense import FusedDenseFunc +# from flash_attn.ops.fused_dense import FusedDenseFunc from flash_attn.utils.distributed import ( - all_gather_raw, + # all_gather_raw, all_reduce_raw, reduce_scatter_raw, ) from torch import Tensor -from torch.cuda.amp import custom_bwd +from torch.cuda.amp import custom_bwd, custom_fwd from torch.distributed import ProcessGroup +import fused_dense_lib as fused_dense_cuda + from internlm.core.context import global_context as gpc from internlm.utils.logger import get_logger @@ -94,6 +96,109 @@ def linear_bias_wgrad_torch(my_input, grad_output, has_d_bias): grad_bias = grad_output.sum(dim=0) if has_d_bias else None return grad_weight, grad_bias +def all_gather_raw(input_: Tensor, process_group: ProcessGroup, async_op: bool = False, gather_dim: int = 0): + world_size = torch.distributed.get_world_size(process_group) + shape = list(input_.shape) + shape[gather_dim] = shape[gather_dim] * world_size + # output = torch.empty(world_size * input_.shape[0], *input_.shape[1:], + # dtype=input_.dtype, device=input_.device) + output = torch.empty(shape, dtype=input_.dtype, device=input_.device) + handle = torch.distributed.all_gather_into_tensor(output, input_.contiguous(), + group=process_group, async_op=async_op) + return output, handle + +class FusedDenseFunc(torch.autograd.Function): + + @staticmethod + @custom_fwd + def forward(ctx, x, weight, bias, return_residual=False, process_group=None, + sequence_parallel=True, gather_dim=0): + """ + If process_group is not None and sequence_parallel=True, we're doing Tensor Parallel + with sequence parallelism: we do an all_gather_raw of x before doing the matmul. 
+ """ + ctx.compute_weight_gradient = weight.requires_grad + ctx.return_residual = return_residual + ctx.process_group = process_group + ctx.sequence_parallel = sequence_parallel + ctx.gather_dim = gather_dim + + if torch.is_autocast_enabled(): + x = x.to(dtype=torch.get_autocast_gpu_dtype()) + x = x.contiguous() + if process_group is not None and sequence_parallel: + # We want to kick off the all_gather early, before weight dtype conversion + total_x, handle_x = all_gather_raw(x, process_group, async_op=True, gather_dim=gather_dim) + else: + total_x = x + + if torch.is_autocast_enabled(): + weight = weight.to(dtype=torch.get_autocast_gpu_dtype()) + bias = bias.to(dtype=torch.get_autocast_gpu_dtype()) if bias is not None else None + weight = weight.contiguous() + if process_group is not None and sequence_parallel: + handle_x.wait() + batch_shape, n = total_x.shape[:-1], total_x.shape[-1] + batch_dim = batch_shape.numel() + # https://github.com/pytorch/pytorch/blob/5b51849b48a7dbccd297286cc0110def4706f9e7/aten/src/ATen/native/cuda/Blas.cpp#L174 + if min(batch_dim, n, *weight.shape) > 65535 * 32: + raise RuntimeError('fused_dense only supports matrix dims <= 2M') + output = F.linear(total_x, weight, bias) + if ctx.compute_weight_gradient: + ctx.save_for_backward(x, weight) + else: + ctx.save_for_backward(weight) + return output if not return_residual else (output, x) + + @staticmethod + @custom_bwd + def backward(ctx, grad_output, *args): + grad_output = grad_output.contiguous() + if ctx.return_residual: + grad_input, = args + grad_input = grad_input.contiguous() + process_group = ctx.process_group + sequence_parallel = ctx.sequence_parallel + gather_dim = ctx.gather_dim + + if ctx.compute_weight_gradient: + x, weight = ctx.saved_tensors + if process_group is not None and sequence_parallel: + total_x, handle_x = all_gather_raw(x, process_group, async_op=True, gather_dim=gather_dim) + else: + total_x = x + else: + weight, = ctx.saved_tensors + total_x = None + batch_shape = grad_output.shape[:-1] + batch_dim = batch_shape.numel() + grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1]) + if ctx.needs_input_grad[0]: + if not ctx.return_residual: + grad_input = F.linear(grad_output, weight.t()) + else: + grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), + grad_output, weight) + grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) + if process_group is not None: + reduce_fn = reduce_scatter_raw if sequence_parallel else all_reduce_raw + grad_input, handle_grad_input = reduce_fn(grad_input, process_group, async_op=True) + else: + grad_input = None + if ctx.needs_input_grad[1]: + assert ctx.compute_weight_gradient + if process_group is not None and sequence_parallel: + handle_x.wait() + grad_weight, grad_bias = fused_dense_cuda.linear_bias_wgrad( + total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2] + ) + else: + grad_weight = None + grad_bias = grad_output if ctx.needs_input_grad[2] else None + if process_group is not None and ctx.needs_input_grad[0]: + handle_grad_input.wait() + return grad_input, grad_weight, grad_bias, None, None, None, None + # adpated from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/ops/fused_dense.py class FusedDenseFuncTorch(FusedDenseFunc): @@ -108,10 +213,11 @@ def backward(ctx, grad_output, *args): grad_input = grad_input.contiguous() process_group = ctx.process_group sequence_parallel = ctx.sequence_parallel + gather_dim = ctx.gather_dim if 
ctx.compute_weight_gradient: x, weight = ctx.saved_tensors if process_group is not None and sequence_parallel: - total_x, handle_x = all_gather_raw(x, process_group, async_op=True) + total_x, handle_x = all_gather_raw(x, process_group, async_op=True, gather_dim=gather_dim) else: total_x = x else: @@ -144,7 +250,7 @@ def backward(ctx, grad_output, *args): grad_bias = grad_output if ctx.needs_input_grad[2] else None if process_group is not None and ctx.needs_input_grad[0]: handle_grad_input.wait() - return grad_input, grad_weight, grad_bias, None, None, None + return grad_input, grad_weight, grad_bias, None, None, None, None def fused_dense_func_torch( @@ -154,14 +260,15 @@ def fused_dense_func_torch( return_residual: bool = False, process_group: Optional[ProcessGroup] = None, sequence_parallel: bool = True, + gather_dim: int = 0, ): dtype_eligible = x.dtype in [torch.float16, torch.bfloat16] or ( x.dtype == torch.float32 and torch.is_autocast_enabled() ) if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible: - return FusedDenseFunc.apply(x, weight, bias, return_residual, process_group, sequence_parallel) + return FusedDenseFunc.apply(x, weight, bias, return_residual, process_group, sequence_parallel, gather_dim) else: - return FusedDenseFuncTorch.apply(x, weight, bias, return_residual, process_group, sequence_parallel) + return FusedDenseFuncTorch.apply(x, weight, bias, return_residual, process_group, sequence_parallel, gather_dim) class _SplitForwardGatherBackward(torch.autograd.Function): From 949431f228cdf0dbfdcd0909b905cb6075517eb6 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Mon, 9 Oct 2023 18:06:22 +0800 Subject: [PATCH 006/153] modify the config --- configs/7B_sft.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 3e1d0780..dd4104ab 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -55,7 +55,7 @@ # defaults to the value of micro_num valid_micro_num=4, # defaults to 0, means disable evaluate - valid_every=10, + valid_every=50, pack_sample_into_one=False, total_steps=50000, skip_batches="", @@ -64,7 +64,7 @@ min_length=50, # train_folder=TRAIN_FOLDER, # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, + empty_cache_and_diag_interval=10, diag_outlier_ratio=1.1, ) @@ -135,7 +135,7 @@ num_layers=NUM_LAYER, mlp_ratio=MLP_RATIO, apply_post_layer_norm=False, - dtype="torch.float16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" norm_type="rmsnorm", layer_norm_epsilon=1e-5, use_flash_attn=True, @@ -155,9 +155,9 @@ """ parallel = dict( zero1=-1, - tensor=dict(size=2, mode='origin_tp'), # the mode should be 'origin_tp' or 'fstp' + tensor=dict(size=2, mode='origin_tp'), # the mode should be 'origin_tp' or 'fstp'. 
if the mode is 'fstp', the sequence_parallel should be True pipeline=dict(size=1, interleaved_overlap=True), - sequence_parallel=True, + sequence_parallel=False, ) cudnn_deterministic = False From 54e561665eb65f0212686051c943f73fd98c716f Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Mon, 9 Oct 2023 18:08:15 +0800 Subject: [PATCH 007/153] remove useless code for no-pp --- internlm/core/scheduler/no_pipeline_scheduler.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/internlm/core/scheduler/no_pipeline_scheduler.py b/internlm/core/scheduler/no_pipeline_scheduler.py index 97687904..6777acc5 100644 --- a/internlm/core/scheduler/no_pipeline_scheduler.py +++ b/internlm/core/scheduler/no_pipeline_scheduler.py @@ -202,10 +202,8 @@ def forward_backward_step( if return_output_label: outputs.append(_output) labels.append(_label) - if not return_output_label: outputs, labels = None, None - # Compatible for non-moe if hasattr(gpc.config.model, "num_experts"): return outputs, labels, loss, moe_loss From 144731c35c47171ab675e5fc9557468450a5a666 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Mon, 9 Oct 2023 20:04:27 +0800 Subject: [PATCH 008/153] fix evaluation bug in pp --- internlm/initialize/launch.py | 2 +- internlm/utils/evaluation.py | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py index 3651a4c7..5bd2b73c 100644 --- a/internlm/initialize/launch.py +++ b/internlm/initialize/launch.py @@ -283,7 +283,7 @@ def args_sanity_check(): if gpc.config.parallel["tensor"].get("mode", None) is None: gpc.config.parallel["tensor"]["mode"] = "origin_tp" - if gpc.config.parallel["tensor"].get("mode", None) is 'fstp': + if gpc.config.parallel["tensor"].get("mode", None) == 'fstp': assert gpc.config.parallel.sequence_parallel is True, "when the tp_mode is fstp, the sequence_parallel should be True." # currently only interleaved pipeline scheduler with overlap can guarantee loss accuracy diff --git a/internlm/utils/evaluation.py b/internlm/utils/evaluation.py index 2a11a478..148d19df 100644 --- a/internlm/utils/evaluation.py +++ b/internlm/utils/evaluation.py @@ -106,9 +106,15 @@ def evaluate_on_val_dls( total_val_bsz = len(batch[1]) assert total_val_bsz % data_cfg.micro_bsz == 0 num_microbatches = total_val_bsz // data_cfg.micro_bsz - tensor_shape = torch.Size( - [data_cfg.micro_bsz, batch[0]["input_ids"].shape[1], gpc.config.HIDDEN_SIZE] - ) + if gpc.config.parallel['tensor']['mode'] == 'fstp': + sequence_world_size = gpc.get_world_size(ParallelMode.TENSOR) + tensor_shape = torch.Size( + [data_cfg.micro_bsz, batch[0]["input_ids"].shape[1] // sequence_world_size, gpc.config.HIDDEN_SIZE] + ) + else: + tensor_shape = torch.Size( + [data_cfg.micro_bsz, batch[0]["input_ids"].shape[1], gpc.config.HIDDEN_SIZE] + ) with switch_evaluation_pipeline_scheduler( trainer=trainer, From ef9e7cc6221823a610e3a9b0c369745d7f1e1f71 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Mon, 9 Oct 2023 20:05:39 +0800 Subject: [PATCH 009/153] modify the config --- configs/7B_sft.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index dd4104ab..4c55feea 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -154,8 +154,8 @@ tensor parallel: tensor parallel size, usually the number of GPUs per node. """ parallel = dict( - zero1=-1, - tensor=dict(size=2, mode='origin_tp'), # the mode should be 'origin_tp' or 'fstp'. 
if the mode is 'fstp', the sequence_parallel should be True + zero1=8, + tensor=dict(size=1, mode='origin_tp'), # the mode should be 'origin_tp' or 'fstp'. if the mode is 'fstp', the sequence_parallel should be True pipeline=dict(size=1, interleaved_overlap=True), sequence_parallel=False, ) From 5d39c332fe01d08736cc42ff5613cf887d9e34b6 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Mon, 9 Oct 2023 20:08:49 +0800 Subject: [PATCH 010/153] restore train.py --- train.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/train.py b/train.py index 1adcc22a..139bac1f 100644 --- a/train.py +++ b/train.py @@ -110,6 +110,7 @@ def main(args): # initialize and resume train state train_state = TrainState(gpc.config, train_dl.batch_sampler) + optimizer, beta2_scheduler, lr_scheduler = initialize_optimizer(model=model) ckpt_manager = CheckpointManager( @@ -169,7 +170,6 @@ def main(args): beta2_scheduler=beta2_scheduler, scheduler_hooks=scheduler_hooks, ) - # initialize simple memory profiler if args.profiling: @@ -219,9 +219,21 @@ def main(args): # do forward and backward timer("fwd-bwd").start() - _, _, loss = trainer.execute_schedule( - batch, forward_only=False, return_loss=True, return_output_label=False - ) + moe_loss = None + if hasattr(gpc.config.model, "num_experts"): + _, _, loss, moe_loss = trainer.execute_schedule( + batch, + forward_only=False, + return_loss=True, + return_output_label=False, + ) + else: + _, _, loss = trainer.execute_schedule( + batch, + forward_only=False, + return_loss=True, + return_output_label=False, + ) timer("fwd-bwd").stop() # update parameters, and returns (success_update, grad_norm) @@ -254,7 +266,7 @@ def main(args): trainer=trainer, start_time=start_time, loss=loss, - moe_loss=None, + moe_loss=moe_loss, grad_norm=grad_norm_groups, metric=metric, update_panel=uniscale_logger is not None, From 29df765f65fe9797b6168008efd4dc3bf7b8cfd6 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Mon, 9 Oct 2023 20:23:32 +0800 Subject: [PATCH 011/153] refactor code --- .../core/scheduler/no_pipeline_scheduler.py | 2 + internlm/model/linear.py | 118 +---------- internlm/model/utils.py | 183 ++++++++++++++---- 3 files changed, 151 insertions(+), 152 deletions(-) diff --git a/internlm/core/scheduler/no_pipeline_scheduler.py b/internlm/core/scheduler/no_pipeline_scheduler.py index 6777acc5..56661d8c 100644 --- a/internlm/core/scheduler/no_pipeline_scheduler.py +++ b/internlm/core/scheduler/no_pipeline_scheduler.py @@ -202,8 +202,10 @@ def forward_backward_step( if return_output_label: outputs.append(_output) labels.append(_label) + if not return_output_label: outputs, labels = None, None + # Compatible for non-moe if hasattr(gpc.config.model, "num_experts"): return outputs, labels, loss, moe_loss diff --git a/internlm/model/linear.py b/internlm/model/linear.py index fbe6f141..4075e9ee 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -6,17 +6,13 @@ import torch import torch.nn.functional as F from flash_attn.ops.fused_dense import ColumnParallelLinear, RowParallelLinear -from flash_attn.utils.distributed import all_reduce, reduce_scatter, all_gather_raw, reduce_scatter_raw -from torch import Tensor +from flash_attn.utils.distributed import all_reduce, reduce_scatter from torch import nn -from torch.cuda.amp import custom_bwd, custom_fwd -# import fused_dense_cuda # from apex -import fused_dense_lib as fused_dense_cuda from internlm.core.context import ParallelMode from 
internlm.core.context import global_context as gpc -from internlm.model.utils import Silu, fused_dense_func_torch +from internlm.model.utils import Silu, fused_dense_func_torch, fsdp_fused_dense_func class ScaleColumnParallelLinear(nn.Linear): @@ -208,116 +204,6 @@ def forward(self, x): out = self.w3(Silu(w1_o, w2_o)) return out -class FSDPFusedDenseFunc(torch.autograd.Function): - - @staticmethod - @custom_fwd - def forward(ctx, x, weight, bias, return_residual=False, process_group=None): - - ctx.compute_weight_gradient = weight.requires_grad - ctx.return_residual = return_residual - ctx.process_group = process_group - - if torch.is_autocast_enabled(): - x = x.to(dtype=torch.get_autocast_gpu_dtype()) - total_x = x.contiguous() - - world_size = gpc.get_world_size(ParallelMode.TENSOR) - if world_size > 1: - # do all_gather for weight and bias before actual computation - total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) - if bias is not None: - total_bias, handle_bias = all_gather_raw(bias, process_group, async_op=True) - handle_bias.wait() - else: - total_bias = bias - handle_weight.wait() - else: - total_weight = weight - total_bias = bias - - if torch.is_autocast_enabled(): - total_weight = total_weight.to(dtype=torch.get_autocast_gpu_dtype()) - total_bias = total_bias.to(dtype=torch.get_autocast_gpu_dtype()) if bias is not None else None - - total_weight = total_weight.contiguous() - batch_shape, n = total_x.shape[:-1], total_x.shape[-1] - batch_dim = batch_shape.numel() - # https://github.com/pytorch/pytorch/blob/5b51849b48a7dbccd297286cc0110def4706f9e7/aten/src/ATen/native/cuda/Blas.cpp#L174 - if min(batch_dim, n, *total_weight.shape) > 65535 * 32: - raise RuntimeError('fused_dense only supports matrix dims <= 2M') - output = F.linear(total_x, total_weight, total_bias) - if ctx.compute_weight_gradient: - ctx.save_for_backward(x, weight) - else: - ctx.save_for_backward(weight) - return output if not return_residual else (output, x) - - @staticmethod - @custom_bwd - def backward(ctx, grad_output, *args): - grad_output = grad_output.contiguous() - if ctx.return_residual: - grad_input, = args - grad_input = grad_input.contiguous() - process_group = ctx.process_group - if ctx.compute_weight_gradient: - x, weight = ctx.saved_tensors - total_x = x - else: - weight, = ctx.saved_tensors - total_x = None - batch_shape = grad_output.shape[:-1] - batch_dim = batch_shape.numel() - grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1]) - - world_size = gpc.get_world_size(ParallelMode.TENSOR) - if world_size > 1: - # do all-gather for weight before backward - total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) - handle_weight.wait() - else: - total_weight = weight - - if ctx.needs_input_grad[0]: - if not ctx.return_residual: - grad_input = F.linear(grad_output, total_weight.t()) - else: - grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), - grad_output, total_weight) - grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) - else: - grad_input = None - - if ctx.needs_input_grad[1]: - assert ctx.compute_weight_gradient - - grad_weight, grad_bias = fused_dense_cuda.linear_bias_wgrad( - total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2] - ) - if world_size > 1: - grad_weight, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) - if grad_bias is not None: - grad_bias, handle_grad_bias = reduce_scatter_raw(grad_bias, 
process_group, async_op=True)
-                    handle_grad_bias.wait()
-                handle_grad_weight.wait()
-            else:
-                grad_weight = None
-                grad_bias = grad_output if ctx.needs_input_grad[2] else None
-        return grad_input, grad_weight, grad_bias, None, None, None
-
-
-def fsdp_fused_dense_func(x: Tensor, weight: Tensor, bias: Optional[Tensor] = None,
-                     return_residual: bool = False, process_group = None):
-    dtype_eligible = (x.dtype in [torch.float16, torch.bfloat16]
-                      or (x.dtype == torch.float32 and torch.is_autocast_enabled()))
-    if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible:
-        return FSDPFusedDenseFunc.apply(x, weight, bias, return_residual, process_group)
-    else:
-        assert process_group is None
-        out = F.linear(x, weight, bias)
-        return out if not return_residual else (out, x)
-
 class FSDPLinear(ColumnParallelLinear):
 
     def forward(self, x):
diff --git a/internlm/model/utils.py b/internlm/model/utils.py
index 33c8c46e..c8845440 100644
--- a/internlm/model/utils.py
+++ b/internlm/model/utils.py
@@ -5,9 +5,7 @@
 
 import torch
 import torch.nn.functional as F
-# from flash_attn.ops.fused_dense import FusedDenseFunc
 from flash_attn.utils.distributed import (
-    # all_gather_raw,
     all_reduce_raw,
     reduce_scatter_raw,
 )
@@ -17,6 +15,7 @@
 
 import fused_dense_lib as fused_dense_cuda
 
+from internlm.core.context import ParallelMode
 from internlm.core.context import global_context as gpc
 from internlm.utils.logger import get_logger
 
@@ -90,23 +89,53 @@ def gather_forward_split_backward(input_, parallel_mode, dim):
     return _GatherForwardSplitBackward.apply(input_, parallel_mode, dim)
 
 
-def linear_bias_wgrad_torch(my_input, grad_output, has_d_bias):
-    assert my_input.dtype == grad_output.dtype
-    grad_weight = torch.matmul(grad_output.t(), my_input)
-    grad_bias = grad_output.sum(dim=0) if has_d_bias else None
-    return grad_weight, grad_bias
+class _SplitForwardGatherBackward(torch.autograd.Function):
+    """
+    Split the input and keep only the corresponding chunk to the rank.
+
+    Args:
+        input_: input matrix.
+        parallel_mode: parallel mode.
+ dim: dimension + """ + + @staticmethod + def symbolic(input_): + return _split(input_, parallel_mode=None) + + @staticmethod + def forward(ctx, input_, parallel_mode, dim): + ctx.mode = parallel_mode + ctx.dim = dim + return _split(input_, parallel_mode, dim) + + @staticmethod + def backward(ctx, grad_output): + return _gather(grad_output, ctx.mode, ctx.dim), None, None + + +def split_forward_gather_backward(input_, parallel_mode, dim): + return _SplitForwardGatherBackward.apply(input_, parallel_mode, dim) + def all_gather_raw(input_: Tensor, process_group: ProcessGroup, async_op: bool = False, gather_dim: int = 0): world_size = torch.distributed.get_world_size(process_group) shape = list(input_.shape) shape[gather_dim] = shape[gather_dim] * world_size - # output = torch.empty(world_size * input_.shape[0], *input_.shape[1:], - # dtype=input_.dtype, device=input_.device) output = torch.empty(shape, dtype=input_.dtype, device=input_.device) handle = torch.distributed.all_gather_into_tensor(output, input_.contiguous(), group=process_group, async_op=async_op) return output, handle + +def linear_bias_wgrad_torch(my_input, grad_output, has_d_bias): + assert my_input.dtype == grad_output.dtype + grad_weight = torch.matmul(grad_output.t(), my_input) + grad_bias = grad_output.sum(dim=0) if has_d_bias else None + return grad_weight, grad_bias + + +# adpated from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/ops/fused_dense.py class FusedDenseFunc(torch.autograd.Function): @staticmethod @@ -253,6 +282,105 @@ def backward(ctx, grad_output, *args): return grad_input, grad_weight, grad_bias, None, None, None, None +class FSDPFusedDenseFunc(torch.autograd.Function): + + @staticmethod + @custom_fwd + def forward(ctx, x, weight, bias, return_residual=False, process_group=None): + + ctx.compute_weight_gradient = weight.requires_grad + ctx.return_residual = return_residual + ctx.process_group = process_group + + if torch.is_autocast_enabled(): + x = x.to(dtype=torch.get_autocast_gpu_dtype()) + total_x = x.contiguous() + + world_size = gpc.get_world_size(ParallelMode.TENSOR) + if world_size > 1: + # do all_gather for weight and bias before actual computation + total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) + if bias is not None: + total_bias, handle_bias = all_gather_raw(bias, process_group, async_op=True) + handle_bias.wait() + else: + total_bias = bias + handle_weight.wait() + else: + total_weight = weight + total_bias = bias + + if torch.is_autocast_enabled(): + total_weight = total_weight.to(dtype=torch.get_autocast_gpu_dtype()) + total_bias = total_bias.to(dtype=torch.get_autocast_gpu_dtype()) if bias is not None else None + + total_weight = total_weight.contiguous() + batch_shape, n = total_x.shape[:-1], total_x.shape[-1] + batch_dim = batch_shape.numel() + # https://github.com/pytorch/pytorch/blob/5b51849b48a7dbccd297286cc0110def4706f9e7/aten/src/ATen/native/cuda/Blas.cpp#L174 + if min(batch_dim, n, *total_weight.shape) > 65535 * 32: + raise RuntimeError('fused_dense only supports matrix dims <= 2M') + output = F.linear(total_x, total_weight, total_bias) + if ctx.compute_weight_gradient: + ctx.save_for_backward(x, weight) + else: + ctx.save_for_backward(weight) + return output if not return_residual else (output, x) + + @staticmethod + @custom_bwd + def backward(ctx, grad_output, *args): + grad_output = grad_output.contiguous() + if ctx.return_residual: + grad_input, = args + grad_input = grad_input.contiguous() + process_group = 
ctx.process_group + if ctx.compute_weight_gradient: + x, weight = ctx.saved_tensors + total_x = x + else: + weight, = ctx.saved_tensors + total_x = None + batch_shape = grad_output.shape[:-1] + batch_dim = batch_shape.numel() + grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1]) + + world_size = gpc.get_world_size(ParallelMode.TENSOR) + if world_size > 1: + # do all-gather for weight before backward + total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) + handle_weight.wait() + else: + total_weight = weight + + if ctx.needs_input_grad[0]: + if not ctx.return_residual: + grad_input = F.linear(grad_output, total_weight.t()) + else: + grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), + grad_output, total_weight) + grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) + else: + grad_input = None + + if ctx.needs_input_grad[1]: + assert ctx.compute_weight_gradient + + grad_weight, grad_bias = fused_dense_cuda.linear_bias_wgrad( + total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2] + ) + if world_size > 1: + grad_weight, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) + if grad_bias is not None: + grad_bias, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) + handle_grad_bias.wait() + handle_grad_weight.wait() + else: + grad_weight = None + grad_bias = grad_output if ctx.needs_input_grad[2] else None + return grad_input, grad_weight, grad_bias, None, None, None + + def fused_dense_func_torch( x: Tensor, weight: Tensor, @@ -271,33 +399,16 @@ def fused_dense_func_torch( return FusedDenseFuncTorch.apply(x, weight, bias, return_residual, process_group, sequence_parallel, gather_dim) -class _SplitForwardGatherBackward(torch.autograd.Function): - """ - Split the input and keep only the corresponding chuck to the rank. - - Args: - input_: input matrix. - parallel_mode: parallel mode. 
- dim: dimension - """ - - @staticmethod - def symbolic(input_): - return _split(input_, parallel_mode=None) - - @staticmethod - def forward(ctx, input_, parallel_mode, dim): - ctx.mode = parallel_mode - ctx.dim = dim - return _split(input_, parallel_mode, dim) - - @staticmethod - def backward(ctx, grad_output): - return _gather(grad_output, ctx.mode, ctx.dim), None, None - - -def split_forward_gather_backward(input_, parallel_mode, dim): - return _SplitForwardGatherBackward.apply(input_, parallel_mode, dim) +def fsdp_fused_dense_func(x: Tensor, weight: Tensor, bias: Optional[Tensor] = None, + return_residual: bool = False, process_group = None): + dtype_eligible = (x.dtype in [torch.float16, torch.bfloat16] + or (x.dtype == torch.float32 and torch.is_autocast_enabled())) + if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible: + return FSDPFusedDenseFunc.apply(x, weight, bias, return_residual, process_group) + else: + assert process_group is None + out = F.linear(x, weight, bias) + return out if not return_residual else (out, x) def try_import_RMSNorm(): From f191853bf40e7c367161b5cd7fa3e1d1c321605b Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Mon, 9 Oct 2023 20:39:57 +0800 Subject: [PATCH 012/153] fix lint --- internlm/initialize/launch.py | 10 +++-- internlm/model/linear.py | 42 ++++++----------- internlm/model/modeling_internlm.py | 20 +++++---- internlm/model/multi_head_attention.py | 62 +++++++++++++------------- internlm/model/utils.py | 62 +++++++++++++------------- internlm/utils/evaluation.py | 10 +++-- 6 files changed, 99 insertions(+), 107 deletions(-) diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py index 5bd2b73c..8c224bf8 100644 --- a/internlm/initialize/launch.py +++ b/internlm/initialize/launch.py @@ -279,12 +279,14 @@ def args_sanity_check(): assert not ( gpc.config.parallel.sequence_parallel is True and gpc.config.model.use_flash_attn is False ), "sequence parallel does not support use_flash_attn=False" - + if gpc.config.parallel["tensor"].get("mode", None) is None: gpc.config.parallel["tensor"]["mode"] = "origin_tp" - - if gpc.config.parallel["tensor"].get("mode", None) == 'fstp': - assert gpc.config.parallel.sequence_parallel is True, "when the tp_mode is fstp, the sequence_parallel should be True." + + if gpc.config.parallel["tensor"].get("mode", None) == "fstp": + assert ( + gpc.config.parallel.sequence_parallel is True + ), "when the tp_mode is fstp, the sequence_parallel should be True." # currently only interleaved pipeline scheduler with overlap can guarantee loss accuracy if hasattr(gpc.config.model, "num_chunks") and gpc.config.model.num_chunks > 1: diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 4075e9ee..8e23871a 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -9,10 +9,9 @@ from flash_attn.utils.distributed import all_reduce, reduce_scatter from torch import nn - from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc -from internlm.model.utils import Silu, fused_dense_func_torch, fsdp_fused_dense_func +from internlm.model.utils import Silu, fstp_fused_dense_func, fused_dense_func_torch class ScaleColumnParallelLinear(nn.Linear): @@ -124,7 +123,12 @@ def forward(self, x, gather_dim=0): # If not, then the input is already gathered. 
return fused_dense_func_torch( - x, self.weight, self.bias, process_group=self.process_group, sequence_parallel=self.sequence_parallel, gather_dim=gather_dim, + x, + self.weight, + self.bias, + process_group=self.process_group, + sequence_parallel=self.sequence_parallel, + gather_dim=gather_dim, ) @@ -204,31 +208,13 @@ def forward(self, x): out = self.w3(Silu(w1_o, w2_o)) return out -class FSDPLinear(ColumnParallelLinear): - - def forward(self, x): - return fsdp_fused_dense_func(x, self.weight, self.bias, process_group=self.process_group) - -class FSDPScaleLinear(ScaleColumnParallelLinear): - - def forward(self, input): # pylint: disable=W0622 - # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism: - # we do an all_gather of x before doing the matmul. - # If not, then the input is already gathered. - if self.weight_scale != 1: - weight = self.weight * self.weight_scale + (1 - self.weight_scale) * self.weight.detach() - else: - weight = self.weight - return fsdp_fused_dense_func( - input, - weight, - self.bias, - process_group=self.process_group, - ) +class FSTPLinear(ColumnParallelLinear): + def forward(self, x): + return fstp_fused_dense_func(x, self.weight, self.bias, process_group=self.process_group) -class FSDPFeedForward(nn.Module): +class FSTPFeedForward(nn.Module): """ FeedForward. @@ -259,7 +245,7 @@ def __init__( hidden_features = multiple_of * ((hidden_features + multiple_of - 1) // multiple_of) - self.w1 = FSDPLinear( + self.w1 = FSTPLinear( in_features, hidden_features, process_group, @@ -268,7 +254,7 @@ def __init__( device=device, dtype=dtype, ) - self.w2 = FSDPLinear( + self.w2 = FSTPLinear( in_features, hidden_features, process_group, @@ -277,7 +263,7 @@ def __init__( device=device, dtype=dtype, ) - self.w3 = FSDPLinear( + self.w3 = FSTPLinear( hidden_features, out_features, process_group, diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index 56a8efac..b8d7e60d 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -15,13 +15,16 @@ from internlm.model.embedding import Embedding1D from internlm.model.linear import ( FeedForward, + FSTPFeedForward, RewardModelLinear, ScaleColumnParallelLinear, - FSDPScaleLinear, - FSDPFeedForward, ) from internlm.model.multi_head_attention import MHA -from internlm.model.utils import gather_forward_split_backward, try_import_RMSNorm, split_forward_gather_backward +from internlm.model.utils import ( + gather_forward_split_backward, + split_forward_gather_backward, + try_import_RMSNorm, +) from internlm.solver.pipeline_utils import partition_uniform from internlm.utils.checkpoint import activation_checkpoint from internlm.utils.common import filter_kwargs @@ -74,7 +77,7 @@ def __init__( use_scaled_init: bool = True, use_swiglu: bool = True, use_flash_attn: bool = True, - tp_mode: str = 'origin_tp', + tp_mode: str = "origin_tp", ): super().__init__() self.checkpoint = checkpoint @@ -111,7 +114,7 @@ def __init__( self.norm2 = nn.LayerNorm(hidden_size, eps=layer_norm_epsilon) if use_swiglu: - mlp_cls = FeedForward if tp_mode == 'origin_tp' else FSDPFeedForward + mlp_cls = FeedForward if tp_mode == "origin_tp" else FSTPFeedForward self.mlp = mlp_cls( hidden_size, int(hidden_size * mlp_ratio), @@ -173,7 +176,6 @@ def reset_parameters(self): else: normal_(std=0.006 if "fc1" in name else 0.0015)(param.data) - def forward(self, hidden_states, cu_seqlens=None, indexes=None, inference_params=None, max_seqlen=None): if self.checkpoint and 
self.training: return activation_checkpoint( @@ -341,7 +343,7 @@ def __init__( use_scaled_init=use_scaled_init, use_swiglu=use_swiglu, use_flash_attn=use_flash_attn, - tp_mode = self.tp_mode, + tp_mode=self.tp_mode, ) for lid in range(num_layers) ] @@ -388,9 +390,9 @@ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N # The indexes are used to indicate the actual position IDs of each token in the packed input. indexes = indexes[0] # if the tensor parallel mode is 'fstp', the indexes should also be split in sequence dimension. - if gpc.config.parallel.sequence_parallel and self.tp_mode == 'fstp': + if gpc.config.parallel.sequence_parallel and self.tp_mode == "fstp": indexes = split_forward_gather_backward(indexes, ParallelMode.TENSOR, dim=0) - + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() if cu_seqlens is not None else None for _, block in enumerate(self.blocks): diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index 8f7a064d..287a0e2d 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -2,9 +2,10 @@ # -*- encoding: utf-8 -*- import warnings -from typing import Optional +from typing import Any, Optional, Tuple import torch +import torch.distributed as dist from einops import rearrange from flash_attn.modules.mha import ( CrossAttention, @@ -13,26 +14,25 @@ SelfAttention, _update_kv_cache, ) -from torch import nn +from torch import Tensor, nn +from torch.nn import Module from internlm.core.context import IS_TENSOR_PARALLEL, ParallelMode from internlm.core.context import global_context as gpc from internlm.model.embedding import DynamicNTKScalingRotaryEmbedding, RotaryEmbedding -from internlm.model.linear import ColumnParallelLinearTorch, RowParallelLinearTorch, FSDPLinear - -import torch - -from typing import Any, Tuple -from torch import Tensor -from torch.nn import Module - -import torch.distributed as dist +from internlm.model.linear import ( + ColumnParallelLinearTorch, + FSTPLinear, + RowParallelLinearTorch, +) +# adpated from https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/sequence/layer.py class _SeqAllToAll(torch.autograd.Function): + "sequence alltoall" @staticmethod - def forward(ctx: Any, group: dist.ProcessGroup, input: Tensor, scatter_idx: int, gather_idx: int) -> Tensor: + def forward(ctx: Any, group: dist.ProcessGroup, input_: Tensor, scatter_idx: int, gather_idx: int) -> Tensor: ctx.group = group ctx.scatter_idx = scatter_idx @@ -40,7 +40,7 @@ def forward(ctx: Any, group: dist.ProcessGroup, input: Tensor, scatter_idx: int, seq_world_size = dist.get_world_size(group) - input_list = [t.contiguous() for t in torch.tensor_split(input, seq_world_size, scatter_idx)] + input_list = [t.contiguous() for t in torch.tensor_split(input_, seq_world_size, scatter_idx)] output_list = [torch.empty_like(input_list[0]) for _ in range(seq_world_size)] # TODO Use all_to_all_single instead dist.all_to_all(output_list, input_list, group=group) @@ -51,6 +51,7 @@ def backward(ctx: Any, *grad_output: Tensor) -> Tuple[None, Tensor, None, None]: return (None, _SeqAllToAll.apply(ctx.group, *grad_output, ctx.gather_idx, ctx.scatter_idx), None, None) +# adpated from https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/sequence/layer.py class DistributedAttention(torch.nn.Module): """Initialization. 
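
The data movement behind `_SeqAllToAll` is easiest to see in a standalone
sketch. The snippet below is illustrative only: it assumes an already
initialized process group and example shapes, and the function name is ours,
not part of the patch; the patch's real implementation is the
autograd.Function shown above.

    import torch
    import torch.distributed as dist

    def seq_all_to_all(input_, group, scatter_idx, gather_idx):
        # Split the local tensor along scatter_idx, exchange one chunk with
        # every rank, then concatenate the received chunks along gather_idx.
        world_size = dist.get_world_size(group)
        inputs = [t.contiguous() for t in torch.tensor_split(input_, world_size, scatter_idx)]
        outputs = [torch.empty_like(inputs[0]) for _ in range(world_size)]
        dist.all_to_all(outputs, inputs, group=group)
        return torch.cat(outputs, dim=gather_idx)

With a 4-way sequence-parallel group and, e.g., 32 attention heads, a packed
qkv of shape [seq/4, 3, 32, head_dim] scattered on the head dimension and
gathered on the sequence dimension becomes [seq, 3, 8, head_dim]: the wrapped
local attention sees the full sequence but only a subset of heads, and the
second all-to-all after attention restores the original layout.
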
@@ -73,16 +74,16 @@ def __init__( second_gather_idx: int = 1, ) -> None: - super(DistributedAttention, self).__init__() + super().__init__() self.local_attn = local_attention self.spg = sequence_process_group self.first_scatter_idx = first_scatter_idx self.first_gather_idx = first_gather_idx self.second_scatter_idx = second_scatter_idx self.second_gather_idx = second_gather_idx - + def forward(self, qkv: Tensor, **kwargs: Any) -> Tensor: - """ forward + """forward Arguments: query (Tensor): query input to the layer @@ -93,24 +94,25 @@ def forward(self, qkv: Tensor, **kwargs: Any) -> Tensor: Returns: * output (Tensor): context output """ - # TODO Merge three alltoall calls into one + # Evaluation if qkv.ndim == 5: - # in shape: [seq/tp_size, 3, head, head_dim] + # in shape: [batch, seq/tp_size, 3, head, head_dim] qkv = _SeqAllToAll.apply(self.spg, qkv, self.first_scatter_idx + 1, self.first_gather_idx + 1) - #out shape : [seq, head/tp_size, head_dim] + # out shape : [batch, seq, head/tp_size, head_dim] context_layer = self.local_attn(qkv, **kwargs) - # in shape: [seq, head/tp_size, head_dim] - output = _SeqAllToAll.apply(self.spg, context_layer, self.second_scatter_idx + 1, self.second_gather_idx + 1) - else: - + # in shape: [batch, seq, head/tp_size, head_dim] + output = _SeqAllToAll.apply( + self.spg, context_layer, self.second_scatter_idx + 1, self.second_gather_idx + 1 + ) + else: # training # in shape: [seq/tp_size, 3, head, head_dim] qkv = _SeqAllToAll.apply(self.spg, qkv, self.first_scatter_idx, self.first_gather_idx) - #out shape : [seq, head/tp_size, head_dim] + # out shape : [seq, head/tp_size, head_dim] context_layer = self.local_attn(qkv, **kwargs) # in shape: [seq, head/tp_size, head_dim] output = _SeqAllToAll.apply(self.spg, context_layer, self.second_scatter_idx, self.second_gather_idx) - #out e.g., [s/p::h] + # out e.g., [s/p::h] return output @@ -157,7 +159,7 @@ def __init__( use_flash_attn: bool = True, device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, - tp_mode: str = 'origin_tp', + tp_mode: str = "origin_tp", ) -> None: factory_kwargs = {"device": device, "dtype": dtype} super().__init__() @@ -185,7 +187,7 @@ def __init__( self.rotary_emb = RotaryEmbedding(self.rotary_emb_dim, scale_base=rotary_emb_scale_base, device=device) # notice here should change bias=True - Wqkv_cls = ColumnParallelLinearTorch if tp_mode == 'origin_tp' else FSDPLinear + Wqkv_cls = ColumnParallelLinearTorch if tp_mode == "origin_tp" else FSTPLinear self.Wqkv = Wqkv_cls( embed_dim, 3 * embed_dim, @@ -201,12 +203,12 @@ def __init__( self.inner_cross_attn = inner_cross_attn_cls( causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout ) - if tp_mode == 'fstp': + if tp_mode == "fstp": self.inner_attn = DistributedAttention(self.inner_attn, sequence_process_group=process_group) self.inner_cross_attn = DistributedAttention(self.inner_cross_attn, sequence_process_group=process_group) # output projection always have the bias (for now) - out_proj_cls = RowParallelLinearTorch if tp_mode == 'origin_tp' else FSDPLinear + out_proj_cls = RowParallelLinearTorch if tp_mode == "origin_tp" else FSTPLinear self.out_proj = out_proj_cls( embed_dim, embed_dim, @@ -214,7 +216,6 @@ def __init__( sequence_parallel=gpc.config.parallel.sequence_parallel, **factory_kwargs, ) - # need to assign tp attribute so that internlm know it is tensor parallel module if gpc.get_world_size(ParallelMode.TENSOR) > 1: for name in ["out_proj", "Wqkv"]: @@ -311,7 +312,6 @@ def _packed_forward(self, x, 
inference_params=None, **kwargs): qkv = rearrange(qkv, "t (three h d) -> t three h d", three=3, d=self.head_dim) # total x 3 x n_head x d qkv = self.rotary_emb(qkv, **kwargs) kwargs.pop("indexes") - if inference_params is None: if gpc.config.model.dtype is torch.float32 and gpc.config.model.use_flash_attn: with torch.cuda.amp.autocast(dtype=torch.bfloat16): diff --git a/internlm/model/utils.py b/internlm/model/utils.py index c8845440..67e89ad1 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -3,18 +3,14 @@ from typing import Optional +import fused_dense_lib as fused_dense_cuda import torch import torch.nn.functional as F -from flash_attn.utils.distributed import ( - all_reduce_raw, - reduce_scatter_raw, -) +from flash_attn.utils.distributed import all_reduce_raw, reduce_scatter_raw from torch import Tensor from torch.cuda.amp import custom_bwd, custom_fwd from torch.distributed import ProcessGroup -import fused_dense_lib as fused_dense_cuda - from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc from internlm.utils.logger import get_logger @@ -123,8 +119,9 @@ def all_gather_raw(input_: Tensor, process_group: ProcessGroup, async_op: bool = shape = list(input_.shape) shape[gather_dim] = shape[gather_dim] * world_size output = torch.empty(shape, dtype=input_.dtype, device=input_.device) - handle = torch.distributed.all_gather_into_tensor(output, input_.contiguous(), - group=process_group, async_op=async_op) + handle = torch.distributed.all_gather_into_tensor( + output, input_.contiguous(), group=process_group, async_op=async_op + ) return output, handle @@ -137,11 +134,11 @@ def linear_bias_wgrad_torch(my_input, grad_output, has_d_bias): # adpated from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/ops/fused_dense.py class FusedDenseFunc(torch.autograd.Function): + "tp fused dense function" @staticmethod @custom_fwd - def forward(ctx, x, weight, bias, return_residual=False, process_group=None, - sequence_parallel=True, gather_dim=0): + def forward(ctx, x, weight, bias, return_residual=False, process_group=None, sequence_parallel=True, gather_dim=0): """ If process_group is not None and sequence_parallel=True, we're doing Tensor Parallel with sequence parallelism: we do an all_gather_raw of x before doing the matmul. 
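
The contract described in this docstring can be written out as a plain,
unfused reference. This is a sketch for exposition only, reusing the
`all_gather_raw` defined earlier in this file and assuming a column-sharded
weight with illustrative shapes; the autograd.Function itself additionally
overlaps the gather with other work and implements the matching backward.

    import torch.nn.functional as F

    def sp_linear_reference(x_shard, weight_shard, bias_shard, process_group):
        # x_shard: [seq_len/P, ..., in_features] on each of the P ranks.
        total_x, handle = all_gather_raw(x_shard, process_group, async_op=True, gather_dim=0)
        handle.wait()
        # weight_shard: [out_features/P, in_features], so the output stays
        # sharded along the feature dim: [seq_len, ..., out_features/P].
        return F.linear(total_x, weight_shard, bias_shard)

In the backward pass the roles flip: the gradient w.r.t. the gathered input
is reduce-scattered back into per-rank sequence shards, which is why the
backward below picks `reduce_scatter_raw` when `sequence_parallel` is set and
`all_reduce_raw` otherwise.
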
@@ -171,7 +168,7 @@ def forward(ctx, x, weight, bias, return_residual=False, process_group=None, batch_dim = batch_shape.numel() # https://github.com/pytorch/pytorch/blob/5b51849b48a7dbccd297286cc0110def4706f9e7/aten/src/ATen/native/cuda/Blas.cpp#L174 if min(batch_dim, n, *weight.shape) > 65535 * 32: - raise RuntimeError('fused_dense only supports matrix dims <= 2M') + raise RuntimeError("fused_dense only supports matrix dims <= 2M") output = F.linear(total_x, weight, bias) if ctx.compute_weight_gradient: ctx.save_for_backward(x, weight) @@ -184,12 +181,12 @@ def forward(ctx, x, weight, bias, return_residual=False, process_group=None, def backward(ctx, grad_output, *args): grad_output = grad_output.contiguous() if ctx.return_residual: - grad_input, = args + (grad_input,) = args grad_input = grad_input.contiguous() process_group = ctx.process_group sequence_parallel = ctx.sequence_parallel gather_dim = ctx.gather_dim - + if ctx.compute_weight_gradient: x, weight = ctx.saved_tensors if process_group is not None and sequence_parallel: @@ -197,7 +194,7 @@ def backward(ctx, grad_output, *args): else: total_x = x else: - weight, = ctx.saved_tensors + (weight,) = ctx.saved_tensors total_x = None batch_shape = grad_output.shape[:-1] batch_dim = batch_shape.numel() @@ -206,8 +203,7 @@ def backward(ctx, grad_output, *args): if not ctx.return_residual: grad_input = F.linear(grad_output, weight.t()) else: - grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), - grad_output, weight) + grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), grad_output, weight) grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) if process_group is not None: reduce_fn = reduce_scatter_raw if sequence_parallel else all_reduce_raw @@ -282,7 +278,8 @@ def backward(ctx, grad_output, *args): return grad_input, grad_weight, grad_bias, None, None, None, None -class FSDPFusedDenseFunc(torch.autograd.Function): +class FSTPFusedDenseFunc(torch.autograd.Function): + "FSTP fused dense function" @staticmethod @custom_fwd @@ -295,7 +292,7 @@ def forward(ctx, x, weight, bias, return_residual=False, process_group=None): if torch.is_autocast_enabled(): x = x.to(dtype=torch.get_autocast_gpu_dtype()) total_x = x.contiguous() - + world_size = gpc.get_world_size(ParallelMode.TENSOR) if world_size > 1: # do all_gather for weight and bias before actual computation @@ -313,13 +310,13 @@ def forward(ctx, x, weight, bias, return_residual=False, process_group=None): if torch.is_autocast_enabled(): total_weight = total_weight.to(dtype=torch.get_autocast_gpu_dtype()) total_bias = total_bias.to(dtype=torch.get_autocast_gpu_dtype()) if bias is not None else None - + total_weight = total_weight.contiguous() batch_shape, n = total_x.shape[:-1], total_x.shape[-1] batch_dim = batch_shape.numel() # https://github.com/pytorch/pytorch/blob/5b51849b48a7dbccd297286cc0110def4706f9e7/aten/src/ATen/native/cuda/Blas.cpp#L174 if min(batch_dim, n, *total_weight.shape) > 65535 * 32: - raise RuntimeError('fused_dense only supports matrix dims <= 2M') + raise RuntimeError("fused_dense only supports matrix dims <= 2M") output = F.linear(total_x, total_weight, total_bias) if ctx.compute_weight_gradient: ctx.save_for_backward(x, weight) @@ -332,19 +329,19 @@ def forward(ctx, x, weight, bias, return_residual=False, process_group=None): def backward(ctx, grad_output, *args): grad_output = grad_output.contiguous() if ctx.return_residual: - grad_input, = args + (grad_input,) = args grad_input = 
grad_input.contiguous() process_group = ctx.process_group if ctx.compute_weight_gradient: x, weight = ctx.saved_tensors total_x = x else: - weight, = ctx.saved_tensors + (weight,) = ctx.saved_tensors total_x = None batch_shape = grad_output.shape[:-1] batch_dim = batch_shape.numel() grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1]) - + world_size = gpc.get_world_size(ParallelMode.TENSOR) if world_size > 1: # do all-gather for weight before backward @@ -352,13 +349,12 @@ def backward(ctx, grad_output, *args): handle_weight.wait() else: total_weight = weight - + if ctx.needs_input_grad[0]: if not ctx.return_residual: grad_input = F.linear(grad_output, total_weight.t()) else: - grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), - grad_output, total_weight) + grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), grad_output, total_weight) grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) else: grad_input = None @@ -372,7 +368,7 @@ def backward(ctx, grad_output, *args): if world_size > 1: grad_weight, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) if grad_bias is not None: - grad_bias, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) + grad_bias, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) handle_grad_bias.wait() handle_grad_weight.wait() else: @@ -399,12 +395,14 @@ def fused_dense_func_torch( return FusedDenseFuncTorch.apply(x, weight, bias, return_residual, process_group, sequence_parallel, gather_dim) -def fsdp_fused_dense_func(x: Tensor, weight: Tensor, bias: Optional[Tensor] = None, - return_residual: bool = False, process_group = None): - dtype_eligible = (x.dtype in [torch.float16, torch.bfloat16] - or (x.dtype == torch.float32 and torch.is_autocast_enabled())) +def fstp_fused_dense_func( + x: Tensor, weight: Tensor, bias: Optional[Tensor] = None, return_residual: bool = False, process_group=None +): + dtype_eligible = x.dtype in [torch.float16, torch.bfloat16] or ( + x.dtype == torch.float32 and torch.is_autocast_enabled() + ) if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible: - return FSDPFusedDenseFunc.apply(x, weight, bias, return_residual, process_group) + return FSTPFusedDenseFunc.apply(x, weight, bias, return_residual, process_group) else: assert process_group is None out = F.linear(x, weight, bias) diff --git a/internlm/utils/evaluation.py b/internlm/utils/evaluation.py index 148d19df..968a1db1 100644 --- a/internlm/utils/evaluation.py +++ b/internlm/utils/evaluation.py @@ -54,7 +54,7 @@ def switch_evaluation_pipeline_scheduler(trainer, num_microbatches, tensor_shape def switch_sequence_parallel_mode(): prev_mode = gpc.config.parallel.sequence_parallel try: - if gpc.config.parallel["tensor"]["mode"] == 'fstp': + if gpc.config.parallel["tensor"]["mode"] == "fstp": gpc.config.parallel.sequence_parallel = True else: gpc.config.parallel.sequence_parallel = False @@ -106,10 +106,14 @@ def evaluate_on_val_dls( total_val_bsz = len(batch[1]) assert total_val_bsz % data_cfg.micro_bsz == 0 num_microbatches = total_val_bsz // data_cfg.micro_bsz - if gpc.config.parallel['tensor']['mode'] == 'fstp': + if gpc.config.parallel["tensor"]["mode"] == "fstp": sequence_world_size = gpc.get_world_size(ParallelMode.TENSOR) tensor_shape = torch.Size( - [data_cfg.micro_bsz, batch[0]["input_ids"].shape[1] // sequence_world_size, gpc.config.HIDDEN_SIZE] + [ + data_cfg.micro_bsz, + 
batch[0]["input_ids"].shape[1] // sequence_world_size, + gpc.config.HIDDEN_SIZE, + ] ) else: tensor_shape = torch.Size( From a8dea6313fe85e6e762177c34b7786657fca89b1 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Mon, 9 Oct 2023 21:33:26 +0800 Subject: [PATCH 013/153] fix the ci incompatible in config --- internlm/initialize/launch.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py index 6a094e77..e5bd8610 100644 --- a/internlm/initialize/launch.py +++ b/internlm/initialize/launch.py @@ -305,9 +305,12 @@ def args_sanity_check(): gpc.config.parallel.sequence_parallel is True and gpc.config.model.use_flash_attn is False ), "sequence parallel does not support use_flash_attn=False" + if isinstance (gpc.config.parallel["tensor"], int): + gpc.config.parallel["tensor"] = dict(size=gpc.config.parallel["tensor"], mode='origin_tp') + if gpc.config.parallel["tensor"].get("mode", None) is None: gpc.config.parallel["tensor"]["mode"] = "origin_tp" - + if gpc.config.parallel["tensor"].get("mode", None) == "fstp": assert ( gpc.config.parallel.sequence_parallel is True From 1b7935dd98d7879ae7effd1723ffa70e32869c5e Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Mon, 9 Oct 2023 21:35:52 +0800 Subject: [PATCH 014/153] merge upstream develop --- internlm/model/multi_head_attention.py | 1 + 1 file changed, 1 insertion(+) diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index 287a0e2d..49578d77 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -6,6 +6,7 @@ import torch import torch.distributed as dist +import torch.nn.functional as F from einops import rearrange from flash_attn.modules.mha import ( CrossAttention, From db637542a614468365c2a9a2e2f6a720c158f11f Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Mon, 9 Oct 2023 22:19:21 +0800 Subject: [PATCH 015/153] fix lint --- internlm/initialize/launch.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py index e5bd8610..80611fee 100644 --- a/internlm/initialize/launch.py +++ b/internlm/initialize/launch.py @@ -305,12 +305,12 @@ def args_sanity_check(): gpc.config.parallel.sequence_parallel is True and gpc.config.model.use_flash_attn is False ), "sequence parallel does not support use_flash_attn=False" - if isinstance (gpc.config.parallel["tensor"], int): - gpc.config.parallel["tensor"] = dict(size=gpc.config.parallel["tensor"], mode='origin_tp') + if isinstance(gpc.config.parallel["tensor"], int): + gpc.config.parallel["tensor"] = dict(size=gpc.config.parallel["tensor"], mode="origin_tp") if gpc.config.parallel["tensor"].get("mode", None) is None: gpc.config.parallel["tensor"]["mode"] = "origin_tp" - + if gpc.config.parallel["tensor"].get("mode", None) == "fstp": assert ( gpc.config.parallel.sequence_parallel is True From 5fb6d99c112dbbc61bd95977dd13bd112a3e03f0 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Tue, 10 Oct 2023 11:45:11 +0800 Subject: [PATCH 016/153] feat(configs/7B_sft.py): update parallel config comment --- configs/7B_sft.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index d8557007..dee2f5eb 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -142,20 +142,27 @@ num_chunks=1, # if num_chunks > 1, interleaved 
pipeline scheduler is used. ) """ -zero1 parallel: - 1. if zero1 <= 0, The size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - 2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters. - 3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size. +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. pipeline parallel (dict): 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler. -tensor parallel: tensor parallel size, usually the number of GPUs per node. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. """ parallel = dict( zero1=dict(size=8, fsdp=False), - tensor=dict(size=1, mode='origin_tp'), # the mode should be 'origin_tp' or 'fstp'. if the mode is 'fstp', the sequence_parallel should be True + tensor=dict(size=1, mode="origin_tp"), pipeline=dict(size=1, interleaved_overlap=True), sequence_parallel=False, ) From 0fac845c3664bef850b8762526a55a1da9467206 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Tue, 10 Oct 2023 17:06:13 +0800 Subject: [PATCH 017/153] overlap grad_input computation and grad_weight reduce_scatter --- configs/7B_sft.py | 8 ++++---- internlm/model/modeling_internlm.py | 5 +++-- internlm/model/utils.py | 29 +++++++++++++++++------------ 3 files changed, 24 insertions(+), 18 deletions(-) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index d8557007..ac491215 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -1,7 +1,7 @@ JOB_NAME = "7b_train" DO_ALERT = False -SEQ_LEN = 2048 +SEQ_LEN = 4096 HIDDEN_SIZE = 4096 NUM_ATTENTION_HEAD = 32 MLP_RATIO = 8 / 3 @@ -154,10 +154,10 @@ tensor parallel: tensor parallel size, usually the number of GPUs per node. """ parallel = dict( - zero1=dict(size=8, fsdp=False), - tensor=dict(size=1, mode='origin_tp'), # the mode should be 'origin_tp' or 'fstp'. if the mode is 'fstp', the sequence_parallel should be True + zero1=dict(size=1, fsdp=False), + tensor=dict(size=8, mode='fstp'), # the mode should be 'origin_tp' or 'fstp'. 
if the mode is 'fstp', the sequence_parallel should be True pipeline=dict(size=1, interleaved_overlap=True), - sequence_parallel=False, + sequence_parallel=True, ) cudnn_deterministic = False diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index b8d7e60d..228dbd34 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -407,10 +407,11 @@ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N if hasattr(self, "norm"): hidden_states = self.norm(hidden_states.float()) if hasattr(self, "head"): + # Evaluation if hidden_states.ndim == 3: hidden_states = self.head(hidden_states, gather_dim=1) - else: - hidden_states = self.head(hidden_states) + else: # Training + hidden_states = self.head(hidden_states, gather_dim=0) if not self.parallel_output: hidden_states = gather_forward_split_backward(hidden_states, ParallelMode.TENSOR, dim=-1) diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 67e89ad1..3885488b 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -349,16 +349,8 @@ def backward(ctx, grad_output, *args): handle_weight.wait() else: total_weight = weight - - if ctx.needs_input_grad[0]: - if not ctx.return_residual: - grad_input = F.linear(grad_output, total_weight.t()) - else: - grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), grad_output, total_weight) - grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) - else: - grad_input = None - + + # compute weight grad if ctx.needs_input_grad[1]: assert ctx.compute_weight_gradient @@ -369,11 +361,24 @@ def backward(ctx, grad_output, *args): grad_weight, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) if grad_bias is not None: grad_bias, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) - handle_grad_bias.wait() - handle_grad_weight.wait() else: grad_weight = None grad_bias = grad_output if ctx.needs_input_grad[2] else None + + if ctx.needs_input_grad[0]: + if not ctx.return_residual: + grad_input = F.linear(grad_output, total_weight.t()) + else: + grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), grad_output, total_weight) + grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) + else: + grad_input = None + + if ctx.needs_input_grad[1]: + if world_size > 1: + handle_grad_weight.wait() + if grad_bias is not None: + handle_grad_bias.wait() return grad_input, grad_weight, grad_bias, None, None, None From 792b066f151c438a6ba653a8aafe9207a459907a Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Wed, 11 Oct 2023 10:57:12 +0800 Subject: [PATCH 018/153] communication overlap --- configs/7B_sft.py | 2 +- internlm/model/linear.py | 74 +++++++++++++++++++++++++++++++++++++++- internlm/model/utils.py | 6 ++-- 3 files changed, 78 insertions(+), 4 deletions(-) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 814966b1..e8be1677 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -162,7 +162,7 @@ """ parallel = dict( zero1=dict(size=1, fsdp=False), - tensor=dict(size=8, mode='fstp'), # the mode should be 'origin_tp' or 'fstp'. if the mode is 'fstp', the sequence_parallel should be True + tensor=dict(size=2, mode='fstp'), # the mode should be 'origin_tp' or 'fstp'. 
if the mode is 'fstp', the sequence_parallel should be True
     pipeline=dict(size=1, interleaved_overlap=True),
     sequence_parallel=True,
 )
 
 cudnn_deterministic = False
diff --git a/internlm/model/linear.py b/internlm/model/linear.py
index 8e23871a..36f64f33 100644
--- a/internlm/model/linear.py
+++ b/internlm/model/linear.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-
 
-from typing import Optional
+from typing import Optional, Union, Any
 
 import torch
 import torch.nn.functional as F
@@ -11,7 +11,8 @@
 
 from internlm.core.context import ParallelMode
 from internlm.core.context import global_context as gpc
-from internlm.model.utils import Silu, fstp_fused_dense_func, fused_dense_func_torch
+from internlm.core.naive_amp import NaiveAMPModel
+from internlm.model.utils import Silu, fstp_fused_dense_func, fused_dense_func_torch, all_gather_raw
 
 
 class ScaleColumnParallelLinear(nn.Linear):
@@ -211,6 +212,7 @@ def forward(self, x):
 
 class FSTPLinear(ColumnParallelLinear):
     def forward(self, x):
+        import pdb; pdb.set_trace()
         return fstp_fused_dense_func(x, self.weight, self.bias, process_group=self.process_group)
 
 
@@ -278,3 +280,74 @@ def forward(self, x):
         w2_o = self.w2(x)
         out = self.w3(F.silu(w1_o) * w2_o)
         return out
+
+class FSTPAllGatherSyncHandler:
+    """
+    All-gather handler for overlapping the all-gather in adjacent FSTP linear.
+    """
+
+    def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> None:
+
+        self.process_group = process_group
+        self.FSTP_modules = []
+        self.module_name = ["Wqkv", "out_proj", "w1", "w2", "w3"]
+        self.FSTP_global_weights = dict()  # key: FSTP module; value: module global weight for forward
+        self.module_handler = dict()  # key: FSTP module; value: all-gather handler
+        self.module_block = dict()  # key: FSTP module; value: transformer block index
+        self.block_module = dict()  # key: transformer block index; value: {name_index: FSTP module}
+        self.module_name_index = dict()  # key: FSTP module; value: the index of the name in self.module_name
+
+        # just want to share same for loop for ModuleList and Module
+        if not isinstance(model, nn.ModuleList):
+            model = [model]
+
+        for _chunk in model:
+            if isinstance(_chunk, NaiveAMPModel):
+                _chunk = _chunk.model
+
+            for _, children in _chunk.named_children():
+                if isinstance(children, nn.ModuleList):
+                    for _, block in enumerate(children):
+                        index = 0
+                        sub_modules = list(block.children())
+                        if len(sub_modules) > 0:
+                            for name, child in block.named_children():
+                                if isinstance(child, FSTPLinear):
+                                    self.FSTP_modules.append(child)
+                                    self.module_block[child] = _
+                                    self.block_module[_][index] = child
+                                    self.module_name_index[child] = index
+                                    index = index + 1
+                        else:
+                            continue
+
+
+    def _register_sync_parameters_hook(self) -> None:
+        """
+        register pre_forward_hook and pre_backward_hook for FSTPLinear.
+ """ + + def _hook(module: nn.Module): + block_index = self.module_block[module] + name_index = self.module_name_index[module] + if name_index == 0: + next_module = self.block_module[block_index][name_index + 1] + self.FSTP_global_weights, weights_handler = all_gather_raw(next_module.weight, self.process_group, async_op=True) + self.module_handler[next_module] = weights_handler + else: + handler = self.module_handler[module] + handler.wait() + if name_index != 4: + next_module = self.block_module[block_index][name_index + 1] + self.FSTP_global_weights, weights_handler = all_gather_raw(next_module.weight, self.process_group, async_op=True) + self.module_handler[next_module] = weights_handler + + def _pre_forward_hook(module: nn.Module, inputs: Any): + _hook(module) + + def _pre_backward_hook(module: nn.Module, grad_input, grad_output): + _hook(module) + + for module in self.FSTP_modules: + module.register_forward_pre_hook(_pre_forward_hook) + module.register_backward_pre_hook(_pre_backward_hook) \ No newline at end of file diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 3885488b..5768f000 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- -from typing import Optional +from typing import Any, Optional, Union import fused_dense_lib as fused_dense_cuda import torch @@ -379,7 +379,7 @@ def backward(ctx, grad_output, *args): handle_grad_weight.wait() if grad_bias is not None: handle_grad_bias.wait() - return grad_input, grad_weight, grad_bias, None, None, None + return grad_input, grad_weight, grad_bias, None, None def fused_dense_func_torch( @@ -453,3 +453,5 @@ def Silu(w1_o, w2_o): Silu = torch.jit.script(Silu) + + From 5fd5a8a32b0e045a499e8beb3f0438cb0bd49408 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Wed, 11 Oct 2023 17:36:41 +0800 Subject: [PATCH 019/153] support fine-grained overlap --- configs/7B_sft.py | 2 +- internlm/model/linear.py | 78 ++++++++++++++++++-------- internlm/model/multi_head_attention.py | 3 +- internlm/model/utils.py | 35 +++++++----- internlm/train/training_internlm.py | 8 ++- 5 files changed, 86 insertions(+), 40 deletions(-) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index e8be1677..814966b1 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -162,7 +162,7 @@ """ parallel = dict( zero1=dict(size=1, fsdp=False), - tensor=dict(size=2, mode='fstp'), # the mode should be 'origin_tp' or 'fstp'. if the mode is 'fstp', the sequence_parallel should be True + tensor=dict(size=8, mode='fstp'), # the mode should be 'origin_tp' or 'fstp'. 
if the mode is 'fstp', the sequence_parallel should be True pipeline=dict(size=1, interleaved_overlap=True), sequence_parallel=True, ) diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 36f64f33..42bd9f03 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -11,7 +11,8 @@ from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc -from internlm.model.utils import Silu, fstp_fused_dense_func, fused_dense_func_torch +from internlm.core.naive_amp import NaiveAMPModel +from internlm.model.utils import Silu, fstp_fused_dense_func, fused_dense_func_torch, all_gather_raw class ScaleColumnParallelLinear(nn.Linear): @@ -211,8 +212,7 @@ def forward(self, x): class FSTPLinear(ColumnParallelLinear): def forward(self, x): - import pdb; pdb.set_trace() - return fstp_fused_dense_func(x, self.weight, self.bias, process_group=self.process_group) + return fstp_fused_dense_func(x, self.weight, self.bias, process_group=self.process_group, module=self, handler=gpc.config.fstp_handler) class FSTPFeedForward(nn.Module): @@ -287,6 +287,7 @@ class FSTPAllGatherSyncHandler: def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> None: + # import pdb; pdb.set_trace() self.process_group = process_group self.FSTP_modules = [] self.module_name = ["Wqkv", "out_proj", "w1", "w2", "w3"] @@ -306,19 +307,21 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non for _, children in _chunk.named_children(): if isinstance(children, nn.ModuleList): - for _, block in enumerate(children): + for idx, block in enumerate(children): index = 0 - sub_modules = list(block.children()) - if len(sub_modules) > 0: - for name, child in block.named_children(): - if isinstance(child, FSTPLinear): - self.FSTP_modules.append(child) - self.module_block[child] = _ - self.block_module[_][index] = child - self.module_name_index[child] = index - index = index + 1 - else: - continue + self.block_module[idx] = {} + for _, sub in block.named_children(): + sub_modules = list(sub.children()) + if len(sub_modules) > 0: + for name, child in sub.named_children(): + if isinstance(child, FSTPLinear): + self.FSTP_modules.append(child) + self.module_block[child] = idx + self.block_module[idx][index] = child + self.module_name_index[child] = index + index = index + 1 + else: + continue def _register_sync_parameters_hook(self) -> None: @@ -326,27 +329,58 @@ def _register_sync_parameters_hook(self) -> None: register pre_forward_hook and pre_backward_hook for FSTPLinear. 
""" - def _hook(module: nn.Module): + def _pre_forward_hook(module: nn.Module, inputs: Any): block_index = self.module_block[module] name_index = self.module_name_index[module] if name_index == 0: + total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) + weight_handler.wait() + self.FSTP_global_weights[module] = total_weight + + # start the all-gather for next module next_module = self.block_module[block_index][name_index + 1] - self.FSTP_global_weights, weights_handler = all_gather_raw(next_module.weight, self.process_group, async_op=True) + self.FSTP_global_weights[next_module], weights_handler = all_gather_raw(next_module.weight, self.process_group, async_op=True) self.module_handler[next_module] = weights_handler else: handler = self.module_handler[module] handler.wait() if name_index != 4: next_module = self.block_module[block_index][name_index + 1] - self.FSTP_global_weights, weights_handler = all_gather_raw(next_module.weight, self.process_group, async_op=True) + self.FSTP_global_weights[next_module], weights_handler = all_gather_raw(next_module.weight, self.process_group, async_op=True) self.module_handler[next_module] = weights_handler - def _pre_forward_hook(module: nn.Module, inputs: Any): - _hook(module) + def _post_forward_hook(module: nn.Module, input, output): + del self.FSTP_global_weights[module] + del self.module_handler[module] def _pre_backward_hook(module: nn.Module, grad_input, grad_output): - _hook(module) + block_index = self.module_block[module] + name_index = self.module_name_index[module] + if name_index == 4: + total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) + weight_handler.wait() + self.FSTP_global_weights[module] = total_weight + + # start the all-gather for next module + next_module = self.block_module[block_index][name_index - 1] + self.FSTP_global_weights[next_module], weights_handler = all_gather_raw(next_module.weight, self.process_group, async_op=True) + self.module_handler[next_module] = weights_handler + else: + handler = self.module_handler[module] + handler.wait() + if name_index != 0: + next_module = self.block_module[block_index][name_index - 1] + self.FSTP_global_weights[next_module], weights_handler = all_gather_raw(next_module.weight, self.process_group, async_op=True) + self.module_handler[next_module] = weights_handler + + def _post_backward_hook(module, grad_input, grad_output): + del self.FSTP_global_weights[module] for module in self.FSTP_modules: + # import pdb; pdb.set_trace() module.register_forward_pre_hook(_pre_forward_hook) - module.register_backward_pre_hook(_pre_backward_hook) \ No newline at end of file + module.register_forward_hook(_post_forward_hook) + # module.register_backward_pre_hook(_pre_backward_hook) + # module.register_backward_hook(_post_backward_hook) + module.register_module_full_backward_pre_hook(_pre_backward_hook) + \ No newline at end of file diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index 436caf77..1db98d7e 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -210,7 +210,7 @@ def __init__( embed_dim, 3 * embed_dim, process_group, - bias=True, + bias=False, sequence_parallel=gpc.config.parallel.sequence_parallel, **factory_kwargs, ) # according to https://spaces.ac.cn/archives/9577 @@ -231,6 +231,7 @@ def __init__( embed_dim, embed_dim, process_group, + bias=False, sequence_parallel=gpc.config.parallel.sequence_parallel, 
**factory_kwargs, ) diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 5768f000..50b9bbd7 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -283,11 +283,13 @@ class FSTPFusedDenseFunc(torch.autograd.Function): @staticmethod @custom_fwd - def forward(ctx, x, weight, bias, return_residual=False, process_group=None): + def forward(ctx, x, weight, bias, return_residual=False, process_group=None, module=None, all_gather_handler=None): ctx.compute_weight_gradient = weight.requires_grad ctx.return_residual = return_residual ctx.process_group = process_group + ctx.all_gather_handler = all_gather_handler + ctx.module = module if torch.is_autocast_enabled(): x = x.to(dtype=torch.get_autocast_gpu_dtype()) @@ -295,14 +297,16 @@ def forward(ctx, x, weight, bias, return_residual=False, process_group=None): world_size = gpc.get_world_size(ParallelMode.TENSOR) if world_size > 1: - # do all_gather for weight and bias before actual computation - total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) - if bias is not None: - total_bias, handle_bias = all_gather_raw(bias, process_group, async_op=True) - handle_bias.wait() - else: - total_bias = bias - handle_weight.wait() + total_weight = all_gather_handler.FSTP_global_weights[module] + total_bias = bias + # # do all_gather for weight and bias before actual computation + # total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) + # if bias is not None: + # total_bias, handle_bias = all_gather_raw(bias, process_group, async_op=True) + # handle_bias.wait() + # else: + # total_bias = bias + # handle_weight.wait() else: total_weight = weight total_bias = bias @@ -332,6 +336,8 @@ def backward(ctx, grad_output, *args): (grad_input,) = args grad_input = grad_input.contiguous() process_group = ctx.process_group + all_gather_handler = ctx.all_gather_handler + module = ctx.module if ctx.compute_weight_gradient: x, weight = ctx.saved_tensors total_x = x @@ -345,8 +351,9 @@ def backward(ctx, grad_output, *args): world_size = gpc.get_world_size(ParallelMode.TENSOR) if world_size > 1: # do all-gather for weight before backward - total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) - handle_weight.wait() + # total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) + # handle_weight.wait() + total_weight = all_gather_handler.FSTP_global_weights[module] else: total_weight = weight @@ -379,7 +386,7 @@ def backward(ctx, grad_output, *args): handle_grad_weight.wait() if grad_bias is not None: handle_grad_bias.wait() - return grad_input, grad_weight, grad_bias, None, None + return grad_input, grad_weight, grad_bias, None, None, None, None def fused_dense_func_torch( @@ -401,13 +408,13 @@ def fused_dense_func_torch( def fstp_fused_dense_func( - x: Tensor, weight: Tensor, bias: Optional[Tensor] = None, return_residual: bool = False, process_group=None + x: Tensor, weight: Tensor, bias: Optional[Tensor] = None, return_residual: bool = False, process_group=None, module=None, handler=None ): dtype_eligible = x.dtype in [torch.float16, torch.bfloat16] or ( x.dtype == torch.float32 and torch.is_autocast_enabled() ) if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible: - return FSTPFusedDenseFunc.apply(x, weight, bias, return_residual, process_group) + return FSTPFusedDenseFunc.apply(x, weight, bias, return_residual, process_group, module, handler) else: assert process_group is None out = F.linear(x, weight, 
bias) diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 7af58ddf..5deb0233 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -39,6 +39,7 @@ FeedForward, RewardModelLinear, ScaleColumnParallelLinear, + FSTPAllGatherSyncHandler, ) from internlm.model.multi_head_attention import MHA from internlm.model.utils import try_import_RMSNorm @@ -106,10 +107,13 @@ def initialize_model(): # if fsdp enabled, wrap the model model = wrap_FSDP_model(model) - + + if gpc.config.parallel["tensor"]["mode"] == "fstp": + handler = FSTPAllGatherSyncHandler(model, gpc.get_group(ParallelMode.TENSOR)) + handler._register_sync_parameters_hook() + gpc.config.fstp_handler = handler return model - def wrap_FSDP_model(model: Union[nn.Module, nn.ModuleList]): if gpc.config.parallel.zero1.fsdp: # set wrap_policy for fsdp wrap From d0b1346993a493f3c7b5d1b109eba41731711002 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Thu, 12 Oct 2023 19:42:08 +0800 Subject: [PATCH 020/153] feat(model/linear.py): support block allgather overlap --- internlm/model/linear.py | 207 ++++++++++++++++++++++++---- internlm/model/utils.py | 45 +++--- internlm/train/training_internlm.py | 8 +- 3 files changed, 212 insertions(+), 48 deletions(-) diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 42bd9f03..3e37863d 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- -from typing import Optional, Union, Any +from typing import Any, Optional, Union import torch import torch.nn.functional as F @@ -12,7 +12,12 @@ from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc from internlm.core.naive_amp import NaiveAMPModel -from internlm.model.utils import Silu, fstp_fused_dense_func, fused_dense_func_torch, all_gather_raw +from internlm.model.utils import ( + Silu, + all_gather_raw, + fstp_fused_dense_func, + fused_dense_func_torch, +) class ScaleColumnParallelLinear(nn.Linear): @@ -212,7 +217,9 @@ def forward(self, x): class FSTPLinear(ColumnParallelLinear): def forward(self, x): - return fstp_fused_dense_func(x, self.weight, self.bias, process_group=self.process_group, module=self, handler=gpc.config.fstp_handler) + return fstp_fused_dense_func( + x, self.weight, self.bias, process_group=self.process_group, module=self, handler=gpc.config.fstp_handler + ) class FSTPFeedForward(nn.Module): @@ -280,31 +287,31 @@ def forward(self, x): out = self.w3(F.silu(w1_o) * w2_o) return out + class FSTPAllGatherSyncHandler: """ All-gather handler for overlapping the all-gather in adjcent FSTP linear. 
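
Conceptually, every FSTP linear computes against a transient full weight: gather the dim-0 shards, run one matmul, drop the gathered copy. A simplified picture of a single step with the prefetch machinery stripped away (a sketch, not the actual forward):

import torch.nn.functional as F
from internlm.model.utils import all_gather_raw  # as imported by this patch

def fstp_linear_step(x, sharded_weight, bias, process_group):
    total_weight, handle = all_gather_raw(sharded_weight, process_group, async_op=True)
    handle.wait()  # with hook-driven prefetch, this wait overlaps earlier compute
    out = F.linear(x, total_weight, bias)
    del total_weight  # the full weight lives only for the duration of this matmul
    return out
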
""" - + def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> None: - # import pdb; pdb.set_trace() self.process_group = process_group self.FSTP_modules = [] self.module_name = ["Wqkv", "out_proj", "w1", "w2", "w3"] - self.FSTP_global_weights = dict() # key: FSTP module; value: module global weight for forward - self.module_handler = dict() # key: FSTP module; value: all-gather handler - self.module_block = dict() # key: FSTP module; value: transformer block index - self.block_module = dict() # key: transformer block index; value: {name_index: FSTP module} - self.module_name_index = dict() # key: FSTP module; value: the name in index in self.module_name - + self.FSTP_global_weights = dict() # key: FSTP module; value: module global weight for forward + self.module_handler = dict() # key: FSTP module; value: all-gather handler + self.module_block = dict() # key: FSTP module; value: transformer block index + self.block_module = dict() # key: transformer block index; value: {name_index: FSTP module} + self.module_name_index = dict() # key: FSTP module; value: the name in index in self.module_name + # just want to share same for loop for ModuleList and Module if not isinstance(model, nn.ModuleList): model = [model] - + for _chunk in model: if isinstance(_chunk, NaiveAMPModel): _chunk = _chunk.model - + for _, children in _chunk.named_children(): if isinstance(children, nn.ModuleList): for idx, block in enumerate(children): @@ -322,13 +329,12 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non index = index + 1 else: continue - - + def _register_sync_parameters_hook(self) -> None: """ register pre_forward_hook and pre_backward_hook for FSTPLinear. """ - + def _pre_forward_hook(module: nn.Module, inputs: Any): block_index = self.module_block[module] name_index = self.module_name_index[module] @@ -336,19 +342,23 @@ def _pre_forward_hook(module: nn.Module, inputs: Any): total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) weight_handler.wait() self.FSTP_global_weights[module] = total_weight - + # start the all-gather for next module next_module = self.block_module[block_index][name_index + 1] - self.FSTP_global_weights[next_module], weights_handler = all_gather_raw(next_module.weight, self.process_group, async_op=True) + self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( + next_module.weight, self.process_group, async_op=True + ) self.module_handler[next_module] = weights_handler else: handler = self.module_handler[module] handler.wait() if name_index != 4: next_module = self.block_module[block_index][name_index + 1] - self.FSTP_global_weights[next_module], weights_handler = all_gather_raw(next_module.weight, self.process_group, async_op=True) + self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( + next_module.weight, self.process_group, async_op=True + ) self.module_handler[next_module] = weights_handler - + def _post_forward_hook(module: nn.Module, input, output): del self.FSTP_global_weights[module] del self.module_handler[module] @@ -360,22 +370,26 @@ def _pre_backward_hook(module: nn.Module, grad_input, grad_output): total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) weight_handler.wait() self.FSTP_global_weights[module] = total_weight - + # start the all-gather for next module next_module = self.block_module[block_index][name_index - 1] - self.FSTP_global_weights[next_module], weights_handler = 
all_gather_raw(next_module.weight, self.process_group, async_op=True) + self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( + next_module.weight, self.process_group, async_op=True + ) self.module_handler[next_module] = weights_handler else: handler = self.module_handler[module] handler.wait() if name_index != 0: next_module = self.block_module[block_index][name_index - 1] - self.FSTP_global_weights[next_module], weights_handler = all_gather_raw(next_module.weight, self.process_group, async_op=True) + self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( + next_module.weight, self.process_group, async_op=True + ) self.module_handler[next_module] = weights_handler - + def _post_backward_hook(module, grad_input, grad_output): del self.FSTP_global_weights[module] - + for module in self.FSTP_modules: # import pdb; pdb.set_trace() module.register_forward_pre_hook(_pre_forward_hook) @@ -383,4 +397,145 @@ def _post_backward_hook(module, grad_input, grad_output): # module.register_backward_pre_hook(_pre_backward_hook) # module.register_backward_hook(_post_backward_hook) module.register_module_full_backward_pre_hook(_pre_backward_hook) - \ No newline at end of file + + +class CoarseGrainedFSTPAllGatherSyncHandler: + """ + All-gather handler for overlapping the all-gather in adjcent FSTP block. + """ + + def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> None: + # import pdb; pdb.set_trace() + self.process_group = process_group + self.FSTP_blocks = [] + self.FSTP_outs = [] + self.FSTP_wqkvs = [] + self.module_name = ["Wqkv", "out_proj", "w1", "w2", "w3"] + self.FSTP_global_handle = dict() # key: FSTP module; value: module global all-gather op handle + self.FSTP_global_weights = dict() # key: FSTP module; value: module global weight for forward + self.block_handles = dict() # key: transformer block; value: all-gather handles + self.module_to_index = dict() # key: FSTP module; value: transformer block index + self.block_to_index = dict() # key: transformer block; value: transformer block index + self.index_to_block = dict() # key: transformer block index; value: transformer block + self.index_to_fsdp_modules = dict() # key: transformer block index; value: fsdp modules + + # just want to share same for loop for ModuleList and Module + if not isinstance(model, nn.ModuleList): + model = [model] + + for _chunk in model: + if isinstance(_chunk, NaiveAMPModel): + _chunk = _chunk.model + + for _, children in _chunk.named_children(): + if isinstance(children, nn.ModuleList): + for idx, block in enumerate(children): + self.FSTP_blocks.append(block) + self.block_to_index[block] = idx + self.index_to_block[idx] = block + self.index_to_fsdp_modules[idx] = [] + for _, sub in block.named_children(): + sub_modules = list(sub.children()) + if len(sub_modules) > 0: + for name, child in sub.named_children(): + # print(f"name: {name}", flush=True) + if name == "out_proj": + self.FSTP_outs.append(child) + self.module_to_index[child] = idx + if name == "Wqkv": + self.FSTP_wqkvs.append(child) + self.module_to_index[child] = idx + if isinstance(child, FSTPLinear): + self.index_to_fsdp_modules[idx].append(child) + else: + continue + + def _all_gather_block_weight(self, block_index: int): + block = self.index_to_block[block_index] + fsdp_modules = self.index_to_fsdp_modules[block_index] + self.block_handles[block] = [] + for module in fsdp_modules: + total_weight, weight_handle = all_gather_raw(module.weight, self.process_group, async_op=True) + 
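
The coarse-grained handler prefetches a whole block at a time. The loop being assembled here amounts to the sketch below (prefetch_block is an illustrative wrapper around the same calls):

def prefetch_block(self, block_index: int):
    # Kick off async all-gathers for every FSTP linear of one block and
    # remember the handles so the block's pre-forward hook can wait on them.
    block = self.index_to_block[block_index]
    self.block_handles[block] = []
    for module in self.index_to_fsdp_modules[block_index]:
        total_weight, handle = all_gather_raw(module.weight, self.process_group, async_op=True)
        self.FSTP_global_weights[module] = total_weight
        self.block_handles[block].append(handle)

Forward prefetches block i+1 from block i's out_proj (the last linear to run); backward mirrors the chain from the other end.
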
self.FSTP_global_weights[module] = total_weight + self.block_handles[block].append(weight_handle) + + def _register_sync_parameters_hook(self) -> None: + """ + register pre_forward_hook and pre_backward_hook for FSTP block. + + Notice that next block's all_gather op should be after current block's all_to_all op, so we + 1. register pre_forward_hook @out_proj module to prefetch for next block + 2. register pre_forward_hook @block module to wait handles for next block + 3. register pre_backward_hook @wqkv module to prefetch for next block + 4. register pre_backward_hook @block module to wait handles for next block + """ + + def _pre_forward_hook_for_out_proj(module: nn.Module, inputs: Any): + block_index = self.module_to_index[module] + # start the all-gather for next block + if block_index + 1 < gpc.config.NUM_LAYER: + self._all_gather_block_weight(block_index + 1) + + def _pre_forward_hook_for_block(block: nn.Module, inputs: Any): + block_index = self.block_to_index[block] + if block_index == 0: + # all gather weight for block 0 + fsdp_modules = self.index_to_fsdp_modules[block_index] + for module in fsdp_modules: + total_weight, weight_handle = all_gather_raw(module.weight, self.process_group, async_op=True) + weight_handle.wait() + self.FSTP_global_weights[module] = total_weight + else: + # wait handle for current block + handles = self.block_handles[block] + for handle in handles: + handle.wait() + + def _post_forward_hook_for_block(block: nn.Module, input, output): + block_index = self.block_to_index[block] + fsdp_modules = self.index_to_fsdp_modules[block_index] + if block in self.block_handles: + del self.block_handles[block] + for module in fsdp_modules: + del self.FSTP_global_weights[module] + + def _pre_backward_hook_for_wqkv(module: nn.Module, grad_output): + block_index = self.module_to_index[module] + # start the all-gather for next block + if block_index - 1 >= 0: + self._all_gather_block_weight(block_index - 1) + + def _pre_backward_hook_for_block(block: nn.Module, grad_output): + block_index = self.block_to_index[block] + if block_index == gpc.config.NUM_LAYER - 1: + # all gather weight for the last block + fsdp_modules = self.index_to_fsdp_modules[block_index] + for module in fsdp_modules: + total_weight, weight_handle = all_gather_raw(module.weight, self.process_group, async_op=True) + weight_handle.wait() + self.FSTP_global_weights[module] = total_weight + else: + # wait handle for current block + handles = self.block_handles[block] + for handle in handles: + handle.wait() + + def _post_backward_hook_for_block(block: nn.Module, grad_input, grad_output): + block_index = self.block_to_index[block] + fsdp_modules = self.index_to_fsdp_modules[block_index] + if block in self.block_handles: + del self.block_handles[block] + for module in fsdp_modules: + del self.FSTP_global_weights[module] + + for block in self.FSTP_blocks: + block.register_forward_pre_hook(_pre_forward_hook_for_block) + block.register_forward_hook(_post_forward_hook_for_block) + block.register_full_backward_pre_hook(_pre_backward_hook_for_block) + block.register_full_backward_hook(_post_backward_hook_for_block) + + for out_proj in self.FSTP_outs: + out_proj.register_forward_pre_hook(_pre_forward_hook_for_out_proj) + + for wqkv in self.FSTP_wqkvs: + wqkv.register_full_backward_pre_hook(_pre_backward_hook_for_wqkv) diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 50b9bbd7..97319d98 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -284,7 +284,6 @@ class 
FSTPFusedDenseFunc(torch.autograd.Function): @staticmethod @custom_fwd def forward(ctx, x, weight, bias, return_residual=False, process_group=None, module=None, all_gather_handler=None): - ctx.compute_weight_gradient = weight.requires_grad ctx.return_residual = return_residual ctx.process_group = process_group @@ -297,16 +296,18 @@ def forward(ctx, x, weight, bias, return_residual=False, process_group=None, mod world_size = gpc.get_world_size(ParallelMode.TENSOR) if world_size > 1: - total_weight = all_gather_handler.FSTP_global_weights[module] - total_bias = bias - # # do all_gather for weight and bias before actual computation - # total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) - # if bias is not None: - # total_bias, handle_bias = all_gather_raw(bias, process_group, async_op=True) - # handle_bias.wait() - # else: - # total_bias = bias - # handle_weight.wait() + # do all_gather for weight and bias before actual computation + if module in all_gather_handler.FSTP_global_weights: + total_weight = all_gather_handler.FSTP_global_weights[module] + else: + total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) + handle_weight.wait() + + if bias is not None: + total_bias, handle_bias = all_gather_raw(bias, process_group, async_op=True) + handle_bias.wait() + else: + total_bias = bias else: total_weight = weight total_bias = bias @@ -351,12 +352,14 @@ def backward(ctx, grad_output, *args): world_size = gpc.get_world_size(ParallelMode.TENSOR) if world_size > 1: # do all-gather for weight before backward - # total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) - # handle_weight.wait() - total_weight = all_gather_handler.FSTP_global_weights[module] + if module in all_gather_handler.FSTP_global_weights: + total_weight = all_gather_handler.FSTP_global_weights[module] + else: + total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) + handle_weight.wait() else: total_weight = weight - + # compute weight grad if ctx.needs_input_grad[1]: assert ctx.compute_weight_gradient @@ -380,7 +383,7 @@ def backward(ctx, grad_output, *args): grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) else: grad_input = None - + if ctx.needs_input_grad[1]: if world_size > 1: handle_grad_weight.wait() @@ -408,7 +411,13 @@ def fused_dense_func_torch( def fstp_fused_dense_func( - x: Tensor, weight: Tensor, bias: Optional[Tensor] = None, return_residual: bool = False, process_group=None, module=None, handler=None + x: Tensor, + weight: Tensor, + bias: Optional[Tensor] = None, + return_residual: bool = False, + process_group=None, + module=None, + handler=None, ): dtype_eligible = x.dtype in [torch.float16, torch.bfloat16] or ( x.dtype == torch.float32 and torch.is_autocast_enabled() @@ -460,5 +469,3 @@ def Silu(w1_o, w2_o): Silu = torch.jit.script(Silu) - - diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 5deb0233..da59803c 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -36,10 +36,11 @@ from internlm.data.utils import DATASET_TYPE_IDS_MAP, unpack_data from internlm.model.embedding import Embedding1D from internlm.model.linear import ( + CoarseGrainedFSTPAllGatherSyncHandler, FeedForward, + FSTPAllGatherSyncHandler, RewardModelLinear, ScaleColumnParallelLinear, - FSTPAllGatherSyncHandler, ) from internlm.model.multi_head_attention import MHA from internlm.model.utils import try_import_RMSNorm @@ -107,13 
+108,14 @@ def initialize_model(): # if fsdp enabled, wrap the model model = wrap_FSDP_model(model) - + if gpc.config.parallel["tensor"]["mode"] == "fstp": - handler = FSTPAllGatherSyncHandler(model, gpc.get_group(ParallelMode.TENSOR)) + handler = CoarseGrainedFSTPAllGatherSyncHandler(model, gpc.get_group(ParallelMode.TENSOR)) handler._register_sync_parameters_hook() gpc.config.fstp_handler = handler return model + def wrap_FSDP_model(model: Union[nn.Module, nn.ModuleList]): if gpc.config.parallel.zero1.fsdp: # set wrap_policy for fsdp wrap From d0f0c22cace187e62890aa34c3a0595115ceb394 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Fri, 13 Oct 2023 11:10:23 +0800 Subject: [PATCH 021/153] feat(model/linear.py): change pre backward from wqkv to block --- internlm/model/linear.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 3e37863d..56929eea 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -520,6 +520,10 @@ def _pre_backward_hook_for_block(block: nn.Module, grad_output): for handle in handles: handle.wait() + # start the all-gather for next block + if block_index - 1 >= 0: + self._all_gather_block_weight(block_index - 1) + def _post_backward_hook_for_block(block: nn.Module, grad_input, grad_output): block_index = self.block_to_index[block] fsdp_modules = self.index_to_fsdp_modules[block_index] @@ -537,5 +541,5 @@ def _post_backward_hook_for_block(block: nn.Module, grad_input, grad_output): for out_proj in self.FSTP_outs: out_proj.register_forward_pre_hook(_pre_forward_hook_for_out_proj) - for wqkv in self.FSTP_wqkvs: - wqkv.register_full_backward_pre_hook(_pre_backward_hook_for_wqkv) + # for wqkv in self.FSTP_wqkvs: + # wqkv.register_full_backward_pre_hook(_pre_backward_hook_for_wqkv) From 82204eea59862b01c5aca68cad26c5060b1b7b16 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Mon, 16 Oct 2023 16:35:14 +0800 Subject: [PATCH 022/153] support hybrid overlap --- configs/7B_sft.py | 4 +- internlm/model/linear.py | 82 +++++++++++++++++++++++++---- internlm/train/training_internlm.py | 3 +- 3 files changed, 75 insertions(+), 14 deletions(-) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 814966b1..98bceeb4 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -2,10 +2,10 @@ DO_ALERT = False SEQ_LEN = 4096 -HIDDEN_SIZE = 4096 +HIDDEN_SIZE = 8192 NUM_ATTENTION_HEAD = 32 MLP_RATIO = 8 / 3 -NUM_LAYER = 32 +NUM_LAYER = 8 VOCAB_SIZE = 103168 MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 56929eea..890f1cb0 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -360,10 +360,12 @@ def _pre_forward_hook(module: nn.Module, inputs: Any): self.module_handler[next_module] = weights_handler def _post_forward_hook(module: nn.Module, input, output): - del self.FSTP_global_weights[module] - del self.module_handler[module] + if module in self.FSTP_global_weights: + del self.FSTP_global_weights[module] + if module in self.module_handler: + del self.module_handler[module] - def _pre_backward_hook(module: nn.Module, grad_input, grad_output): + def _pre_backward_hook(module: nn.Module, grad_output): block_index = self.module_block[module] name_index = self.module_name_index[module] if name_index == 4: @@ -396,7 +398,8 @@ def _post_backward_hook(module, grad_input, grad_output): module.register_forward_hook(_post_forward_hook) # 
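
The registration corrected just below leans on PyTorch's module hook API: register_backward_pre_hook does not exist as an nn.Module method, and register_module_full_backward_pre_hook is the module-level global registrar rather than a method, which is why the call moves to register_full_backward_pre_hook (an nn.Module method since torch 2.0). The full pattern as an illustrative helper (attach_overlap_hooks is not in the repo):

import torch.nn as nn

def attach_overlap_hooks(linear: nn.Module, pre_fwd, post_fwd, pre_bwd, post_bwd):
    linear.register_forward_pre_hook(pre_fwd)        # hook(module, inputs): wait / prefetch weights
    linear.register_forward_hook(post_fwd)           # hook(module, inputs, output): free gathered weight
    linear.register_full_backward_pre_hook(pre_bwd)  # hook(module, grad_output): prefetch for backward
    linear.register_full_backward_hook(post_bwd)     # hook(module, grad_input, grad_output): cleanup
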
module.register_backward_pre_hook(_pre_backward_hook) # module.register_backward_hook(_post_backward_hook) - module.register_module_full_backward_pre_hook(_pre_backward_hook) + module.register_full_backward_pre_hook(_pre_backward_hook) + module.register_full_backward_hook(_post_backward_hook) class CoarseGrainedFSTPAllGatherSyncHandler: @@ -410,6 +413,7 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non self.FSTP_blocks = [] self.FSTP_outs = [] self.FSTP_wqkvs = [] + self.FSTP_modules = [] self.module_name = ["Wqkv", "out_proj", "w1", "w2", "w3"] self.FSTP_global_handle = dict() # key: FSTP module; value: module global all-gather op handle self.FSTP_global_weights = dict() # key: FSTP module; value: module global weight for forward @@ -418,6 +422,8 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non self.block_to_index = dict() # key: transformer block; value: transformer block index self.index_to_block = dict() # key: transformer block index; value: transformer block self.index_to_fsdp_modules = dict() # key: transformer block index; value: fsdp modules + self.module_name_index = dict() # key: FSTP module; value: the name in index in self.module_name + self.block_module = dict() # key: transformer block index; value: {name_index: FSTP module} # just want to share same for loop for ModuleList and Module if not isinstance(model, nn.ModuleList): @@ -430,6 +436,8 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non for _, children in _chunk.named_children(): if isinstance(children, nn.ModuleList): for idx, block in enumerate(children): + index = 0 + self.block_module[idx] = {} self.FSTP_blocks.append(block) self.block_to_index[block] = idx self.index_to_block[idx] = block @@ -441,12 +449,17 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non # print(f"name: {name}", flush=True) if name == "out_proj": self.FSTP_outs.append(child) - self.module_to_index[child] = idx + # self.module_to_index[child] = idx if name == "Wqkv": self.FSTP_wqkvs.append(child) - self.module_to_index[child] = idx + # self.module_to_index[child] = idx if isinstance(child, FSTPLinear): + self.module_to_index[child] = idx + self.block_module[idx][index] = child + self.FSTP_modules.append(child) self.index_to_fsdp_modules[idx].append(child) + self.module_name_index[child] = index + index = index + 1 else: continue @@ -457,6 +470,7 @@ def _all_gather_block_weight(self, block_index: int): for module in fsdp_modules: total_weight, weight_handle = all_gather_raw(module.weight, self.process_group, async_op=True) self.FSTP_global_weights[module] = total_weight + self.FSTP_global_handle[module] = weight_handle self.block_handles[block].append(weight_handle) def _register_sync_parameters_hook(self) -> None: @@ -498,6 +512,19 @@ def _post_forward_hook_for_block(block: nn.Module, input, output): del self.block_handles[block] for module in fsdp_modules: del self.FSTP_global_weights[module] + + + def _pre_forward_hook_for_module(module: nn.Module, inputs: Any): + block_index = self.module_to_index[module] + if block_index != 0: + handler = self.FSTP_global_handle[module] + handler.wait() + + def _post_forward_hook_for_module(module: nn.Module, input, output): + if module in self.FSTP_global_weights: + del self.FSTP_global_weights[module] + if module in self.FSTP_global_handle: + del self.FSTP_global_handle[module] def _pre_backward_hook_for_wqkv(module: nn.Module, grad_output): block_index = 
self.module_to_index[module] @@ -531,15 +558,48 @@ def _post_backward_hook_for_block(block: nn.Module, grad_input, grad_output): del self.block_handles[block] for module in fsdp_modules: del self.FSTP_global_weights[module] + + def _pre_backward_hook_for_module(module: nn.Module, grad_output): + block_index = self.module_to_index[module] + name_index = self.module_name_index[module] + if name_index == 4: + total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) + weight_handler.wait() + self.FSTP_global_weights[module] = total_weight - for block in self.FSTP_blocks: - block.register_forward_pre_hook(_pre_forward_hook_for_block) - block.register_forward_hook(_post_forward_hook_for_block) - block.register_full_backward_pre_hook(_pre_backward_hook_for_block) - block.register_full_backward_hook(_post_backward_hook_for_block) + # start the all-gather for next module + next_module = self.block_module[block_index][name_index - 1] + self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( + next_module.weight, self.process_group, async_op=True + ) + self.FSTP_global_handle[next_module] = weights_handler + else: + handler = self.FSTP_global_handle[module] + handler.wait() + if name_index != 0: + next_module = self.block_module[block_index][name_index - 1] + self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( + next_module.weight, self.process_group, async_op=True + ) + self.FSTP_global_handle[next_module] = weights_handler + + def _post_backward_hook_for_module(module, grad_input, grad_output): + del self.FSTP_global_weights[module] + + # for block in self.FSTP_blocks: + # block.register_forward_pre_hook(_pre_forward_hook_for_block) + # block.register_forward_hook(_post_forward_hook_for_block) + # block.register_full_backward_pre_hook(_pre_backward_hook_for_block) + # block.register_full_backward_hook(_post_backward_hook_for_block) for out_proj in self.FSTP_outs: out_proj.register_forward_pre_hook(_pre_forward_hook_for_out_proj) # for wqkv in self.FSTP_wqkvs: # wqkv.register_full_backward_pre_hook(_pre_backward_hook_for_wqkv) + + for module in self.FSTP_modules: + module.register_forward_pre_hook(_pre_forward_hook_for_module) + module.register_forward_hook(_post_forward_hook_for_module) + module.register_full_backward_pre_hook(_pre_backward_hook_for_module) + module.register_full_backward_hook(_post_backward_hook_for_module) diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index da59803c..572adbad 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -110,7 +110,8 @@ def initialize_model(): model = wrap_FSDP_model(model) if gpc.config.parallel["tensor"]["mode"] == "fstp": - handler = CoarseGrainedFSTPAllGatherSyncHandler(model, gpc.get_group(ParallelMode.TENSOR)) + # handler = CoarseGrainedFSTPAllGatherSyncHandler(model, gpc.get_group(ParallelMode.TENSOR)) + handler = FSTPAllGatherSyncHandler(model, gpc.get_group(ParallelMode.TENSOR)) handler._register_sync_parameters_hook() gpc.config.fstp_handler = handler return model From 0d1fa037ddd3c899e3c42fbb9c013b17c4dd03dc Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 16 Oct 2023 20:13:59 +0800 Subject: [PATCH 023/153] feat(model/linear.py): set block 0 full weight --- internlm/model/linear.py | 133 +++++++++++++++---------- internlm/model/modeling_internlm.py | 6 +- internlm/model/multi_head_attention.py | 53 ++++++---- internlm/train/training_internlm.py | 13 ++- 4 files 
changed, 131 insertions(+), 74 deletions(-) diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 890f1cb0..8a17c719 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -175,6 +175,7 @@ def __init__( device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, multiple_of: int = 256, + block_idx: int = 0, ): super().__init__() @@ -248,38 +249,62 @@ def __init__( device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, multiple_of: int = 256, + block_idx: int = 0, ): super().__init__() hidden_features = multiple_of * ((hidden_features + multiple_of - 1) // multiple_of) - self.w1 = FSTPLinear( - in_features, - hidden_features, - process_group, - bias, - sequence_parallel=gpc.config.parallel.sequence_parallel, - device=device, - dtype=dtype, - ) - self.w2 = FSTPLinear( - in_features, - hidden_features, - process_group, - bias, - sequence_parallel=gpc.config.parallel.sequence_parallel, - device=device, - dtype=dtype, - ) - self.w3 = FSTPLinear( - hidden_features, - out_features, - process_group, - bias=bias, - sequence_parallel=gpc.config.parallel.sequence_parallel, - device=device, - dtype=dtype, - ) + if block_idx == 0 and gpc.config.parallel.block_0_full_weight: + self.w1 = nn.Linear( + in_features, + hidden_features, + bias, + device=device, + dtype=dtype, + ) + self.w2 = nn.Linear( + in_features, + hidden_features, + bias, + device=device, + dtype=dtype, + ) + self.w3 = nn.Linear( + hidden_features, + out_features, + bias=bias, + device=device, + dtype=dtype, + ) + else: + self.w1 = FSTPLinear( + in_features, + hidden_features, + process_group, + bias, + sequence_parallel=gpc.config.parallel.sequence_parallel, + device=device, + dtype=dtype, + ) + self.w2 = FSTPLinear( + in_features, + hidden_features, + process_group, + bias, + sequence_parallel=gpc.config.parallel.sequence_parallel, + device=device, + dtype=dtype, + ) + self.w3 = FSTPLinear( + hidden_features, + out_features, + process_group, + bias=bias, + sequence_parallel=gpc.config.parallel.sequence_parallel, + device=device, + dtype=dtype, + ) def forward(self, x): w1_o = self.w1(x) @@ -449,10 +474,10 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non # print(f"name: {name}", flush=True) if name == "out_proj": self.FSTP_outs.append(child) - # self.module_to_index[child] = idx + self.module_to_index[child] = idx if name == "Wqkv": self.FSTP_wqkvs.append(child) - # self.module_to_index[child] = idx + self.module_to_index[child] = idx if isinstance(child, FSTPLinear): self.module_to_index[child] = idx self.block_module[idx][index] = child @@ -489,6 +514,7 @@ def _pre_forward_hook_for_out_proj(module: nn.Module, inputs: Any): # start the all-gather for next block if block_index + 1 < gpc.config.NUM_LAYER: self._all_gather_block_weight(block_index + 1) + # print(f"_all_gather_block_weight for block {block_index+1}", flush=True) def _pre_forward_hook_for_block(block: nn.Module, inputs: Any): block_index = self.block_to_index[block] @@ -512,14 +538,13 @@ def _post_forward_hook_for_block(block: nn.Module, input, output): del self.block_handles[block] for module in fsdp_modules: del self.FSTP_global_weights[module] - - + def _pre_forward_hook_for_module(module: nn.Module, inputs: Any): block_index = self.module_to_index[module] if block_index != 0: handler = self.FSTP_global_handle[module] handler.wait() - + def _post_forward_hook_for_module(module: nn.Module, input, output): if module in self.FSTP_global_weights: del 
self.FSTP_global_weights[module] @@ -558,46 +583,48 @@ def _post_backward_hook_for_block(block: nn.Module, grad_input, grad_output): del self.block_handles[block] for module in fsdp_modules: del self.FSTP_global_weights[module] - + def _pre_backward_hook_for_module(module: nn.Module, grad_output): block_index = self.module_to_index[module] name_index = self.module_name_index[module] - if name_index == 4: - total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) - weight_handler.wait() - self.FSTP_global_weights[module] = total_weight + if block_index != 0: + if name_index == 4: + total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) + weight_handler.wait() + self.FSTP_global_weights[module] = total_weight - # start the all-gather for next module - next_module = self.block_module[block_index][name_index - 1] - self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( - next_module.weight, self.process_group, async_op=True - ) - self.FSTP_global_handle[next_module] = weights_handler - else: - handler = self.FSTP_global_handle[module] - handler.wait() - if name_index != 0: + # start the all-gather for next module next_module = self.block_module[block_index][name_index - 1] self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( next_module.weight, self.process_group, async_op=True ) self.FSTP_global_handle[next_module] = weights_handler + else: + handler = self.FSTP_global_handle[module] + handler.wait() + if name_index != 0: + next_module = self.block_module[block_index][name_index - 1] + self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( + next_module.weight, self.process_group, async_op=True + ) + self.FSTP_global_handle[next_module] = weights_handler def _post_backward_hook_for_module(module, grad_input, grad_output): - del self.FSTP_global_weights[module] + if module in self.FSTP_global_weights: + del self.FSTP_global_weights[module] # for block in self.FSTP_blocks: - # block.register_forward_pre_hook(_pre_forward_hook_for_block) - # block.register_forward_hook(_post_forward_hook_for_block) - # block.register_full_backward_pre_hook(_pre_backward_hook_for_block) - # block.register_full_backward_hook(_post_backward_hook_for_block) + # block.register_forward_pre_hook(_pre_forward_hook_for_block) + # block.register_forward_hook(_post_forward_hook_for_block) + # block.register_full_backward_pre_hook(_pre_backward_hook_for_block) + # block.register_full_backward_hook(_post_backward_hook_for_block) for out_proj in self.FSTP_outs: out_proj.register_forward_pre_hook(_pre_forward_hook_for_out_proj) # for wqkv in self.FSTP_wqkvs: # wqkv.register_full_backward_pre_hook(_pre_backward_hook_for_wqkv) - + for module in self.FSTP_modules: module.register_forward_pre_hook(_pre_forward_hook_for_module) module.register_forward_hook(_post_forward_hook_for_module) diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index 228dbd34..cb933960 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -78,6 +78,7 @@ def __init__( use_swiglu: bool = True, use_flash_attn: bool = True, tp_mode: str = "origin_tp", + block_idx: int = 0, ): super().__init__() self.checkpoint = checkpoint @@ -103,6 +104,7 @@ def __init__( device=device, dtype=dtype, tp_mode=tp_mode, + block_idx=block_idx, ) self.dropout1 = nn.Dropout(drop_rate) @@ -123,6 +125,7 @@ def __init__( bias=False, device=device, dtype=dtype, + 
block_idx=block_idx, ) else: self.mlp = ParallelFusedMLP( @@ -344,6 +347,7 @@ def __init__( use_swiglu=use_swiglu, use_flash_attn=use_flash_attn, tp_mode=self.tp_mode, + block_idx=lid, ) for lid in range(num_layers) ] @@ -410,7 +414,7 @@ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N # Evaluation if hidden_states.ndim == 3: hidden_states = self.head(hidden_states, gather_dim=1) - else: # Training + else: # Training hidden_states = self.head(hidden_states, gather_dim=0) if not self.parallel_output: diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index 1db98d7e..6c1e7d89 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -51,7 +51,6 @@ class _SeqAllToAll(torch.autograd.Function): @staticmethod def forward(ctx: Any, group: dist.ProcessGroup, input_: Tensor, scatter_idx: int, gather_idx: int) -> Tensor: - ctx.group = group ctx.scatter_idx = scatter_idx ctx.gather_idx = gather_idx @@ -91,7 +90,6 @@ def __init__( second_scatter_idx: int = 0, second_gather_idx: int = 1, ) -> None: - super().__init__() self.local_attn = local_attention self.spg = sequence_process_group @@ -178,6 +176,7 @@ def __init__( device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, tp_mode: str = "origin_tp", + block_idx: int = 0, ) -> None: factory_kwargs = {"device": device, "dtype": dtype} super().__init__() @@ -206,14 +205,23 @@ def __init__( # notice here should change bias=True Wqkv_cls = ColumnParallelLinearTorch if tp_mode == "origin_tp" else FSTPLinear - self.Wqkv = Wqkv_cls( - embed_dim, - 3 * embed_dim, - process_group, - bias=False, - sequence_parallel=gpc.config.parallel.sequence_parallel, - **factory_kwargs, - ) # according to https://spaces.ac.cn/archives/9577 + if block_idx == 0 and tp_mode != "origin_tp" and gpc.config.parallel.block_0_full_weight: + Wqkv_cls = nn.Linear + self.Wqkv = Wqkv_cls( + embed_dim, + 3 * embed_dim, + bias=False, + **factory_kwargs, + ) + else: + self.Wqkv = Wqkv_cls( + embed_dim, + 3 * embed_dim, + process_group, + bias=False, + sequence_parallel=gpc.config.parallel.sequence_parallel, + **factory_kwargs, + ) # according to https://spaces.ac.cn/archives/9577 inner_attn_cls = FlashSelfAttention if use_flash_attn else SelfAttention inner_cross_attn_cls = FlashCrossAttention if use_flash_attn else CrossAttention @@ -227,14 +235,23 @@ def __init__( # output projection always have the bias (for now) out_proj_cls = RowParallelLinearTorch if tp_mode == "origin_tp" else FSTPLinear - self.out_proj = out_proj_cls( - embed_dim, - embed_dim, - process_group, - bias=False, - sequence_parallel=gpc.config.parallel.sequence_parallel, - **factory_kwargs, - ) + if block_idx == 0 and tp_mode != "origin_tp" and gpc.config.parallel.block_0_full_weight: + out_proj_cls = nn.Linear + self.out_proj = out_proj_cls( + embed_dim, + embed_dim, + bias=False, + **factory_kwargs, + ) + else: + self.out_proj = out_proj_cls( + embed_dim, + embed_dim, + process_group, + bias=False, + sequence_parallel=gpc.config.parallel.sequence_parallel, + **factory_kwargs, + ) # need to assign tp attribute so that internlm know it is tensor parallel module if gpc.get_world_size(ParallelMode.TENSOR) > 1: for name in ["out_proj", "Wqkv"]: diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 572adbad..24040a02 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -110,8 +110,8 @@ def initialize_model(): 
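
The dispatch introduced in this commit keeps block 0 unsharded: the first block's gathers have no earlier compute to overlap with, so it trades memory for latency by holding full weights. Reduced to a sketch (assuming the block_0_full_weight flag this patch adds to the parallel config):

from torch import nn
from internlm.core.context import global_context as gpc
from internlm.model.linear import FSTPLinear

def make_proj(in_f, out_f, process_group, block_idx, **factory_kwargs):
    # Plain nn.Linear for block 0, sharded FSTPLinear everywhere else.
    if block_idx == 0 and gpc.config.parallel.block_0_full_weight:
        return nn.Linear(in_f, out_f, bias=False, **factory_kwargs)
    return FSTPLinear(
        in_f, out_f, process_group, bias=False,
        sequence_parallel=gpc.config.parallel.sequence_parallel, **factory_kwargs,
    )
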
model = wrap_FSDP_model(model) if gpc.config.parallel["tensor"]["mode"] == "fstp": - # handler = CoarseGrainedFSTPAllGatherSyncHandler(model, gpc.get_group(ParallelMode.TENSOR)) - handler = FSTPAllGatherSyncHandler(model, gpc.get_group(ParallelMode.TENSOR)) + handler = CoarseGrainedFSTPAllGatherSyncHandler(model, gpc.get_group(ParallelMode.TENSOR)) + # handler = FSTPAllGatherSyncHandler(model, gpc.get_group(ParallelMode.TENSOR)) handler._register_sync_parameters_hook() gpc.config.fstp_handler = handler return model @@ -396,6 +396,9 @@ def initialize_llm_profile(profiling: bool = False, start_time: str = None): ) +tgs_list = [] + + @llm_timeout(func_name="record_current_batch_training_metrics") def record_current_batch_training_metrics( get_tflops_func, @@ -568,3 +571,9 @@ def record_current_batch_training_metrics( step_count=batch_count, cur_step_loss=loss.item(), ) + + if batch_count >= 5: + tgs_list.append(tgs_origin) + if batch_count == gpc.config.data.total_steps - 1: + print(tgs_list, flush=True) + print(f"avg_tgs: {sum(tgs_list)/len(tgs_list)}", flush=True) From d1af0d6aee32a71385ef89983aef9ebb2417752c Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Tue, 17 Oct 2023 10:13:56 +0800 Subject: [PATCH 024/153] feat(model/linear.py): block-grained backward --- configs/7B_sft.py | 9 ++--- internlm/model/linear.py | 77 +++++++++++++++++++++------------------- 2 files changed, 46 insertions(+), 40 deletions(-) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 98bceeb4..36f9ac14 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -5,7 +5,7 @@ HIDDEN_SIZE = 8192 NUM_ATTENTION_HEAD = 32 MLP_RATIO = 8 / 3 -NUM_LAYER = 8 +NUM_LAYER = 4 VOCAB_SIZE = 103168 MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" @@ -57,7 +57,7 @@ # defaults to 0, means disable evaluate valid_every=50, pack_sample_into_one=False, - total_steps=50000, + total_steps=20, skip_batches="", rampup_batch_size="", # Datasets with less than 50 rows will be discarded @@ -161,10 +161,11 @@ sequence parallel (bool): enable/disable sequence parallel, defaults to False. """ parallel = dict( - zero1=dict(size=1, fsdp=False), - tensor=dict(size=8, mode='fstp'), # the mode should be 'origin_tp' or 'fstp'. 
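
With block 0 resident in full, the backward prefetch in the next hunks becomes block-grained and stops one block early: when a block's backward begins, the gathers for block i-1 are launched, and the guard `block_index - 1 > 0` (rather than >= 0) skips block 0, which has nothing left to gather. As a sketch, in terms of the prefetch_block helper outlined earlier:

def _pre_backward_hook_for_block(self, block, grad_output):
    i = self.block_to_index[block]
    if i - 1 > 0:  # stop before block 0, whose weights are already full
        self.prefetch_block(i - 1)
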
if the mode is 'fstp', the sequence_parallel should be True + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, mode="fstp"), pipeline=dict(size=1, interleaved_overlap=True), sequence_parallel=True, + block_0_full_weight=True, ) cudnn_deterministic = False diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 8a17c719..8e19ab69 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -559,21 +559,21 @@ def _pre_backward_hook_for_wqkv(module: nn.Module, grad_output): def _pre_backward_hook_for_block(block: nn.Module, grad_output): block_index = self.block_to_index[block] - if block_index == gpc.config.NUM_LAYER - 1: - # all gather weight for the last block - fsdp_modules = self.index_to_fsdp_modules[block_index] - for module in fsdp_modules: - total_weight, weight_handle = all_gather_raw(module.weight, self.process_group, async_op=True) - weight_handle.wait() - self.FSTP_global_weights[module] = total_weight - else: - # wait handle for current block - handles = self.block_handles[block] - for handle in handles: - handle.wait() + # if block_index == gpc.config.NUM_LAYER - 1: + # # all gather weight for the last block + # fsdp_modules = self.index_to_fsdp_modules[block_index] + # for module in fsdp_modules: + # total_weight, weight_handle = all_gather_raw(module.weight, self.process_group, async_op=True) + # weight_handle.wait() + # self.FSTP_global_weights[module] = total_weight + # else: + # # wait handle for current block + # handles = self.block_handles[block] + # for handle in handles: + # handle.wait() # start the all-gather for next block - if block_index - 1 >= 0: + if block_index - 1 > 0: self._all_gather_block_weight(block_index - 1) def _post_backward_hook_for_block(block: nn.Module, grad_input, grad_output): @@ -588,36 +588,41 @@ def _pre_backward_hook_for_module(module: nn.Module, grad_output): block_index = self.module_to_index[module] name_index = self.module_name_index[module] if block_index != 0: - if name_index == 4: - total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) - weight_handler.wait() - self.FSTP_global_weights[module] = total_weight - - # start the all-gather for next module - next_module = self.block_module[block_index][name_index - 1] - self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( - next_module.weight, self.process_group, async_op=True - ) - self.FSTP_global_handle[next_module] = weights_handler - else: + # if name_index == 4: + # total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) + # weight_handler.wait() + # self.FSTP_global_weights[module] = total_weight + + # # start the all-gather for next module + # next_module = self.block_module[block_index][name_index - 1] + # self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( + # next_module.weight, self.process_group, async_op=True + # ) + # self.FSTP_global_handle[next_module] = weights_handler + # else: + # handler = self.FSTP_global_handle[module] + # handler.wait() + # if name_index != 0: + # next_module = self.block_module[block_index][name_index - 1] + # self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( + # next_module.weight, self.process_group, async_op=True + # ) + # self.FSTP_global_handle[next_module] = weights_handler + if module in self.FSTP_global_handle: handler = self.FSTP_global_handle[module] handler.wait() - if name_index != 0: - next_module = self.block_module[block_index][name_index - 1] - 
self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( - next_module.weight, self.process_group, async_op=True - ) - self.FSTP_global_handle[next_module] = weights_handler def _post_backward_hook_for_module(module, grad_input, grad_output): if module in self.FSTP_global_weights: del self.FSTP_global_weights[module] + if module in self.FSTP_global_handle: + del self.FSTP_global_handle[module] - # for block in self.FSTP_blocks: - # block.register_forward_pre_hook(_pre_forward_hook_for_block) - # block.register_forward_hook(_post_forward_hook_for_block) - # block.register_full_backward_pre_hook(_pre_backward_hook_for_block) - # block.register_full_backward_hook(_post_backward_hook_for_block) + for block in self.FSTP_blocks: + # block.register_forward_pre_hook(_pre_forward_hook_for_block) + # block.register_forward_hook(_post_forward_hook_for_block) + block.register_full_backward_pre_hook(_pre_backward_hook_for_block) + # block.register_full_backward_hook(_post_backward_hook_for_block) for out_proj in self.FSTP_outs: out_proj.register_forward_pre_hook(_pre_forward_hook_for_out_proj) From 229cc5c68c518734edfc01d36e6bd616d32a7224 Mon Sep 17 00:00:00 2001 From: "chenxun.p" Date: Tue, 17 Oct 2023 11:15:54 +0800 Subject: [PATCH 025/153] impl reduce scatter async --- .../core/scheduler/no_pipeline_scheduler.py | 1 + internlm/model/linear.py | 23 +++++++++-- internlm/model/utils.py | 28 ++++++++------ .../solver/optimizer/hybrid_zero_optim.py | 38 +++++++++++++++++-- 4 files changed, 71 insertions(+), 19 deletions(-) diff --git a/internlm/core/scheduler/no_pipeline_scheduler.py b/internlm/core/scheduler/no_pipeline_scheduler.py index 56661d8c..f0caf05c 100644 --- a/internlm/core/scheduler/no_pipeline_scheduler.py +++ b/internlm/core/scheduler/no_pipeline_scheduler.py @@ -194,6 +194,7 @@ def forward_backward_step( _output, _loss, _moe_loss = self._train_one_batch( _data, _label, engine, forward_only, return_loss, self._grad_accum_size ) + engine.optimizer.reset_reduce_bucket() if return_loss: loss += _loss diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 8e19ab69..b141829e 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -329,6 +329,8 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non self.block_module = dict() # key: transformer block index; value: {name_index: FSTP module} self.module_name_index = dict() # key: FSTP module; value: the name in index in self.module_name + self.reduce_scatter_handlers = {} + # just want to share same for loop for ModuleList and Module if not isinstance(model, nn.ModuleList): model = [model] @@ -337,16 +339,22 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non if isinstance(_chunk, NaiveAMPModel): _chunk = _chunk.model - for _, children in _chunk.named_children(): + for _chunk_name, children in _chunk.named_children(): if isinstance(children, nn.ModuleList): for idx, block in enumerate(children): index = 0 self.block_module[idx] = {} - for _, sub in block.named_children(): + for _sub_name, sub in block.named_children(): sub_modules = list(sub.children()) if len(sub_modules) > 0: for name, child in sub.named_children(): if isinstance(child, FSTPLinear): + + _full_name = f"{_chunk_name}.{idx}.{_sub_name}.{name}" + setattr(child.weight, "_fstp_reduce_scatter_str", f"{_full_name}.weight") + if child.bias is not None: + setattr(child.bias, "_fstp_reduce_scatter_str", f"{_full_name}.bias") + self.FSTP_modules.append(child) self.module_block[child] = 
idx self.block_module[idx][index] = child @@ -450,6 +458,8 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non self.module_name_index = dict() # key: FSTP module; value: the name in index in self.module_name self.block_module = dict() # key: transformer block index; value: {name_index: FSTP module} + self.reduce_scatter_handlers = {} + # just want to share same for loop for ModuleList and Module if not isinstance(model, nn.ModuleList): model = [model] @@ -458,7 +468,7 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non if isinstance(_chunk, NaiveAMPModel): _chunk = _chunk.model - for _, children in _chunk.named_children(): + for _chunk_name, children in _chunk.named_children(): if isinstance(children, nn.ModuleList): for idx, block in enumerate(children): index = 0 @@ -467,7 +477,7 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non self.block_to_index[block] = idx self.index_to_block[idx] = block self.index_to_fsdp_modules[idx] = [] - for _, sub in block.named_children(): + for _sub_name, sub in block.named_children(): sub_modules = list(sub.children()) if len(sub_modules) > 0: for name, child in sub.named_children(): @@ -485,6 +495,11 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non self.index_to_fsdp_modules[idx].append(child) self.module_name_index[child] = index index = index + 1 + + _full_name = f"{_chunk_name}.{idx}.{_sub_name}.{name}" + setattr(child.weight, "_fstp_reduce_scatter_str", f"{_full_name}.weight") + if child.bias is not None: + setattr(child.bias, "_fstp_reduce_scatter_str", f"{_full_name}.bias") else: continue diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 97319d98..78ad456d 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -324,9 +324,9 @@ def forward(ctx, x, weight, bias, return_residual=False, process_group=None, mod raise RuntimeError("fused_dense only supports matrix dims <= 2M") output = F.linear(total_x, total_weight, total_bias) if ctx.compute_weight_gradient: - ctx.save_for_backward(x, weight) + ctx.save_for_backward(x, weight, bias) else: - ctx.save_for_backward(weight) + ctx.save_for_backward(weight, bias) return output if not return_residual else (output, x) @staticmethod @@ -340,10 +340,10 @@ def backward(ctx, grad_output, *args): all_gather_handler = ctx.all_gather_handler module = ctx.module if ctx.compute_weight_gradient: - x, weight = ctx.saved_tensors + x, weight, bias = ctx.saved_tensors total_x = x else: - (weight,) = ctx.saved_tensors + weight, bias = ctx.saved_tensors total_x = None batch_shape = grad_output.shape[:-1] batch_dim = batch_shape.numel() @@ -368,9 +368,15 @@ def backward(ctx, grad_output, *args): total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2] ) if world_size > 1: - grad_weight, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) + grad_weight_async, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) + assert hasattr(weight, "_fstp_reduce_scatter_str") + all_gather_handler.reduce_scatter_handlers[weight._fstp_reduce_scatter_str] = (handle_grad_weight, grad_weight_async) + grad_weight = torch.empty(grad_weight.shape[0]//torch.distributed.get_world_size(process_group), *grad_weight.shape[1:], dtype=grad_weight.dtype, device=grad_weight.device) if grad_bias is not None: - grad_bias, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) + 
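
The pattern in this hunk defers gradient reduction: backward launches an async reduce-scatter, stashes the (handle, sharded grad) pair under the parameter's _fstp_reduce_scatter_str key, and hands autograd a placeholder of the sharded shape; the optimizer waits on the handle and installs the real shard later (see reset_reduce_bucket in the optimizer hunk below). A condensed sketch, using zeros for the placeholder since the optimizer-side fix in a later commit accumulates with `+=`:

import torch
import torch.distributed as dist
from flash_attn.utils.distributed import reduce_scatter_raw  # as used throughout this series

def deferred_reduce_scatter(handler, param, full_grad, process_group):
    shard, handle = reduce_scatter_raw(full_grad, process_group, async_op=True)
    handler.reduce_scatter_handlers[param._fstp_reduce_scatter_str] = (handle, shard)
    # placeholder with the sharded shape keeps autograd's bookkeeping consistent;
    # zeros (not empty) so a later `param.grad += shard` accumulates from a clean base
    ws = dist.get_world_size(process_group)
    return torch.zeros(full_grad.shape[0] // ws, *full_grad.shape[1:],
                       dtype=full_grad.dtype, device=full_grad.device)
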
grad_bias_async, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) + assert hasattr(bias, "_fstp_reduce_scatter_str") + all_gather_handler.reduce_scatter_handlers[bias._fstp_reduce_scatter_str] = (handle_grad_bias, grad_bias_async) + grad_bias = torch.empty(grad_bias.shape[0]//torch.distributed.get_world_size(process_group), *grad_bias.shape[1:], dtype=grad_bias.dtype, device=grad_bias.device) else: grad_weight = None grad_bias = grad_output if ctx.needs_input_grad[2] else None @@ -384,11 +390,11 @@ def backward(ctx, grad_output, *args): else: grad_input = None - if ctx.needs_input_grad[1]: - if world_size > 1: - handle_grad_weight.wait() - if grad_bias is not None: - handle_grad_bias.wait() + # if ctx.needs_input_grad[1]: + # if world_size > 1: + # handle_grad_weight.wait() + # if grad_bias is not None: + # handle_grad_bias.wait() return grad_input, grad_weight, grad_bias, None, None, None, None diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index 97004eb9..c6e9aaba 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -65,6 +65,8 @@ def __init__( hysteresis = grad_scal_cfg.hysteresis max_scale = grad_scal_cfg.max_scale + self._fstp_handler = gpc.config.fstp_handler + # Zero related args reduce_bucket_size = zero_cfg.reduce_bucket_size clip_grad_norm = zero_cfg.clip_grad_norm @@ -301,8 +303,7 @@ def _define_and_attach(param, reduce_rank=None): # NOT IMPORTANT BUT GOOD TO KNOW: # args here is not grad, but allow_unreacable and accumulate_grad def reduce_grad_hook(*args): # pylint: disable=W0613 - if self.skip_grad_reduce is False: - reduction_func() + reduction_func() accum_grad_obj.register_hook(reduce_grad_hook) @@ -322,6 +323,20 @@ def belongs_to_current_rank(self, param) -> bool: group_id = getattr(param, "group_id") return tensor_rank == gpc.get_local_rank(self._broadcast_parallel_mode[group_id]) + def reset_reduce_bucket(self) -> None: + for bucket in self._bucket_store: + for rank, params in bucket._params.items(): + for _param in params: + if not hasattr(_param, "_fstp_reduce_scatter_str"): + continue + + key = getattr(_param, "_fstp_reduce_scatter_str") + comm_handle, _grad = self._fstp_handler.reduce_scatter_handlers[key] + comm_handle.wait() + _param.grad = _grad + + bucket.reset_by_rank(rank) + def _store_and_try_reduce_grads_by_bucket(self, param, reduce_rank=None): param_size = param.numel() @@ -332,11 +347,26 @@ def _store_and_try_reduce_grads_by_bucket(self, param, reduce_rank=None): current_bucket = self._bucket_store[group_id] if current_bucket.num_elements_in_bucket(reduce_rank) + param_size > self._reduce_bucket_size: - self._reduce_grads_stored_in_bucket(current_bucket, reduce_rank, last_bucket=False) + # wait reduce scatter communication + params = current_bucket.get_param(reduce_rank) + for _param in params: + if not hasattr(_param, "_fstp_reduce_scatter_str"): + continue + + key = getattr(_param, "_fstp_reduce_scatter_str") + comm_handle, _grad = self._fstp_handler.reduce_scatter_handlers[key] + comm_handle.wait() + _param.grad = _grad + + # reduce grad + if self.skip_grad_reduce is False: + self._reduce_grads_stored_in_bucket(current_bucket, reduce_rank, last_bucket=False) + else: + current_bucket.reset_by_rank(reduce_rank) # the param must not be reduced to ensure correctness is_param_reduced = self._param_store.is_param_reduced(param) - if is_param_reduced: + if is_param_reduced and self.skip_grad_reduce is 
False: msg = ( f"Parameter of size ({param.size()}) has already been reduced, " + "duplicate reduction will lead to arithmetic incorrectness" From 4e99a7fdbc88e398255d63a9b22854b5ded5deb3 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Tue, 17 Oct 2023 11:30:44 +0800 Subject: [PATCH 026/153] feat(train/training_internlm.py): remove abnormal tgs when calculating avg tgs --- internlm/train/training_internlm.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 24040a02..cc310a21 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -576,4 +576,8 @@ def record_current_batch_training_metrics( tgs_list.append(tgs_origin) if batch_count == gpc.config.data.total_steps - 1: print(tgs_list, flush=True) + avg_tgs = sum(tgs_list) / len(tgs_list) + for tgs in tgs_list.copy(): + if abs(tgs - avg_tgs) > 1000: + tgs_list.remove(tgs) print(f"avg_tgs: {sum(tgs_list)/len(tgs_list)}", flush=True) From 6682f5d92a02111777f5c1fbc8c0765c9770ffa2 Mon Sep 17 00:00:00 2001 From: "chenxun.p" Date: Tue, 17 Oct 2023 15:10:07 +0800 Subject: [PATCH 027/153] fix reduce scatter async bug --- internlm/model/utils.py | 4 ++-- internlm/solver/optimizer/hybrid_zero_optim.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 78ad456d..0194e84a 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -371,12 +371,12 @@ def backward(ctx, grad_output, *args): grad_weight_async, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) assert hasattr(weight, "_fstp_reduce_scatter_str") all_gather_handler.reduce_scatter_handlers[weight._fstp_reduce_scatter_str] = (handle_grad_weight, grad_weight_async) - grad_weight = torch.empty(grad_weight.shape[0]//torch.distributed.get_world_size(process_group), *grad_weight.shape[1:], dtype=grad_weight.dtype, device=grad_weight.device) + grad_weight = torch.zeros(grad_weight.shape[0]//torch.distributed.get_world_size(process_group), *grad_weight.shape[1:], dtype=grad_weight.dtype, device=grad_weight.device) if grad_bias is not None: grad_bias_async, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) assert hasattr(bias, "_fstp_reduce_scatter_str") all_gather_handler.reduce_scatter_handlers[bias._fstp_reduce_scatter_str] = (handle_grad_bias, grad_bias_async) - grad_bias = torch.empty(grad_bias.shape[0]//torch.distributed.get_world_size(process_group), *grad_bias.shape[1:], dtype=grad_bias.dtype, device=grad_bias.device) + grad_bias = torch.zeros(grad_bias.shape[0]//torch.distributed.get_world_size(process_group), *grad_bias.shape[1:], dtype=grad_bias.dtype, device=grad_bias.device) else: grad_weight = None grad_bias = grad_output if ctx.needs_input_grad[2] else None diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index c6e9aaba..950d35e8 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -333,7 +333,7 @@ def reset_reduce_bucket(self) -> None: key = getattr(_param, "_fstp_reduce_scatter_str") comm_handle, _grad = self._fstp_handler.reduce_scatter_handlers[key] comm_handle.wait() - _param.grad = _grad + _param.grad += _grad bucket.reset_by_rank(rank) @@ -356,7 +356,7 @@ def _store_and_try_reduce_grads_by_bucket(self, param, reduce_rank=None): key = getattr(_param, 
"_fstp_reduce_scatter_str") comm_handle, _grad = self._fstp_handler.reduce_scatter_handlers[key] comm_handle.wait() - _param.grad = _grad + _param.grad += _grad # reduce grad if self.skip_grad_reduce is False: From 6408b944c2e6510253f3b5ca7e3680ed56a6b528 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Tue, 17 Oct 2023 15:14:39 +0800 Subject: [PATCH 028/153] support fine grained --- internlm/model/linear.py | 77 +++++++++++++++++++++++++--------------- 1 file changed, 49 insertions(+), 28 deletions(-) diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 8e19ab69..e8727ac5 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -449,7 +449,8 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non self.index_to_fsdp_modules = dict() # key: transformer block index; value: fsdp modules self.module_name_index = dict() # key: FSTP module; value: the name in index in self.module_name self.block_module = dict() # key: transformer block index; value: {name_index: FSTP module} - + self.head = [] + # just want to share same for loop for ModuleList and Module if not isinstance(model, nn.ModuleList): model = [model] @@ -487,16 +488,18 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non index = index + 1 else: continue + elif isinstance(children, ScaleColumnParallelLinear): + self.head.append(children) def _all_gather_block_weight(self, block_index: int): block = self.index_to_block[block_index] fsdp_modules = self.index_to_fsdp_modules[block_index] - self.block_handles[block] = [] + # self.block_handles[block] = [] for module in fsdp_modules: total_weight, weight_handle = all_gather_raw(module.weight, self.process_group, async_op=True) self.FSTP_global_weights[module] = total_weight self.FSTP_global_handle[module] = weight_handle - self.block_handles[block].append(weight_handle) + # self.block_handles[block].append(weight_handle) def _register_sync_parameters_hook(self) -> None: """ @@ -558,6 +561,7 @@ def _pre_backward_hook_for_wqkv(module: nn.Module, grad_output): self._all_gather_block_weight(block_index - 1) def _pre_backward_hook_for_block(block: nn.Module, grad_output): + # import pdb; pdb.set_trace() block_index = self.block_to_index[block] # if block_index == gpc.config.NUM_LAYER - 1: # # all gather weight for the last block @@ -571,10 +575,14 @@ def _pre_backward_hook_for_block(block: nn.Module, grad_output): # handles = self.block_handles[block] # for handle in handles: # handle.wait() - + # if block_index == gpc.config.NUM_LAYER - 1: + # self._all_gather_block_weight(block_index) # start the all-gather for next block if block_index - 1 > 0: self._all_gather_block_weight(block_index - 1) + + # def _post_backward_hook_for_head(module: nn.Module, grad_input, grad_output): + # self._all_gather_block_weight(gpc.config.NUM_LAYER - 1) def _post_backward_hook_for_block(block: nn.Module, grad_input, grad_output): block_index = self.block_to_index[block] @@ -588,45 +596,58 @@ def _pre_backward_hook_for_module(module: nn.Module, grad_output): block_index = self.module_to_index[module] name_index = self.module_name_index[module] if block_index != 0: - # if name_index == 4: - # total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) - # weight_handler.wait() - # self.FSTP_global_weights[module] = total_weight - - # # start the all-gather for next module - # next_module = self.block_module[block_index][name_index - 1] - # 
self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( - # next_module.weight, self.process_group, async_op=True - # ) - # self.FSTP_global_handle[next_module] = weights_handler - # else: - # handler = self.FSTP_global_handle[module] - # handler.wait() - # if name_index != 0: - # next_module = self.block_module[block_index][name_index - 1] - # self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( - # next_module.weight, self.process_group, async_op=True - # ) - # self.FSTP_global_handle[next_module] = weights_handler - if module in self.FSTP_global_handle: + if name_index == 4 and block_index == gpc.config.NUM_LAYER - 1: + total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) + weight_handler.wait() + self.FSTP_global_weights[module] = total_weight + + # start the all-gather for next module + next_module = self.block_module[block_index][name_index - 1] + self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( + next_module.weight, self.process_group, async_op=True + ) + self.FSTP_global_handle[next_module] = weights_handler + elif name_index == 0: + handler = self.FSTP_global_handle[module] + handler.wait() + + if block_index - 1 > 0: + next_module = self.block_module[block_index - 1][4] + self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( + next_module.weight, self.process_group, async_op=True + ) + self.FSTP_global_handle[next_module] = weights_handler + else: handler = self.FSTP_global_handle[module] handler.wait() + if name_index != 0: + next_module = self.block_module[block_index][name_index - 1] + self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( + next_module.weight, self.process_group, async_op=True + ) + self.FSTP_global_handle[next_module] = weights_handler + # if module in self.FSTP_global_handle: + # handler = self.FSTP_global_handle[module] + # handler.wait() def _post_backward_hook_for_module(module, grad_input, grad_output): if module in self.FSTP_global_weights: del self.FSTP_global_weights[module] if module in self.FSTP_global_handle: del self.FSTP_global_handle[module] + + # for head in self.head: + # head.register_full_backward_hook(_post_backward_hook_for_head) - for block in self.FSTP_blocks: + # for block in self.FSTP_blocks: # block.register_forward_pre_hook(_pre_forward_hook_for_block) # block.register_forward_hook(_post_forward_hook_for_block) - block.register_full_backward_pre_hook(_pre_backward_hook_for_block) + # block.register_full_backward_pre_hook(_pre_backward_hook_for_block) # block.register_full_backward_hook(_post_backward_hook_for_block) for out_proj in self.FSTP_outs: out_proj.register_forward_pre_hook(_pre_forward_hook_for_out_proj) - + # for wqkv in self.FSTP_wqkvs: # wqkv.register_full_backward_pre_hook(_pre_backward_hook_for_wqkv) From 5c38cb64095513c3740e9618c41e143608169ab5 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Tue, 17 Oct 2023 15:38:24 +0800 Subject: [PATCH 029/153] add head overlap --- internlm/model/linear.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 16b0c85f..71bdf057 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -596,8 +596,11 @@ def _pre_backward_hook_for_block(block: nn.Module, grad_output): if block_index - 1 > 0: self._all_gather_block_weight(block_index - 1) - # def _post_backward_hook_for_head(module: nn.Module, grad_input, grad_output): - # 
self._all_gather_block_weight(gpc.config.NUM_LAYER - 1) + def _post_backward_hook_for_head(module: nn.Module, grad_input, grad_output): + first_module = self.block_module[gpc.config.NUM_LAYER - 1][4] + total_weight, weight_handler = all_gather_raw(first_module.weight, self.process_group, async_op=True) + self.FSTP_global_handle[first_module] = weight_handler + self.FSTP_global_weights[first_module] = total_weight def _post_backward_hook_for_block(block: nn.Module, grad_input, grad_output): block_index = self.block_to_index[block] @@ -612,9 +615,10 @@ def _pre_backward_hook_for_module(module: nn.Module, grad_output): name_index = self.module_name_index[module] if block_index != 0: if name_index == 4 and block_index == gpc.config.NUM_LAYER - 1: - total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) + # total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) + weight_handler = self.FSTP_global_handle[module] weight_handler.wait() - self.FSTP_global_weights[module] = total_weight + # self.FSTP_global_weights[module] = total_weight # start the all-gather for next module next_module = self.block_module[block_index][name_index - 1] @@ -651,8 +655,8 @@ def _post_backward_hook_for_module(module, grad_input, grad_output): if module in self.FSTP_global_handle: del self.FSTP_global_handle[module] - # for head in self.head: - # head.register_full_backward_hook(_post_backward_hook_for_head) + for head in self.head: + head.register_full_backward_hook(_post_backward_hook_for_head) # for block in self.FSTP_blocks: # block.register_forward_pre_hook(_pre_forward_hook_for_block) From 5abe519c4c9806ecce76b29dcd88f738c1014d67 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Tue, 17 Oct 2023 16:37:06 +0800 Subject: [PATCH 030/153] remove full weight for block 0 --- internlm/model/linear.py | 152 +++++++++++-------------- internlm/model/multi_head_attention.py | 50 +++----- 2 files changed, 85 insertions(+), 117 deletions(-) diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 71bdf057..cc9524a1 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -12,6 +12,7 @@ from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc from internlm.core.naive_amp import NaiveAMPModel +from internlm.model.embedding import Embedding1D from internlm.model.utils import ( Silu, all_gather_raw, @@ -255,56 +256,33 @@ def __init__( hidden_features = multiple_of * ((hidden_features + multiple_of - 1) // multiple_of) - if block_idx == 0 and gpc.config.parallel.block_0_full_weight: - self.w1 = nn.Linear( - in_features, - hidden_features, - bias, - device=device, - dtype=dtype, - ) - self.w2 = nn.Linear( - in_features, - hidden_features, - bias, - device=device, - dtype=dtype, - ) - self.w3 = nn.Linear( - hidden_features, - out_features, - bias=bias, - device=device, - dtype=dtype, - ) - else: - self.w1 = FSTPLinear( - in_features, - hidden_features, - process_group, - bias, - sequence_parallel=gpc.config.parallel.sequence_parallel, - device=device, - dtype=dtype, - ) - self.w2 = FSTPLinear( - in_features, - hidden_features, - process_group, - bias, - sequence_parallel=gpc.config.parallel.sequence_parallel, - device=device, - dtype=dtype, - ) - self.w3 = FSTPLinear( - hidden_features, - out_features, - process_group, - bias=bias, - sequence_parallel=gpc.config.parallel.sequence_parallel, - device=device, - dtype=dtype, - ) + self.w1 = FSTPLinear( + 
in_features, + hidden_features, + process_group, + bias, + sequence_parallel=gpc.config.parallel.sequence_parallel, + device=device, + dtype=dtype, + ) + self.w2 = FSTPLinear( + in_features, + hidden_features, + process_group, + bias, + sequence_parallel=gpc.config.parallel.sequence_parallel, + device=device, + dtype=dtype, + ) + self.w3 = FSTPLinear( + hidden_features, + out_features, + process_group, + bias=bias, + sequence_parallel=gpc.config.parallel.sequence_parallel, + device=device, + dtype=dtype, + ) def forward(self, x): w1_o = self.w1(x) @@ -458,6 +436,7 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non self.module_name_index = dict() # key: FSTP module; value: the name in index in self.module_name self.block_module = dict() # key: transformer block index; value: {name_index: FSTP module} self.head = [] + self.embedding = [] self.reduce_scatter_handlers = {} @@ -505,6 +484,8 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non continue elif isinstance(children, ScaleColumnParallelLinear): self.head.append(children) + elif isinstance(children, Embedding1D): + self.embedding.append(children) def _all_gather_block_weight(self, block_index: int): block = self.index_to_block[block_index] @@ -532,7 +513,6 @@ def _pre_forward_hook_for_out_proj(module: nn.Module, inputs: Any): # start the all-gather for next block if block_index + 1 < gpc.config.NUM_LAYER: self._all_gather_block_weight(block_index + 1) - # print(f"_all_gather_block_weight for block {block_index+1}", flush=True) def _pre_forward_hook_for_block(block: nn.Module, inputs: Any): block_index = self.block_to_index[block] @@ -548,6 +528,10 @@ def _pre_forward_hook_for_block(block: nn.Module, inputs: Any): handles = self.block_handles[block] for handle in handles: handle.wait() + + def _pre_forward_hook_for_embedding(module: nn.Module, inputs: Any, output): + self._all_gather_block_weight(0) + def _post_forward_hook_for_block(block: nn.Module, input, output): block_index = self.block_to_index[block] @@ -557,11 +541,10 @@ def _post_forward_hook_for_block(block: nn.Module, input, output): for module in fsdp_modules: del self.FSTP_global_weights[module] - def _pre_forward_hook_for_module(module: nn.Module, inputs: Any): + def _pre_forward_hook_for_module(module: nn.Module, inputs: Any,): block_index = self.module_to_index[module] - if block_index != 0: - handler = self.FSTP_global_handle[module] - handler.wait() + handler = self.FSTP_global_handle[module] + handler.wait() def _post_forward_hook_for_module(module: nn.Module, input, output): if module in self.FSTP_global_weights: @@ -593,7 +576,7 @@ def _pre_backward_hook_for_block(block: nn.Module, grad_output): # if block_index == gpc.config.NUM_LAYER - 1: # self._all_gather_block_weight(block_index) # start the all-gather for next block - if block_index - 1 > 0: + if block_index - 1 >= 0: self._all_gather_block_weight(block_index - 1) def _post_backward_hook_for_head(module: nn.Module, grad_input, grad_output): @@ -613,38 +596,38 @@ def _post_backward_hook_for_block(block: nn.Module, grad_input, grad_output): def _pre_backward_hook_for_module(module: nn.Module, grad_output): block_index = self.module_to_index[module] name_index = self.module_name_index[module] - if block_index != 0: - if name_index == 4 and block_index == gpc.config.NUM_LAYER - 1: - # total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) - weight_handler = self.FSTP_global_handle[module] - 
weight_handler.wait() - # self.FSTP_global_weights[module] = total_weight - - # start the all-gather for next module + + if name_index == 4 and block_index == gpc.config.NUM_LAYER - 1: + # total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) + weight_handler = self.FSTP_global_handle[module] + weight_handler.wait() + # self.FSTP_global_weights[module] = total_weight + + # start the all-gather for next module + next_module = self.block_module[block_index][name_index - 1] + self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( + next_module.weight, self.process_group, async_op=True + ) + self.FSTP_global_handle[next_module] = weights_handler + elif name_index == 0: + handler = self.FSTP_global_handle[module] + handler.wait() + + if block_index - 1 >= 0: + next_module = self.block_module[block_index - 1][4] + self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( + next_module.weight, self.process_group, async_op=True + ) + self.FSTP_global_handle[next_module] = weights_handler + else: + handler = self.FSTP_global_handle[module] + handler.wait() + if name_index != 0: next_module = self.block_module[block_index][name_index - 1] self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( next_module.weight, self.process_group, async_op=True ) self.FSTP_global_handle[next_module] = weights_handler - elif name_index == 0: - handler = self.FSTP_global_handle[module] - handler.wait() - - if block_index - 1 > 0: - next_module = self.block_module[block_index - 1][4] - self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( - next_module.weight, self.process_group, async_op=True - ) - self.FSTP_global_handle[next_module] = weights_handler - else: - handler = self.FSTP_global_handle[module] - handler.wait() - if name_index != 0: - next_module = self.block_module[block_index][name_index - 1] - self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( - next_module.weight, self.process_group, async_op=True - ) - self.FSTP_global_handle[next_module] = weights_handler # if module in self.FSTP_global_handle: # handler = self.FSTP_global_handle[module] # handler.wait() @@ -655,6 +638,9 @@ def _post_backward_hook_for_module(module, grad_input, grad_output): if module in self.FSTP_global_handle: del self.FSTP_global_handle[module] + for embedding in self.embedding: + embedding.register_forward_hook(_pre_forward_hook_for_embedding) + for head in self.head: head.register_full_backward_hook(_post_backward_hook_for_head) diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index 6c1e7d89..7a0f4ed7 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -205,23 +205,14 @@ def __init__( # notice here should change bias=True Wqkv_cls = ColumnParallelLinearTorch if tp_mode == "origin_tp" else FSTPLinear - if block_idx == 0 and tp_mode != "origin_tp" and gpc.config.parallel.block_0_full_weight: - Wqkv_cls = nn.Linear - self.Wqkv = Wqkv_cls( - embed_dim, - 3 * embed_dim, - bias=False, - **factory_kwargs, - ) - else: - self.Wqkv = Wqkv_cls( - embed_dim, - 3 * embed_dim, - process_group, - bias=False, - sequence_parallel=gpc.config.parallel.sequence_parallel, - **factory_kwargs, - ) # according to https://spaces.ac.cn/archives/9577 + self.Wqkv = Wqkv_cls( + embed_dim, + 3 * embed_dim, + process_group, + bias=False, + sequence_parallel=gpc.config.parallel.sequence_parallel, + **factory_kwargs, + ) # according to 
https://spaces.ac.cn/archives/9577 inner_attn_cls = FlashSelfAttention if use_flash_attn else SelfAttention inner_cross_attn_cls = FlashCrossAttention if use_flash_attn else CrossAttention @@ -235,23 +226,14 @@ def __init__( # output projection always have the bias (for now) out_proj_cls = RowParallelLinearTorch if tp_mode == "origin_tp" else FSTPLinear - if block_idx == 0 and tp_mode != "origin_tp" and gpc.config.parallel.block_0_full_weight: - out_proj_cls = nn.Linear - self.out_proj = out_proj_cls( - embed_dim, - embed_dim, - bias=False, - **factory_kwargs, - ) - else: - self.out_proj = out_proj_cls( - embed_dim, - embed_dim, - process_group, - bias=False, - sequence_parallel=gpc.config.parallel.sequence_parallel, - **factory_kwargs, - ) + self.out_proj = out_proj_cls( + embed_dim, + embed_dim, + process_group, + bias=False, + sequence_parallel=gpc.config.parallel.sequence_parallel, + **factory_kwargs, + ) # need to assign tp attribute so that internlm know it is tensor parallel module if gpc.get_world_size(ParallelMode.TENSOR) > 1: for name in ["out_proj", "Wqkv"]: From 16ef7b788915bcf222ae96fdc556f168c3c9c6b7 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Tue, 17 Oct 2023 17:16:39 +0800 Subject: [PATCH 031/153] add test --- configs/13B_sft.py | 180 ++++++++++++++++++ configs/20B_sft.py | 180 ++++++++++++++++++ configs/30B_sft.py | 180 ++++++++++++++++++ configs/7B_sft.py | 7 +- .../solver/optimizer/hybrid_zero_optim.py | 5 +- internlm/train/training_internlm.py | 2 +- 6 files changed, 547 insertions(+), 7 deletions(-) create mode 100644 configs/13B_sft.py create mode 100644 configs/20B_sft.py create mode 100644 configs/30B_sft.py diff --git a/configs/13B_sft.py b/configs/13B_sft.py new file mode 100644 index 00000000..e3e17ae0 --- /dev/null +++ b/configs/13B_sft.py @@ -0,0 +1,180 @@ +JOB_NAME = "13b_train" +DO_ALERT = False + +SEQ_LEN = 4096 +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. 
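
# NOTE (editorial sketch): the async reduce-scatter bookkeeping introduced in
# the "impl reduce scatter async" / "fix reduce scatter async bug" patches
# above boils down to: launch a non-blocking reduce-scatter of each full
# weight gradient, stash (handle, shard) in a dict keyed by a per-parameter
# string, and let the optimizer wait and accumulate later
# (handle.wait(); param.grad += shard). A minimal, hypothetical illustration;
# the function and argument names are assumptions, not the repository API.
import torch
import torch.distributed as dist

def async_reduce_scatter(full_grad: torch.Tensor, group, handlers: dict, key: str):
    # Each rank ends up owning a 1/world_size shard of the gradient along dim 0.
    world_size = dist.get_world_size(group)
    assert full_grad.shape[0] % world_size == 0
    shard = torch.empty(
        full_grad.shape[0] // world_size, *full_grad.shape[1:],
        dtype=full_grad.dtype, device=full_grad.device,
    )
    handle = dist.reduce_scatter_tensor(
        shard, full_grad.contiguous(), group=group, async_op=True
    )
    # Stash for a later wait, mirroring the reduce_scatter_handlers convention.
    handlers[key] = (handle, shard)
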
+ # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. 
+ For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, mode="origin_tp"), + pipeline=dict(size=1, interleaved_overlap=True), + sequence_parallel=True, +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/20B_sft.py b/configs/20B_sft.py new file mode 100644 index 00000000..1d093efc --- /dev/null +++ b/configs/20B_sft.py @@ -0,0 +1,180 @@ +JOB_NAME = "13b_train" +DO_ALERT = False + +SEQ_LEN = 4096 +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. 
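
# NOTE (editorial sketch): the zero1 sizing rules documented in these config
# docstrings can be made concrete. A minimal helper under the stated rules;
# `resolve_zero1_size` and `dp_world_size` are illustrative names, not
# repository APIs.
def resolve_zero1_size(size: int, dp_world_size: int) -> int:
    if size <= 0:
        return dp_world_size  # shard optimizer states across the whole dp group
    if size == 1:
        return 1              # ZeRO disabled; every dp rank keeps full states
    # size > 1: the zero world size must be a subset of the dp world size,
    # e.g. a <= 8 setting to shard within a node for smaller models.
    assert size <= dp_world_size and dp_world_size % size == 0
    return size

# Example: zero1=dict(size=-1) with a dp world size of 8 resolves to 8.
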
+ oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=4, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. 
size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, mode="fstp"), + pipeline=dict(size=1, interleaved_overlap=True), + sequence_parallel=True, +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_sft.py b/configs/30B_sft.py new file mode 100644 index 00000000..5ac67451 --- /dev/null +++ b/configs/30B_sft.py @@ -0,0 +1,180 @@ +JOB_NAME = "13b_train" +DO_ALERT = False + +SEQ_LEN = 4096 +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=4, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, mode="fstp"), + pipeline=dict(size=1, interleaved_overlap=True), + sequence_parallel=True, +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 36f9ac14..106548a2 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -2,10 +2,10 @@ DO_ALERT = False SEQ_LEN = 4096 -HIDDEN_SIZE = 8192 +HIDDEN_SIZE = 4096 NUM_ATTENTION_HEAD = 32 MLP_RATIO = 8 / 3 -NUM_LAYER = 4 +NUM_LAYER = 32 VOCAB_SIZE = 103168 MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" @@ -51,7 +51,7 @@ # micro_num means the number of micro_batch contained in one gradient update micro_num=4, # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, + micro_bsz=4, # defaults to the value of micro_num valid_micro_num=4, # defaults to 0, means disable evaluate @@ -165,7 +165,6 @@ tensor=dict(size=8, mode="fstp"), pipeline=dict(size=1, interleaved_overlap=True), sequence_parallel=True, - block_0_full_weight=True, ) cudnn_deterministic = False diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index 950d35e8..c7c10071 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -64,8 +64,9 @@ def __init__( backoff_factor = grad_scal_cfg.backoff_factor hysteresis = grad_scal_cfg.hysteresis max_scale = grad_scal_cfg.max_scale - - self._fstp_handler = gpc.config.fstp_handler + + if gpc.config.parallel["tensor"]["mode"] == "fstp": + self._fstp_handler = gpc.config.fstp_handler # Zero related args reduce_bucket_size = zero_cfg.reduce_bucket_size diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index cc310a21..93903a38 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -578,6 +578,6 @@ def record_current_batch_training_metrics( print(tgs_list, flush=True) avg_tgs = sum(tgs_list) / len(tgs_list) for tgs in tgs_list.copy(): - if abs(tgs - avg_tgs) > 1000: + if abs(tgs - avg_tgs) > 400: tgs_list.remove(tgs) print(f"avg_tgs: {sum(tgs_list)/len(tgs_list)}", flush=True) From a5aeab2a3f06c7b07e302f911c2bd6ae2a69362e Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Tue, 17 Oct 2023 19:54:21 +0800 Subject: [PATCH 032/153] memory profiling test --- configs/20B_sft.py | 4 +-- internlm/model/linear.py | 12 +------- .../solver/optimizer/hybrid_zero_optim.py | 29 +++++++++++++++---- train.py | 2 ++ 4 files changed, 28 insertions(+), 19 deletions(-) diff --git a/configs/20B_sft.py b/configs/20B_sft.py index 1d093efc..bc63d346 100644 --- a/configs/20B_sft.py +++ b/configs/20B_sft.py @@ -1,4 +1,4 @@ -JOB_NAME = "13b_train" +JOB_NAME = "20b_train" DO_ALERT = False SEQ_LEN = 4096 @@ -51,7 +51,7 @@ # micro_num means the number of micro_batch contained in one gradient update micro_num=4, # packed_length = micro_bsz * SEQ_LEN - micro_bsz=4, + micro_bsz=2, # defaults to the value of micro_num valid_micro_num=4, # defaults to 0, means disable evaluate diff --git 
a/internlm/model/linear.py b/internlm/model/linear.py index cc9524a1..0ea6ee30 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -423,7 +423,6 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non self.process_group = process_group self.FSTP_blocks = [] self.FSTP_outs = [] - self.FSTP_wqkvs = [] self.FSTP_modules = [] self.module_name = ["Wqkv", "out_proj", "w1", "w2", "w3"] self.FSTP_global_handle = dict() # key: FSTP module; value: module global all-gather op handle @@ -465,9 +464,6 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non if name == "out_proj": self.FSTP_outs.append(child) self.module_to_index[child] = idx - if name == "Wqkv": - self.FSTP_wqkvs.append(child) - self.module_to_index[child] = idx if isinstance(child, FSTPLinear): self.module_to_index[child] = idx self.block_module[idx][index] = child @@ -488,7 +484,7 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non self.embedding.append(children) def _all_gather_block_weight(self, block_index: int): - block = self.index_to_block[block_index] + #block = self.index_to_block[block_index] fsdp_modules = self.index_to_fsdp_modules[block_index] # self.block_handles[block] = [] for module in fsdp_modules: @@ -552,12 +548,6 @@ def _post_forward_hook_for_module(module: nn.Module, input, output): if module in self.FSTP_global_handle: del self.FSTP_global_handle[module] - def _pre_backward_hook_for_wqkv(module: nn.Module, grad_output): - block_index = self.module_to_index[module] - # start the all-gather for next block - if block_index - 1 >= 0: - self._all_gather_block_weight(block_index - 1) - def _pre_backward_hook_for_block(block: nn.Module, grad_output): # import pdb; pdb.set_trace() block_index = self.block_to_index[block] diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index c7c10071..d2268274 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -39,6 +39,14 @@ inf = math.inf logger = get_logger(__file__) +def print_memory(msg): + + if gpc.get_global_rank() == 0: + print(msg, flush=True) + print("memory allocated: ", torch.cuda.memory_allocated() / 1024 / 1024 / 1024, flush=True) + print("max memory allocated: ", torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024, flush=True) + print("===========================================") + class HybridZeroOptimizer(BaseOptimizer): """ @@ -335,6 +343,7 @@ def reset_reduce_bucket(self) -> None: comm_handle, _grad = self._fstp_handler.reduce_scatter_handlers[key] comm_handle.wait() _param.grad += _grad + self._fstp_handler.reduce_scatter_handlers[key] = None bucket.reset_by_rank(rank) @@ -358,6 +367,7 @@ def _store_and_try_reduce_grads_by_bucket(self, param, reduce_rank=None): comm_handle, _grad = self._fstp_handler.reduce_scatter_handlers[key] comm_handle.wait() _param.grad += _grad + self._fstp_handler.reduce_scatter_handlers[key] = None # reduce grad if self.skip_grad_reduce is False: @@ -565,6 +575,7 @@ def step(self, closure=None): # if not overlapping communication (no reduction hook is attached) # we need to manually reduce these gradients + print_memory("No 1") if not self._overlap_sync_grad: for group_id in range(len(self._fp16_param_groups)): for param in self._fp16_param_groups[group_id]: @@ -589,7 +600,7 @@ def step(self, closure=None): bucket.empty() self._bucket_in_progress = [] self._param_store.clear_grads_of_previous_reduced_params() 
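
# NOTE (editorial sketch): print_memory above dumps allocator counters in GiB
# at numbered checkpoints ("No 1", "No 2", ...). A variant that also reports
# the delta between consecutive checkpoints can make the per-phase cost easier
# to read; this is an assumed helper for illustration, not part of the patch.
def print_memory_delta(msg, _last=[0.0]):  # mutable default keeps last reading
    if gpc.get_global_rank() == 0:
        cur = torch.cuda.memory_allocated() / 1024**3
        peak = torch.cuda.max_memory_allocated() / 1024**3
        print(
            f"{msg}: allocated={cur:.2f} GiB (delta={cur - _last[0]:+.2f}), "
            f"peak={peak:.2f} GiB",
            flush=True,
        )
        _last[0] = cur
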
- + print_memory("No 2") # compute norm for gradients in the last bucket total_norms = {} for group_id in range(self.num_param_groups): @@ -611,10 +622,12 @@ def step(self, closure=None): scaled_norm_tensor = torch.tensor(scaled_norm, device=get_current_device(), dtype=torch.float) dist.all_reduce(scaled_norm_tensor, group=pg) total_norms[group_name] = scaled_norm_tensor.item() - + print_memory("No 3") timer("sync_grad").start() self._sync_grad() timer("sync_grad").stop() + + print_memory("No 4") return self._step(closure=closure, norms=total_norms) @@ -661,7 +674,7 @@ def _step(self, closure=None, norms=None): self._grad_store._averaged_gradients = dict() self.zero_grad() return False, norms - + print_memory("No 5") # copy the grad of fp16 param to fp32 param single_grad_partition_groups = [] for group_id in range(self.num_param_groups): @@ -702,7 +715,7 @@ def _step(self, closure=None, norms=None): single_grad_partition_groups.append(flat_fp32_avg_grads) device = self._fp32_flat_param_groups_of_current_rank[group_id].device self._fp32_flat_param_groups_of_current_rank[group_id].grad = flat_fp32_avg_grads.to(device) - + print_memory("No 6") # unscale and clip grads # get the global norm global_norm_groups = {} @@ -725,9 +738,12 @@ def _step(self, closure=None, norms=None): # For those ranks that are not assigned parameters, we just wait for other ranks # to send them updated their own parameters. if self.has_params: + print_memory("No 7") self.optim.step() + print_memory("No 8") # release the fp32 grad release_param_grad(self._fp32_flat_param_groups_of_current_rank.values()) + print_memory("No 9") # update fp16 partition updated by the current rank for group_id in range(len(self._fp16_param_groups)): if self.param_group_has_params[group_id]: @@ -736,17 +752,18 @@ def _step(self, closure=None, norms=None): ) fp32_param = self._fp32_flat_param_groups_of_current_rank[group_id] fp16_param.data.copy_(fp32_param) - + print_memory("No 10") torch.cuda.synchronize() with torch.cuda.stream(self._comm_bcast_stream): self.broadcast_params() - + timer("step").stop() # update gradients may not be needed here, because the sync_params function is used in initialization, # so synchronization is maintained for group_name, global_norm in global_norm_groups.items(): global_norm_groups[group_name] = global_norm / loss_scale + print_memory("No 11") return True, global_norm_groups def broadcast_params(self): diff --git a/train.py b/train.py index 139bac1f..0a84f592 100644 --- a/train.py +++ b/train.py @@ -296,6 +296,8 @@ def main(args): if batch_count % 2 == 0: prof.step() + + torch.cuda.reset_peak_memory_stats() ckpt_manager.wait_async_upload_finish() From 47422711545e9332708be2ad26b64996bb9c1447 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Thu, 19 Oct 2023 13:21:33 +0800 Subject: [PATCH 033/153] add memory pool --- configs/20B_sft.py | 4 +- configs/30B_sft.py | 4 +- internlm/model/linear.py | 83 +++++++++++++- internlm/model/modeling_internlm.py | 1 - internlm/model/utils.py | 98 +++++++++++----- .../solver/optimizer/hybrid_zero_optim.py | 108 ++++++++++++++---- internlm/solver/optimizer/store.py | 3 + internlm/train/training_internlm.py | 36 +++++- internlm/utils/gputest.py | 15 ++- train.py | 7 +- 10 files changed, 295 insertions(+), 64 deletions(-) diff --git a/configs/20B_sft.py b/configs/20B_sft.py index bc63d346..5a9021be 100644 --- a/configs/20B_sft.py +++ b/configs/20B_sft.py @@ -57,7 +57,7 @@ # defaults to 0, means disable evaluate valid_every=50, pack_sample_into_one=False, 
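
# NOTE (editorial sketch): with torch.cuda.reset_peak_memory_stats() called at
# the end of every step (as train.py does in the patch above),
# max_memory_allocated() reports a per-iteration peak rather than a global
# one. A self-contained helper showing the intended usage; the name is an
# assumption for illustration.
import torch

def log_step_peak(step: int):
    # Call once per training step, after optimizer.step().
    peak_gib = torch.cuda.max_memory_allocated() / 1024**3
    print(f"step {step} peak: {peak_gib:.2f} GiB", flush=True)
    torch.cuda.reset_peak_memory_stats()
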
- total_steps=20, + total_steps=50, skip_batches="", rampup_batch_size="", # Datasets with less than 50 rows will be discarded @@ -162,7 +162,7 @@ """ parallel = dict( zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, mode="fstp"), + tensor=dict(size=8, mode="fstp", overlap=True), pipeline=dict(size=1, interleaved_overlap=True), sequence_parallel=True, ) diff --git a/configs/30B_sft.py b/configs/30B_sft.py index 5ac67451..ec040480 100644 --- a/configs/30B_sft.py +++ b/configs/30B_sft.py @@ -1,4 +1,4 @@ -JOB_NAME = "13b_train" +JOB_NAME = "30b_train" DO_ALERT = False SEQ_LEN = 4096 @@ -162,7 +162,7 @@ """ parallel = dict( zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, mode="fstp"), + tensor=dict(size=8, mode="origin_tp", overlap=False), pipeline=dict(size=1, interleaved_overlap=True), sequence_parallel=True, ) diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 0ea6ee30..4f05cd32 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -16,6 +16,7 @@ from internlm.model.utils import ( Silu, all_gather_raw, + all_gather_raw_memory_pool, fstp_fused_dense_func, fused_dense_func_torch, ) @@ -219,8 +220,12 @@ def forward(self, x): class FSTPLinear(ColumnParallelLinear): def forward(self, x): + block_index = gpc.config.fstp_handler.module_to_index[self] + name_index = gpc.config.fstp_handler.module_name_index[self] + name = gpc.config.fstp_handler.module_name[name_index] return fstp_fused_dense_func( - x, self.weight, self.bias, process_group=self.process_group, module=self, handler=gpc.config.fstp_handler + x, self.weight, self.bias, process_group=self.process_group, + module=self, handler=gpc.config.fstp_handler, block_index=block_index, module_name=name ) @@ -308,6 +313,7 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non self.module_name_index = dict() # key: FSTP module; value: the name in index in self.module_name self.reduce_scatter_handlers = {} + self.all_reduce_handlers = {} # just want to share same for loop for ModuleList and Module if not isinstance(model, nn.ModuleList): @@ -438,6 +444,8 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non self.embedding = [] self.reduce_scatter_handlers = {} + self.all_reduce_handlers = {} + self.zero_const_pool = {} # just want to share same for loop for ModuleList and Module if not isinstance(model, nn.ModuleList): @@ -476,12 +484,23 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non setattr(child.weight, "_fstp_reduce_scatter_str", f"{_full_name}.weight") if child.bias is not None: setattr(child.bias, "_fstp_reduce_scatter_str", f"{_full_name}.bias") + # _full_name = f"{_chunk_name}.{idx}.{_sub_name}.{name}" + # setattr(child.weight, "_fstp_all_reduce_str", f"{_full_name}.weight") + # if child.bias is not None: + # setattr(child.bias, "_fstp_all_reduce_str", f"{_full_name}.bias") else: continue elif isinstance(children, ScaleColumnParallelLinear): self.head.append(children) elif isinstance(children, Embedding1D): self.embedding.append(children) + + def get_zero_by_shape(self, size:tuple, dtype, device) -> torch.Tensor: + if size not in self.zero_const_pool: + self.zero_const_pool[size] = torch.zeros(*size, dtype=dtype, device=device).contiguous() + + return self.zero_const_pool[size] + def _all_gather_block_weight(self, block_index: int): #block = self.index_to_block[block_index] @@ -492,6 +511,17 @@ def _all_gather_block_weight(self, block_index: int): self.FSTP_global_weights[module] = total_weight 
self.FSTP_global_handle[module] = weight_handle # self.block_handles[block].append(weight_handle) + + def _all_gather_block_weight_memory_pool(self, block_index: int): + fsdp_modules = self.index_to_fsdp_modules[block_index] + # self.block_handles[block] = [] + for module in fsdp_modules: + module_index = self.module_name_index[module] + name = self.module_name[module_index] + weight_handle = all_gather_raw_memory_pool(module.weight, self.process_group, async_op=True, block_index=block_index, module_name=name) + # self.FSTP_global_weights[module] = total_weight + self.FSTP_global_handle[module] = weight_handle + # self.block_handles[block].append(weight_handle) def _register_sync_parameters_hook(self) -> None: """ @@ -508,7 +538,8 @@ def _pre_forward_hook_for_out_proj(module: nn.Module, inputs: Any): block_index = self.module_to_index[module] # start the all-gather for next block if block_index + 1 < gpc.config.NUM_LAYER: - self._all_gather_block_weight(block_index + 1) + # self._all_gather_block_weight(block_index + 1) + self._all_gather_block_weight_memory_pool(block_index + 1) def _pre_forward_hook_for_block(block: nn.Module, inputs: Any): block_index = self.block_to_index[block] @@ -526,7 +557,8 @@ def _pre_forward_hook_for_block(block: nn.Module, inputs: Any): handle.wait() def _pre_forward_hook_for_embedding(module: nn.Module, inputs: Any, output): - self._all_gather_block_weight(0) + # self._all_gather_block_weight(0) + self._all_gather_block_weight_memory_pool(0) def _post_forward_hook_for_block(block: nn.Module, input, output): @@ -583,6 +615,48 @@ def _post_backward_hook_for_block(block: nn.Module, grad_input, grad_output): for module in fsdp_modules: del self.FSTP_global_weights[module] + def _pre_backward_hook_for_module_memory_pool(module: nn.Module, grad_output): + block_index = self.module_to_index[module] + name_index = self.module_name_index[module] + + if name_index == 4 and block_index == gpc.config.NUM_LAYER - 1: + # total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) + weight_handler = self.FSTP_global_handle[module] + weight_handler.wait() + # self.FSTP_global_weights[module] = total_weight + + # start the all-gather for next module + next_module = self.block_module[block_index][name_index - 1] + next_name = self.module_name[name_index - 1] + weights_handler = all_gather_raw_memory_pool( + next_module.weight, self.process_group, async_op=True, block_index=block_index, module_name=next_name + ) + self.FSTP_global_handle[next_module] = weights_handler + elif name_index == 0: + handler = self.FSTP_global_handle[module] + handler.wait() + + if block_index - 1 >= 0: + next_module = self.block_module[block_index - 1][4] + name = self.module_name[4] + weights_handler = all_gather_raw_memory_pool( + next_module.weight, self.process_group, async_op=True, block_index=block_index - 1, module_name=name, + ) + self.FSTP_global_handle[next_module] = weights_handler + else: + handler = self.FSTP_global_handle[module] + handler.wait() + if name_index != 0: + next_module = self.block_module[block_index][name_index - 1] + name = self.module_name[name_index - 1] + weights_handler = all_gather_raw_memory_pool( + next_module.weight, self.process_group, async_op=True, block_index=block_index, module_name=name + ) + self.FSTP_global_handle[next_module] = weights_handler + # if module in self.FSTP_global_handle: + # handler = self.FSTP_global_handle[module] + # handler.wait() + def _pre_backward_hook_for_module(module: nn.Module, grad_output): 
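            # (pre-pool variant) Backward mirrors forward in reverse: wait for this
            # module's gathered weight, then immediately launch the all-gather for
            # the module backward will need next (w3 -> w2 -> ... -> Wqkv, crossing
            # into the previous block after Wqkv).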
block_index = self.module_to_index[module] name_index = self.module_name_index[module] @@ -649,5 +723,6 @@ def _post_backward_hook_for_module(module, grad_input, grad_output): for module in self.FSTP_modules: module.register_forward_pre_hook(_pre_forward_hook_for_module) module.register_forward_hook(_post_forward_hook_for_module) - module.register_full_backward_pre_hook(_pre_backward_hook_for_module) + # module.register_full_backward_pre_hook(_pre_backward_hook_for_module) + module.register_full_backward_pre_hook(_pre_backward_hook_for_module_memory_pool) module.register_full_backward_hook(_post_backward_hook_for_module) diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index cb933960..b004dffa 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -125,7 +125,6 @@ def __init__( bias=False, device=device, dtype=dtype, - block_idx=block_idx, ) else: self.mlp = ParallelFusedMLP( diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 0194e84a..5b4018c8 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -6,7 +6,7 @@ import fused_dense_lib as fused_dense_cuda import torch import torch.nn.functional as F -from flash_attn.utils.distributed import all_reduce_raw, reduce_scatter_raw +from flash_attn.utils.distributed import all_reduce_raw #, reduce_scatter_raw from torch import Tensor from torch.cuda.amp import custom_bwd, custom_fwd from torch.distributed import ProcessGroup @@ -124,6 +124,12 @@ def all_gather_raw(input_: Tensor, process_group: ProcessGroup, async_op: bool = ) return output, handle +def all_gather_raw_memory_pool(input_: Tensor, process_group: ProcessGroup, async_op: bool = False, gather_dim: int = 0, block_index: int = None, module_name: str = None): + handle = torch.distributed.all_gather_into_tensor( + gpc.config.block_memory[block_index % 2][module_name], input_.contiguous(), group=process_group, async_op=async_op + ) + return handle + def linear_bias_wgrad_torch(my_input, grad_output, has_d_bias): assert my_input.dtype == grad_output.dtype @@ -132,6 +138,17 @@ def linear_bias_wgrad_torch(my_input, grad_output, has_d_bias): return grad_weight, grad_bias +def reduce_scatter_raw(input_: Tensor, process_group: ProcessGroup, async_op: bool = False): + world_size = torch.distributed.get_world_size(process_group) + assert input_.shape[0] % world_size == 0 + output = torch.empty(input_.shape[0] // world_size, *input_.shape[1:], + dtype=input_.dtype, device=input_.device).contiguous() + handle = torch.distributed.reduce_scatter_tensor(output, input_.contiguous(), + group=process_group, + async_op=async_op) + return output, handle + + # adpated from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/ops/fused_dense.py class FusedDenseFunc(torch.autograd.Function): "tp fused dense function" @@ -283,12 +300,14 @@ class FSTPFusedDenseFunc(torch.autograd.Function): @staticmethod @custom_fwd - def forward(ctx, x, weight, bias, return_residual=False, process_group=None, module=None, all_gather_handler=None): + def forward(ctx, x, weight, bias, return_residual=False, process_group=None, module=None, all_gather_handler=None, block_index=None, module_name=None): ctx.compute_weight_gradient = weight.requires_grad ctx.return_residual = return_residual ctx.process_group = process_group ctx.all_gather_handler = all_gather_handler ctx.module = module + ctx.block_index = block_index + ctx.module_name = module_name if torch.is_autocast_enabled(): x = 
x.to(dtype=torch.get_autocast_gpu_dtype()) @@ -297,8 +316,9 @@ def forward(ctx, x, weight, bias, return_residual=False, process_group=None, mod world_size = gpc.get_world_size(ParallelMode.TENSOR) if world_size > 1: # do all_gather for weight and bias before actual computation - if module in all_gather_handler.FSTP_global_weights: - total_weight = all_gather_handler.FSTP_global_weights[module] + if all_gather_handler is not None:# and module in all_gather_handler.FSTP_global_weights: + # total_weight = all_gather_handler.FSTP_global_weights[module] + total_weight = gpc.config.block_memory[block_index % 2][module_name] else: total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) handle_weight.wait() @@ -323,6 +343,8 @@ def forward(ctx, x, weight, bias, return_residual=False, process_group=None, mod if min(batch_dim, n, *total_weight.shape) > 65535 * 32: raise RuntimeError("fused_dense only supports matrix dims <= 2M") output = F.linear(total_x, total_weight, total_bias) + del total_weight + del total_bias if ctx.compute_weight_gradient: ctx.save_for_backward(x, weight, bias) else: @@ -339,6 +361,9 @@ def backward(ctx, grad_output, *args): process_group = ctx.process_group all_gather_handler = ctx.all_gather_handler module = ctx.module + block_index = ctx.block_index + module_name = ctx.module_name + if ctx.compute_weight_gradient: x, weight, bias = ctx.saved_tensors total_x = x @@ -351,12 +376,13 @@ def backward(ctx, grad_output, *args): world_size = gpc.get_world_size(ParallelMode.TENSOR) if world_size > 1: - # do all-gather for weight before backward - if module in all_gather_handler.FSTP_global_weights: - total_weight = all_gather_handler.FSTP_global_weights[module] - else: - total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) - handle_weight.wait() + total_weight = gpc.config.block_memory[block_index % 2][module_name] + # # do all-gather for weight before backward + # if module in all_gather_handler.FSTP_global_weights: + # total_weight = all_gather_handler.FSTP_global_weights[module] + # else: + # total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) + # handle_weight.wait() else: total_weight = weight @@ -368,15 +394,32 @@ def backward(ctx, grad_output, *args): total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2] ) if world_size > 1: - grad_weight_async, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) - assert hasattr(weight, "_fstp_reduce_scatter_str") - all_gather_handler.reduce_scatter_handlers[weight._fstp_reduce_scatter_str] = (handle_grad_weight, grad_weight_async) - grad_weight = torch.zeros(grad_weight.shape[0]//torch.distributed.get_world_size(process_group), *grad_weight.shape[1:], dtype=grad_weight.dtype, device=grad_weight.device) - if grad_bias is not None: - grad_bias_async, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) - assert hasattr(bias, "_fstp_reduce_scatter_str") - all_gather_handler.reduce_scatter_handlers[bias._fstp_reduce_scatter_str] = (handle_grad_bias, grad_bias_async) - grad_bias = torch.zeros(grad_bias.shape[0]//torch.distributed.get_world_size(process_group), *grad_bias.shape[1:], dtype=grad_bias.dtype, device=grad_bias.device) + if gpc.config.fstp_handler is not None: + # grad_weight_async, handle_grad_weight = all_reduce_raw(grad_weight, process_group, async_op=True) + # assert hasattr(weight, "_fstp_all_reduce_str") + # 
all_gather_handler.all_reduce_handlers[weight._fstp_all_reduce_str] = (handle_grad_weight, grad_weight_async) + # grad_weight = all_gather_handler.get_zero_by_shape((grad_weight.shape[0]//torch.distributed.get_world_size(process_group), *grad_weight.shape[1:]), dtype=grad_weight.dtype, device=grad_weight.device) + # if grad_bias is not None: + # grad_bias_async, handle_grad_bias = all_reduce_raw(grad_bias, process_group, async_op=True) + # assert hasattr(bias, "_fstp_all_reduce_str") + # all_gather_handler.all_reduce_handlers[bias._fstp_all_reduce_str] = (handle_grad_bias, grad_bias_async) + # grad_bias = all_gather_handler.get_zero_by_shape((grad_weight.shape[0]//torch.distributed.get_world_size(process_group), *grad_weight.shape[1:]), dtype=grad_bias.dtype, device=grad_bias.device) + grad_weight_async, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) + assert hasattr(weight, "_fstp_reduce_scatter_str") + all_gather_handler.reduce_scatter_handlers[weight._fstp_reduce_scatter_str] = (handle_grad_weight, grad_weight_async) + grad_weight = all_gather_handler.get_zero_by_shape((grad_weight.shape[0]//torch.distributed.get_world_size(process_group), *grad_weight.shape[1:]), dtype=grad_weight.dtype, device=grad_weight.device) + if grad_bias is not None: + grad_bias_async, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) + assert hasattr(bias, "_fstp_reduce_scatter_str") + all_gather_handler.reduce_scatter_handlers[bias._fstp_reduce_scatter_str] = (handle_grad_bias, grad_bias_async) + grad_bias = all_gather_handler.get_zero_by_shape((grad_bias.shape[0]//torch.distributed.get_world_size(process_group), *grad_bias.shape[1:]), dtype=grad_bias.dtype, device=grad_bias.device) + else: + grad_weight, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) + if grad_bias is not None: + grad_bias, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) + # grad_weight, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) + # if grad_bias is not None: + # grad_bias, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) else: grad_weight = None grad_bias = grad_output if ctx.needs_input_grad[2] else None @@ -389,13 +432,14 @@ def backward(ctx, grad_output, *args): grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) else: grad_input = None + del total_weight - # if ctx.needs_input_grad[1]: - # if world_size > 1: - # handle_grad_weight.wait() - # if grad_bias is not None: - # handle_grad_bias.wait() - return grad_input, grad_weight, grad_bias, None, None, None, None + if ctx.needs_input_grad[1]: + if world_size > 1 and gpc.config.fstp_handler is None: + handle_grad_weight.wait() + if grad_bias is not None: + handle_grad_bias.wait() + return grad_input, grad_weight, grad_bias, None, None, None, None, None, None def fused_dense_func_torch( @@ -424,12 +468,14 @@ def fstp_fused_dense_func( process_group=None, module=None, handler=None, + block_index=None, + module_name=None, ): dtype_eligible = x.dtype in [torch.float16, torch.bfloat16] or ( x.dtype == torch.float32 and torch.is_autocast_enabled() ) if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible: - return FSTPFusedDenseFunc.apply(x, weight, bias, return_residual, process_group, module, handler) + return FSTPFusedDenseFunc.apply(x, weight, bias, return_residual, process_group, module, handler, block_index, module_name) else: assert 
process_group is None
        out = F.linear(x, weight, bias)
diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py
index d2268274..d0cdd101 100644
--- a/internlm/solver/optimizer/hybrid_zero_optim.py
+++ b/internlm/solver/optimizer/hybrid_zero_optim.py
@@ -10,6 +10,7 @@
 from internlm.core.context import Config, ParallelMode
 from internlm.core.context import global_context as gpc
+from internlm.model.utils import split_forward_gather_backward
 from internlm.monitor import send_alert_message
 from internlm.solver.optimizer.store import (
     BucketStore,
@@ -40,12 +41,8 @@
 logger = get_logger(__file__)

 def print_memory(msg):
-
-    if gpc.get_global_rank() == 0:
-        print(msg, flush=True)
-        print("memory allocated: ", torch.cuda.memory_allocated() / 1024 / 1024 / 1024, flush=True)
-        print("max memory allocated: ", torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024, flush=True)
-        print("===========================================")
+    # report allocated / reserved / peak memory in GB on every rank
+    print(
+        msg,
+        " rank = ", gpc.get_global_rank(),
+        " memory allocated: ", torch.cuda.memory_allocated() / 1024 / 1024 / 1024,
+        " reserved memory: ", torch.cuda.memory_reserved() / 1024 / 1024 / 1024,
+        " max memory: ", torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024,
+        flush=True,
+    )
+    print("===========================================")


 class HybridZeroOptimizer(BaseOptimizer):
@@ -73,7 +70,7 @@ def __init__(
         hysteresis = grad_scal_cfg.hysteresis
         max_scale = grad_scal_cfg.max_scale

-        if gpc.config.parallel["tensor"]["mode"] == "fstp":
+        if gpc.config.parallel["tensor"]["mode"] == "fstp" and gpc.config.parallel["tensor"]["overlap"]:
             self._fstp_handler = gpc.config.fstp_handler

         # Zero related args
@@ -94,6 +91,7 @@
         self._param_store = ParameterStore(ParallelMode.ZERO1)
         self._grad_store = GradientStore(ParallelMode.DATA)
         self._bucket_store = []
+        self._bucket_store_2 = []
         self._bucket_in_progress = []

         # fp16 and fp32 params for mixed precision training
@@ -162,6 +160,7 @@
             # TODO _broadcast_parallel_mode is not only used in broadcast, maybe can change its name
             self._broadcast_parallel_mode.append(zero_mode)
             self._bucket_store.append(BucketStore(group_id, param_group["dp_mode"]))
+            self._bucket_store_2.append(BucketStore(group_id, param_group["dp_mode"]))

             # assign parameters to ranks the params in the list are sorted
             params_per_rank, no_params_ranks = self._partition_param_list(group_id, param_group)
@@ -307,12 +306,22 @@ def _define_and_attach(param, reduce_rank=None):
                 param=param,
                 reduce_rank=reduce_rank,
             )
+
+            reduce_scatter_checker = partial(
+                self._wait_reduce_scatter_and_accumulate_grad,
+                param=param,
+                reduce_rank=reduce_rank,
+            )

             # define hook
             # NOT IMPORTANT BUT GOOD TO KNOW:
             # args here are not grads, but allow_unreachable and accumulate_grad
             def reduce_grad_hook(*args):  # pylint: disable=W0613
-                reduction_func()
+                if gpc.config.fstp_handler is not None:
+                    reduce_scatter_checker()
+
+                if self.skip_grad_reduce is False:
+                    reduction_func()

             accum_grad_obj.register_hook(reduce_grad_hook)

@@ -333,7 +342,7 @@ def belongs_to_current_rank(self, param) -> bool:
         return tensor_rank == gpc.get_local_rank(self._broadcast_parallel_mode[group_id])

     def reset_reduce_bucket(self) -> None:
-        for bucket in self._bucket_store:
+        for bucket in self._bucket_store_2:
             for rank, params in bucket._params.items():
                 for _param in params:
                     if not hasattr(_param, "_fstp_reduce_scatter_str"):
                         continue

                     key = getattr(_param, "_fstp_reduce_scatter_str")
                     comm_handle, _grad = 
self._fstp_handler.reduce_scatter_handlers[key] comm_handle.wait() - _param.grad += _grad + _param.grad.add_(_grad) + # self._fstp_handler.reduce_scatter_handlers[key] = None + del _grad + del self._fstp_handler.reduce_scatter_handlers[key] self._fstp_handler.reduce_scatter_handlers[key] = None + assert key in self._fstp_handler.reduce_scatter_handlers + # if not hasattr(_param, "_fstp_all_reduce_str"): + # continue + + # key = getattr(_param, "_fstp_all_reduce_str") + # comm_handle, _grad = self._fstp_handler.all_reduce_handlers[key] + # comm_handle.wait() + # with torch.no_grad(): + # _grad = split_forward_gather_backward(_grad, ParallelMode.TENSOR, dim=0) + # _param.grad.add_(_grad) + # # self._fstp_handler.reduce_scatter_handlers[key] = None + # del _grad + # del self._fstp_handler.all_reduce_handlers[key] + # self._fstp_handler.all_reduce_handlers[key] = None + # assert key in self._fstp_handler.all_reduce_handlers bucket.reset_by_rank(rank) - - def _store_and_try_reduce_grads_by_bucket(self, param, reduce_rank=None): + + def _wait_reduce_scatter_and_accumulate_grad(self, param, reduce_rank=None): param_size = param.numel() # check if the bucket is full # if full, will reduce the grads already in the bucket # after reduction, the bucket will be empty group_id = getattr(param, "group_id") - current_bucket = self._bucket_store[group_id] + current_bucket = self._bucket_store_2[group_id] - if current_bucket.num_elements_in_bucket(reduce_rank) + param_size > self._reduce_bucket_size: + if current_bucket.num_elements_in_bucket(reduce_rank) >= 512 * 1024 * 1024: # wait reduce scatter communication params = current_bucket.get_param(reduce_rank) for _param in params: @@ -366,18 +393,48 @@ def _store_and_try_reduce_grads_by_bucket(self, param, reduce_rank=None): key = getattr(_param, "_fstp_reduce_scatter_str") comm_handle, _grad = self._fstp_handler.reduce_scatter_handlers[key] comm_handle.wait() - _param.grad += _grad + _param.grad.add_(_grad) + # self._fstp_handler.reduce_scatter_handlers[key] = None + del _grad + del self._fstp_handler.reduce_scatter_handlers[key] self._fstp_handler.reduce_scatter_handlers[key] = None + assert key in self._fstp_handler.reduce_scatter_handlers + + # if not hasattr(_param, "_fstp_all_reduce_str"): + # continue + + # key = getattr(_param, "_fstp_all_reduce_str") + # comm_handle, _grad = self._fstp_handler.all_reduce_handlers[key] + # comm_handle.wait() + # with torch.no_grad(): + # _grad = split_forward_gather_backward(_grad, ParallelMode.TENSOR, dim=0) + # _param.grad.add_(_grad) + # # self._fstp_handler.reduce_scatter_handlers[key] = None + # del _grad + # del self._fstp_handler.all_reduce_handlers[key] + # self._fstp_handler.all_reduce_handlers[key] = None + # assert key in self._fstp_handler.all_reduce_handlers - # reduce grad - if self.skip_grad_reduce is False: - self._reduce_grads_stored_in_bucket(current_bucket, reduce_rank, last_bucket=False) - else: current_bucket.reset_by_rank(reduce_rank) + + current_bucket.add_num_elements_in_bucket(param_size, reduce_rank) + current_bucket.add_param(param, reduce_rank) + + def _store_and_try_reduce_grads_by_bucket(self, param, reduce_rank=None): + param_size = param.numel() + + # check if the bucket is full + # if full, will reduce the grads already in the bucket + # after reduction, the bucket will be empty + group_id = getattr(param, "group_id") + current_bucket = self._bucket_store[group_id] + + if current_bucket.num_elements_in_bucket(reduce_rank) + param_size > self._reduce_bucket_size: + 
# bucket is full: flush the grads already queued before admitting the new one
+            self._reduce_grads_stored_in_bucket(current_bucket, reduce_rank, last_bucket=False)

         # the param must not be reduced to ensure correctness
         is_param_reduced = self._param_store.is_param_reduced(param)
-        if is_param_reduced and self.skip_grad_reduce is False:
+        if is_param_reduced:
             msg = (
                 f"Parameter of size ({param.size()}) has already been reduced, "
                 + "duplicate reduction will lead to arithmetic incorrectness"
             )
@@ -628,8 +685,15 @@ def step(self, closure=None):
         timer("sync_grad").stop()

         print_memory("No 4")
-
-        return self._step(closure=closure, norms=total_norms)
+
+        try:
+            res = self._step(closure=closure, norms=total_norms)
+        except torch.cuda.OutOfMemoryError as e:
+            print(e, flush=True)
+            print(torch.cuda.memory_summary(), flush=True)
+            torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle")
+            # re-raise after dumping diagnostics; otherwise `res` would be
+            # undefined here and the OOM would be silently swallowed
+            raise
+
+        return res

     def _step(self, closure=None, norms=None):
         assert closure is None, "closure is not supported by step()"
diff --git a/internlm/solver/optimizer/store.py b/internlm/solver/optimizer/store.py
index 33380eb4..228045ed 100644
--- a/internlm/solver/optimizer/store.py
+++ b/internlm/solver/optimizer/store.py
@@ -45,6 +45,9 @@ def __init__(self, group_id, dp_parallel_mode):

     def num_elements_in_bucket(self, reduce_rank: int = None):
         return self._num_elements_in_bucket[reduce_rank]
+
+    def num_params_in_bucket(self, reduce_rank: int = None):
+        return len(self._params[reduce_rank])

     def get_param_group_id(self):
         return self._group_id
diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py
index 93903a38..f39e3845 100644
--- a/internlm/train/training_internlm.py
+++ b/internlm/train/training_internlm.py
@@ -108,12 +108,45 @@ def initialize_model():

     # if fsdp enabled, wrap the model
     model = wrap_FSDP_model(model)
+
+    gpc.config.fstp_handler = None

-    if gpc.config.parallel["tensor"]["mode"] == "fstp":
+    if gpc.config.parallel["tensor"]["mode"] == "fstp" and gpc.config.parallel["tensor"]["overlap"]:
         handler = CoarseGrainedFSTPAllGatherSyncHandler(model, gpc.get_group(ParallelMode.TENSOR))
         # handler = FSTPAllGatherSyncHandler(model, gpc.get_group(ParallelMode.TENSOR))
         handler._register_sync_parameters_hook()
         gpc.config.fstp_handler = handler
+
+    # allocate the memory pool: two groups of block weights, so block i can
+    # compute while the weights of block i + 1 are being gathered
+    block_memory = {}
+    hidden_size = gpc.config.HIDDEN_SIZE
+    mlp_ratio = gpc.config.MLP_RATIO
+    mlp_hidden_size = int(hidden_size * mlp_ratio)
+    mlp_hidden_size = 256 * ((mlp_hidden_size + 256 - 1) // 256)
+    # shapes used later as keys for the reduce-scatter memory pool
+    size_key = [(3 * hidden_size, hidden_size), (mlp_hidden_size, hidden_size), (mlp_hidden_size, hidden_size), (hidden_size, hidden_size)]
+    module_name = ['Wqkv', 'out_proj', 'w1', 'w2', 'w3']
+    for i in range(2):
+        weight = {}
+        for name in module_name:
+            if name == 'Wqkv':
+                weight[name] = torch.zeros((3 * hidden_size, hidden_size),
+                                           dtype=gpc.config.model.get("dtype", torch.half),
+                                           device='cuda').contiguous()
+            elif name == 'out_proj':
+                weight[name] = torch.zeros((hidden_size, hidden_size),
+                                           dtype=gpc.config.model.get("dtype", torch.half),
+                                           device='cuda').contiguous()
+            elif name == 'w1' or name == 'w2':
+                weight[name] = torch.zeros((mlp_hidden_size, hidden_size),
+                                           dtype=gpc.config.model.get("dtype", torch.half),
+                                           device='cuda').contiguous()
+            else:
+                weight[name] = torch.zeros((hidden_size, mlp_hidden_size),
+                                           dtype=gpc.config.model.get("dtype", torch.half),
+                                           device='cuda').contiguous()
+        block_memory[i] = weight
+    gpc.config.block_memory = block_memory

     return model

@@ -393,6 +426,7 @@ def
initialize_llm_profile(profiling: bool = False, start_time: str = None): ), with_stack=True, with_modules=True, + profile_memory=True, ) diff --git a/internlm/utils/gputest.py b/internlm/utils/gputest.py index 48877b90..52d96385 100644 --- a/internlm/utils/gputest.py +++ b/internlm/utils/gputest.py @@ -36,10 +36,17 @@ def empty_cache_and_diag(batch_count, interval=50): if batch_count > 0: if gpc.is_rank_for_log(): logger.info("Empty Cache and Diagnosis GPU/NCCL/Timer ...") - with torch.no_grad(): - timer_diagnosis() - bench_gpu() - bench_net() + # with torch.no_grad(): + # try: + # timer_diagnosis() + # bench_gpu() + # bench_net() + # except torch.distributed.DistBackendError as e: + # # import time + # # time.sleep(10) + # print(e, "rank = ", gpc.get_global_rank(), flush=True) + # torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") + # do empty_cache after the bench torch.cuda.empty_cache() # do garbage collection diff --git a/train.py b/train.py index 0a84f592..c972bea9 100644 --- a/train.py +++ b/train.py @@ -195,6 +195,7 @@ def main(args): # start iterating the train data and begin training for batch_count in range(train_state.batch_count, total_steps): empty_cache_and_diag(batch_count, interval=gpc.config.data.empty_cache_and_diag_interval) + torch.cuda.memory._record_memory_history() start_time = time.time() timer("one-batch").start() @@ -294,9 +295,11 @@ def main(args): if memory_profiler is not None: memory_profiler.step() - if batch_count % 2 == 0: - prof.step() + prof.step() + if gpc.config.fstp_handler is not None: + gpc.config.fstp_handler.zero_const_pool = {} + torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") torch.cuda.reset_peak_memory_stats() ckpt_manager.wait_async_upload_finish() From ed7232777a0214d7ee605872477eea3e25521c53 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Fri, 20 Oct 2023 10:35:45 +0800 Subject: [PATCH 034/153] support reduce scatter memory pool --- configs/20B_sft.py | 2 +- configs/30B_sft.py | 8 +-- configs/7B_sft.py | 2 +- internlm/model/utils.py | 52 ++++++++++++++++++- .../solver/optimizer/hybrid_zero_optim.py | 8 +-- internlm/train/training_internlm.py | 18 ++++--- train.py | 1 + 7 files changed, 74 insertions(+), 17 deletions(-) diff --git a/configs/20B_sft.py b/configs/20B_sft.py index 5a9021be..13e68b22 100644 --- a/configs/20B_sft.py +++ b/configs/20B_sft.py @@ -57,7 +57,7 @@ # defaults to 0, means disable evaluate valid_every=50, pack_sample_into_one=False, - total_steps=50, + total_steps=20, skip_batches="", rampup_batch_size="", # Datasets with less than 50 rows will be discarded diff --git a/configs/30B_sft.py b/configs/30B_sft.py index ec040480..8bde0571 100644 --- a/configs/30B_sft.py +++ b/configs/30B_sft.py @@ -5,7 +5,7 @@ HIDDEN_SIZE = 6144 NUM_ATTENTION_HEAD = 48 MLP_RATIO = 8 / 3 -NUM_LAYER = 40 +NUM_LAYER = 60 VOCAB_SIZE = 103168 MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" @@ -51,7 +51,7 @@ # micro_num means the number of micro_batch contained in one gradient update micro_num=4, # packed_length = micro_bsz * SEQ_LEN - micro_bsz=4, + micro_bsz=2, # defaults to the value of micro_num valid_micro_num=4, # defaults to 0, means disable evaluate @@ -161,8 +161,8 @@ sequence parallel (bool): enable/disable sequence parallel, defaults to False. 
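tensor (dict): size and mode of tensor parallel; with mode "fstp" the linear weights are
    sharded across the tensor group, and overlap=True additionally enables the asynchronous
    all-gather/reduce-scatter handler so communication hides behind computation.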
""" parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, mode="origin_tp", overlap=False), + zero1=dict(size=4, fsdp=False), + tensor=dict(size=8, mode="fstp", overlap=True), pipeline=dict(size=1, interleaved_overlap=True), sequence_parallel=True, ) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 106548a2..6ea8b96e 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -162,7 +162,7 @@ """ parallel = dict( zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, mode="fstp"), + tensor=dict(size=8, mode="fstp", overlap=True), pipeline=dict(size=1, interleaved_overlap=True), sequence_parallel=True, ) diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 5b4018c8..2667efed 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -14,6 +14,7 @@ from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc from internlm.utils.logger import get_logger +from internlm.utils.common import get_current_device logger = get_logger(__file__) @@ -148,6 +149,18 @@ def reduce_scatter_raw(input_: Tensor, process_group: ProcessGroup, async_op: bo async_op=async_op) return output, handle +def reduce_scatter_raw_memory_pool(input_: Tensor, process_group: ProcessGroup, async_op: bool = False): + world_size = torch.distributed.get_world_size(process_group) + assert input_.shape[0] % world_size == 0 + size = (input_.shape[0] // world_size, *input_.shape[1:]) + index = check_reduce_scatter_memory_pool(size) + output = gpc.config.reduce_scatter_memory[size]['data'][index] + setattr(output, "index", index) + handle = torch.distributed.reduce_scatter_tensor(output, input_.contiguous(), + group=process_group, + async_op=async_op) + return output, handle + # adpated from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/ops/fused_dense.py class FusedDenseFunc(torch.autograd.Function): @@ -404,12 +417,13 @@ def backward(ctx, grad_output, *args): # assert hasattr(bias, "_fstp_all_reduce_str") # all_gather_handler.all_reduce_handlers[bias._fstp_all_reduce_str] = (handle_grad_bias, grad_bias_async) # grad_bias = all_gather_handler.get_zero_by_shape((grad_weight.shape[0]//torch.distributed.get_world_size(process_group), *grad_weight.shape[1:]), dtype=grad_bias.dtype, device=grad_bias.device) - grad_weight_async, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) + + grad_weight_async, handle_grad_weight = reduce_scatter_raw_memory_pool(grad_weight, process_group, async_op=True) assert hasattr(weight, "_fstp_reduce_scatter_str") all_gather_handler.reduce_scatter_handlers[weight._fstp_reduce_scatter_str] = (handle_grad_weight, grad_weight_async) grad_weight = all_gather_handler.get_zero_by_shape((grad_weight.shape[0]//torch.distributed.get_world_size(process_group), *grad_weight.shape[1:]), dtype=grad_weight.dtype, device=grad_weight.device) if grad_bias is not None: - grad_bias_async, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) + grad_bias_async, handle_grad_bias = reduce_scatter_raw_memory_pool(grad_bias, process_group, async_op=True) assert hasattr(bias, "_fstp_reduce_scatter_str") all_gather_handler.reduce_scatter_handlers[bias._fstp_reduce_scatter_str] = (handle_grad_bias, grad_bias_async) grad_bias = all_gather_handler.get_zero_by_shape((grad_bias.shape[0]//torch.distributed.get_world_size(process_group), *grad_bias.shape[1:]), dtype=grad_bias.dtype, device=grad_bias.device) @@ -521,3 +535,37 @@ def Silu(w1_o, w2_o): Silu = 
torch.jit.script(Silu) + +def check_reduce_scatter_memory_pool(key): + + return_idx = 0 + + # if key not in dict + if key not in gpc.config.reduce_scatter_memory: + gpc.config.reduce_scatter_memory[key] = {'data': [], 'used': []} + + # if the data is empty + if len(gpc.config.reduce_scatter_memory[key]['data']) == 0: + gpc.config.reduce_scatter_memory[key]['data'].append(torch.zeros(key, + dtype=gpc.config.model.get("dtype", torch.half), + device=get_current_device()).contiguous()) + gpc.config.reduce_scatter_memory[key]['used'].append(True) + return_idx = 0 + return return_idx + else: # if not empty + for index, used in enumerate(gpc.config.reduce_scatter_memory[key]['used']): + if used == False: + gpc.config.reduce_scatter_memory[key]['used'][index] = True + return_idx = index + return return_idx + # if the memory pool is all used + length = len(gpc.config.reduce_scatter_memory[key]['data']) + gpc.config.reduce_scatter_memory[key]['data'].append(torch.zeros(key, + dtype=gpc.config.model.get("dtype", torch.half), + device=get_current_device()).contiguous()) + gpc.config.reduce_scatter_memory[key]['used'].append(True) + return_idx = length + return return_idx + +def release_reduce_scatter_memory_pool(size, index): + gpc.config.reduce_scatter_memory[size]['used'][index] = False \ No newline at end of file diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index d0cdd101..96a54c01 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -10,7 +10,7 @@ from internlm.core.context import Config, ParallelMode from internlm.core.context import global_context as gpc -from internlm.model.utils import split_forward_gather_backward +from internlm.model.utils import split_forward_gather_backward, release_reduce_scatter_memory_pool from internlm.monitor import send_alert_message from internlm.solver.optimizer.store import ( BucketStore, @@ -353,7 +353,8 @@ def reset_reduce_bucket(self) -> None: comm_handle.wait() _param.grad.add_(_grad) # self._fstp_handler.reduce_scatter_handlers[key] = None - del _grad + # del _grad + release_reduce_scatter_memory_pool(size=tuple(_grad.size()),index=_grad.index) del self._fstp_handler.reduce_scatter_handlers[key] self._fstp_handler.reduce_scatter_handlers[key] = None assert key in self._fstp_handler.reduce_scatter_handlers @@ -395,7 +396,8 @@ def _wait_reduce_scatter_and_accumulate_grad(self, param, reduce_rank=None): comm_handle.wait() _param.grad.add_(_grad) # self._fstp_handler.reduce_scatter_handlers[key] = None - del _grad + # del _grad + release_reduce_scatter_memory_pool(size=tuple(_grad.size()),index=_grad.index) del self._fstp_handler.reduce_scatter_handlers[key] self._fstp_handler.reduce_scatter_handlers[key] = None assert key in self._fstp_handler.reduce_scatter_handlers diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index f39e3845..2816da0e 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -51,7 +51,7 @@ from internlm.solver.optimizer import FSDPadaptOptimizer, HybridZeroOptimizer from internlm.solver.optimizer.utils import ParamBcastSyncHandler from internlm.train.utils import create_param_groups -from internlm.utils.common import DummyProfile +from internlm.utils.common import DummyProfile, get_current_device from internlm.utils.logger import get_logger from internlm.utils.megatron_timers import megatron_timer as timer from internlm.utils.parallel import 
sync_model_param, sync_model_param_within_tp @@ -123,7 +123,8 @@ def initialize_model(): mlp_ratio = gpc.config.MLP_RATIO mlp_hidden_size = int(hidden_size * mlp_ratio) mlp_hidden_size = 256 * ((mlp_hidden_size + 256 - 1) // 256) - size_key = [(3 * hidden_size, hidden_size), (mlp_hidden_size, hidden_size), (mlp_hidden_size, hidden_size), (hidden_size, hidden_size)] + world_size = gpc.get_world_size(ParallelMode.TENSOR) + size_key = [(3 * hidden_size // world_size, hidden_size), (mlp_hidden_size // world_size, hidden_size), (hidden_size // world_size, mlp_hidden_size), (hidden_size // world_size, hidden_size)] module_name = ['Wqkv', 'out_proj', 'w1', 'w2', 'w3'] for i in range(2): weight = {} @@ -131,21 +132,26 @@ def initialize_model(): if name == 'Wqkv': weight[name] = torch.zeros((3 * hidden_size, hidden_size), dtype=gpc.config.model.get("dtype", torch.half), - device='cuda').contiguous() + device=get_current_device()).contiguous() elif name == 'out_proj': weight[name] = torch.zeros((hidden_size, hidden_size), dtype=gpc.config.model.get("dtype", torch.half), - device='cuda').contiguous() + device=get_current_device()).contiguous() elif name == 'w1' or name == 'w2': weight[name] = torch.zeros((mlp_hidden_size, hidden_size), dtype=gpc.config.model.get("dtype", torch.half), - device='cuda').contiguous() + device=get_current_device()).contiguous() else: weight[name] = torch.zeros((hidden_size, mlp_hidden_size), dtype=gpc.config.model.get("dtype", torch.half), - device='cuda').contiguous() + device=get_current_device()).contiguous() block_memory[i] = weight + reduce_scatter_memory = {} + for key in size_key: + reduce_scatter_memory[key] = {'data': [], 'used': []} + gpc.config.block_memory = block_memory + gpc.config.reduce_scatter_memory = reduce_scatter_memory return model diff --git a/train.py b/train.py index c972bea9..41ab070d 100644 --- a/train.py +++ b/train.py @@ -299,6 +299,7 @@ def main(args): if gpc.config.fstp_handler is not None: gpc.config.fstp_handler.zero_const_pool = {} + gpc.config.fstp_handler.reduce_scatter_memory = {} torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") torch.cuda.reset_peak_memory_stats() From 815a584930622d6c9c81508d41132a6413c86420 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Fri, 20 Oct 2023 11:27:59 +0800 Subject: [PATCH 035/153] feat(model/linear.py): remove useless code --- internlm/model/linear.py | 307 +++---------------------- internlm/model/modeling_internlm.py | 3 - internlm/model/multi_head_attention.py | 1 - internlm/model/utils.py | 152 +++++++----- internlm/train/training_internlm.py | 58 +++-- train.py | 2 +- 6 files changed, 166 insertions(+), 357 deletions(-) diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 4f05cd32..61a5cfc1 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -177,7 +177,6 @@ def __init__( device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, multiple_of: int = 256, - block_idx: int = 0, ): super().__init__() @@ -224,8 +223,14 @@ def forward(self, x): name_index = gpc.config.fstp_handler.module_name_index[self] name = gpc.config.fstp_handler.module_name[name_index] return fstp_fused_dense_func( - x, self.weight, self.bias, process_group=self.process_group, - module=self, handler=gpc.config.fstp_handler, block_index=block_index, module_name=name + x, + self.weight, + self.bias, + process_group=self.process_group, + module=self, + handler=gpc.config.fstp_handler, + block_index=block_index, + 
module_name=name, ) @@ -255,7 +260,6 @@ def __init__( device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, multiple_of: int = 256, - block_idx: int = 0, ): super().__init__() @@ -296,129 +300,6 @@ def forward(self, x): return out -class FSTPAllGatherSyncHandler: - """ - All-gather handler for overlapping the all-gather in adjcent FSTP linear. - """ - - def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> None: - # import pdb; pdb.set_trace() - self.process_group = process_group - self.FSTP_modules = [] - self.module_name = ["Wqkv", "out_proj", "w1", "w2", "w3"] - self.FSTP_global_weights = dict() # key: FSTP module; value: module global weight for forward - self.module_handler = dict() # key: FSTP module; value: all-gather handler - self.module_block = dict() # key: FSTP module; value: transformer block index - self.block_module = dict() # key: transformer block index; value: {name_index: FSTP module} - self.module_name_index = dict() # key: FSTP module; value: the name in index in self.module_name - - self.reduce_scatter_handlers = {} - self.all_reduce_handlers = {} - - # just want to share same for loop for ModuleList and Module - if not isinstance(model, nn.ModuleList): - model = [model] - - for _chunk in model: - if isinstance(_chunk, NaiveAMPModel): - _chunk = _chunk.model - - for _chunk_name, children in _chunk.named_children(): - if isinstance(children, nn.ModuleList): - for idx, block in enumerate(children): - index = 0 - self.block_module[idx] = {} - for _sub_name, sub in block.named_children(): - sub_modules = list(sub.children()) - if len(sub_modules) > 0: - for name, child in sub.named_children(): - if isinstance(child, FSTPLinear): - - _full_name = f"{_chunk_name}.{idx}.{_sub_name}.{name}" - setattr(child.weight, "_fstp_reduce_scatter_str", f"{_full_name}.weight") - if child.bias is not None: - setattr(child.bias, "_fstp_reduce_scatter_str", f"{_full_name}.bias") - - self.FSTP_modules.append(child) - self.module_block[child] = idx - self.block_module[idx][index] = child - self.module_name_index[child] = index - index = index + 1 - else: - continue - - def _register_sync_parameters_hook(self) -> None: - """ - register pre_forward_hook and pre_backward_hook for FSTPLinear. 
- """ - - def _pre_forward_hook(module: nn.Module, inputs: Any): - block_index = self.module_block[module] - name_index = self.module_name_index[module] - if name_index == 0: - total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) - weight_handler.wait() - self.FSTP_global_weights[module] = total_weight - - # start the all-gather for next module - next_module = self.block_module[block_index][name_index + 1] - self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( - next_module.weight, self.process_group, async_op=True - ) - self.module_handler[next_module] = weights_handler - else: - handler = self.module_handler[module] - handler.wait() - if name_index != 4: - next_module = self.block_module[block_index][name_index + 1] - self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( - next_module.weight, self.process_group, async_op=True - ) - self.module_handler[next_module] = weights_handler - - def _post_forward_hook(module: nn.Module, input, output): - if module in self.FSTP_global_weights: - del self.FSTP_global_weights[module] - if module in self.module_handler: - del self.module_handler[module] - - def _pre_backward_hook(module: nn.Module, grad_output): - block_index = self.module_block[module] - name_index = self.module_name_index[module] - if name_index == 4: - total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) - weight_handler.wait() - self.FSTP_global_weights[module] = total_weight - - # start the all-gather for next module - next_module = self.block_module[block_index][name_index - 1] - self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( - next_module.weight, self.process_group, async_op=True - ) - self.module_handler[next_module] = weights_handler - else: - handler = self.module_handler[module] - handler.wait() - if name_index != 0: - next_module = self.block_module[block_index][name_index - 1] - self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( - next_module.weight, self.process_group, async_op=True - ) - self.module_handler[next_module] = weights_handler - - def _post_backward_hook(module, grad_input, grad_output): - del self.FSTP_global_weights[module] - - for module in self.FSTP_modules: - # import pdb; pdb.set_trace() - module.register_forward_pre_hook(_pre_forward_hook) - module.register_forward_hook(_post_forward_hook) - # module.register_backward_pre_hook(_pre_backward_hook) - # module.register_backward_hook(_post_backward_hook) - module.register_full_backward_pre_hook(_pre_backward_hook) - module.register_full_backward_hook(_post_backward_hook) - - class CoarseGrainedFSTPAllGatherSyncHandler: """ All-gather handler for overlapping the all-gather in adjcent FSTP block. 
@@ -479,49 +360,33 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non self.index_to_fsdp_modules[idx].append(child) self.module_name_index[child] = index index = index + 1 - + _full_name = f"{_chunk_name}.{idx}.{_sub_name}.{name}" setattr(child.weight, "_fstp_reduce_scatter_str", f"{_full_name}.weight") if child.bias is not None: setattr(child.bias, "_fstp_reduce_scatter_str", f"{_full_name}.bias") - # _full_name = f"{_chunk_name}.{idx}.{_sub_name}.{name}" - # setattr(child.weight, "_fstp_all_reduce_str", f"{_full_name}.weight") - # if child.bias is not None: - # setattr(child.bias, "_fstp_all_reduce_str", f"{_full_name}.bias") else: continue elif isinstance(children, ScaleColumnParallelLinear): self.head.append(children) elif isinstance(children, Embedding1D): self.embedding.append(children) - - def get_zero_by_shape(self, size:tuple, dtype, device) -> torch.Tensor: - if size not in self.zero_const_pool: + + def get_zero_by_shape(self, size: tuple, dtype, device) -> torch.Tensor: + if size not in self.zero_const_pool: self.zero_const_pool[size] = torch.zeros(*size, dtype=dtype, device=device).contiguous() - - return self.zero_const_pool[size] + return self.zero_const_pool[size] - def _all_gather_block_weight(self, block_index: int): - #block = self.index_to_block[block_index] - fsdp_modules = self.index_to_fsdp_modules[block_index] - # self.block_handles[block] = [] - for module in fsdp_modules: - total_weight, weight_handle = all_gather_raw(module.weight, self.process_group, async_op=True) - self.FSTP_global_weights[module] = total_weight - self.FSTP_global_handle[module] = weight_handle - # self.block_handles[block].append(weight_handle) - def _all_gather_block_weight_memory_pool(self, block_index: int): fsdp_modules = self.index_to_fsdp_modules[block_index] - # self.block_handles[block] = [] for module in fsdp_modules: module_index = self.module_name_index[module] name = self.module_name[module_index] - weight_handle = all_gather_raw_memory_pool(module.weight, self.process_group, async_op=True, block_index=block_index, module_name=name) - # self.FSTP_global_weights[module] = total_weight + weight_handle = all_gather_raw_memory_pool( + module.weight, self.process_group, async_op=True, block_index=block_index, module_name=name + ) self.FSTP_global_handle[module] = weight_handle - # self.block_handles[block].append(weight_handle) def _register_sync_parameters_hook(self) -> None: """ @@ -538,41 +403,14 @@ def _pre_forward_hook_for_out_proj(module: nn.Module, inputs: Any): block_index = self.module_to_index[module] # start the all-gather for next block if block_index + 1 < gpc.config.NUM_LAYER: - # self._all_gather_block_weight(block_index + 1) self._all_gather_block_weight_memory_pool(block_index + 1) - def _pre_forward_hook_for_block(block: nn.Module, inputs: Any): - block_index = self.block_to_index[block] - if block_index == 0: - # all gather weight for block 0 - fsdp_modules = self.index_to_fsdp_modules[block_index] - for module in fsdp_modules: - total_weight, weight_handle = all_gather_raw(module.weight, self.process_group, async_op=True) - weight_handle.wait() - self.FSTP_global_weights[module] = total_weight - else: - # wait handle for current block - handles = self.block_handles[block] - for handle in handles: - handle.wait() - - def _pre_forward_hook_for_embedding(module: nn.Module, inputs: Any, output): - # self._all_gather_block_weight(0) + def _post_forward_hook_for_embedding(module: nn.Module, inputs: Any, output): 
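            # runs after the embedding's forward completes: start gathering
            # block 0's weights so the first transformer block finds the
            # all-gather already in flight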
self._all_gather_block_weight_memory_pool(0) - - - def _post_forward_hook_for_block(block: nn.Module, input, output): - block_index = self.block_to_index[block] - fsdp_modules = self.index_to_fsdp_modules[block_index] - if block in self.block_handles: - del self.block_handles[block] - for module in fsdp_modules: - del self.FSTP_global_weights[module] - def _pre_forward_hook_for_module(module: nn.Module, inputs: Any,): - block_index = self.module_to_index[module] - handler = self.FSTP_global_handle[module] - handler.wait() + def _pre_forward_hook_for_module(module: nn.Module, inputs: Any): + handle = self.FSTP_global_handle[module] + handle.wait() def _post_forward_hook_for_module(module: nn.Module, input, output): if module in self.FSTP_global_weights: @@ -580,67 +418,44 @@ def _post_forward_hook_for_module(module: nn.Module, input, output): if module in self.FSTP_global_handle: del self.FSTP_global_handle[module] - def _pre_backward_hook_for_block(block: nn.Module, grad_output): - # import pdb; pdb.set_trace() - block_index = self.block_to_index[block] - # if block_index == gpc.config.NUM_LAYER - 1: - # # all gather weight for the last block - # fsdp_modules = self.index_to_fsdp_modules[block_index] - # for module in fsdp_modules: - # total_weight, weight_handle = all_gather_raw(module.weight, self.process_group, async_op=True) - # weight_handle.wait() - # self.FSTP_global_weights[module] = total_weight - # else: - # # wait handle for current block - # handles = self.block_handles[block] - # for handle in handles: - # handle.wait() - # if block_index == gpc.config.NUM_LAYER - 1: - # self._all_gather_block_weight(block_index) - # start the all-gather for next block - if block_index - 1 >= 0: - self._all_gather_block_weight(block_index - 1) - def _post_backward_hook_for_head(module: nn.Module, grad_input, grad_output): first_module = self.block_module[gpc.config.NUM_LAYER - 1][4] total_weight, weight_handler = all_gather_raw(first_module.weight, self.process_group, async_op=True) self.FSTP_global_handle[first_module] = weight_handler self.FSTP_global_weights[first_module] = total_weight - def _post_backward_hook_for_block(block: nn.Module, grad_input, grad_output): - block_index = self.block_to_index[block] - fsdp_modules = self.index_to_fsdp_modules[block_index] - if block in self.block_handles: - del self.block_handles[block] - for module in fsdp_modules: - del self.FSTP_global_weights[module] - def _pre_backward_hook_for_module_memory_pool(module: nn.Module, grad_output): block_index = self.module_to_index[module] name_index = self.module_name_index[module] - + if name_index == 4 and block_index == gpc.config.NUM_LAYER - 1: - # total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) weight_handler = self.FSTP_global_handle[module] weight_handler.wait() - # self.FSTP_global_weights[module] = total_weight # start the all-gather for next module next_module = self.block_module[block_index][name_index - 1] next_name = self.module_name[name_index - 1] weights_handler = all_gather_raw_memory_pool( - next_module.weight, self.process_group, async_op=True, block_index=block_index, module_name=next_name + next_module.weight, + self.process_group, + async_op=True, + block_index=block_index, + module_name=next_name, ) self.FSTP_global_handle[next_module] = weights_handler elif name_index == 0: handler = self.FSTP_global_handle[module] handler.wait() - + if block_index - 1 >= 0: next_module = self.block_module[block_index - 1][4] name = self.module_name[4] 
weights_handler = all_gather_raw_memory_pool( - next_module.weight, self.process_group, async_op=True, block_index=block_index - 1, module_name=name, + next_module.weight, + self.process_group, + async_op=True, + block_index=block_index - 1, + module_name=name, ) self.FSTP_global_handle[next_module] = weights_handler else: @@ -653,76 +468,24 @@ def _pre_backward_hook_for_module_memory_pool(module: nn.Module, grad_output): next_module.weight, self.process_group, async_op=True, block_index=block_index, module_name=name ) self.FSTP_global_handle[next_module] = weights_handler - # if module in self.FSTP_global_handle: - # handler = self.FSTP_global_handle[module] - # handler.wait() - - def _pre_backward_hook_for_module(module: nn.Module, grad_output): - block_index = self.module_to_index[module] - name_index = self.module_name_index[module] - - if name_index == 4 and block_index == gpc.config.NUM_LAYER - 1: - # total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) - weight_handler = self.FSTP_global_handle[module] - weight_handler.wait() - # self.FSTP_global_weights[module] = total_weight - - # start the all-gather for next module - next_module = self.block_module[block_index][name_index - 1] - self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( - next_module.weight, self.process_group, async_op=True - ) - self.FSTP_global_handle[next_module] = weights_handler - elif name_index == 0: - handler = self.FSTP_global_handle[module] - handler.wait() - - if block_index - 1 >= 0: - next_module = self.block_module[block_index - 1][4] - self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( - next_module.weight, self.process_group, async_op=True - ) - self.FSTP_global_handle[next_module] = weights_handler - else: - handler = self.FSTP_global_handle[module] - handler.wait() - if name_index != 0: - next_module = self.block_module[block_index][name_index - 1] - self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( - next_module.weight, self.process_group, async_op=True - ) - self.FSTP_global_handle[next_module] = weights_handler - # if module in self.FSTP_global_handle: - # handler = self.FSTP_global_handle[module] - # handler.wait() def _post_backward_hook_for_module(module, grad_input, grad_output): if module in self.FSTP_global_weights: del self.FSTP_global_weights[module] if module in self.FSTP_global_handle: del self.FSTP_global_handle[module] - + for embedding in self.embedding: - embedding.register_forward_hook(_pre_forward_hook_for_embedding) - + embedding.register_forward_hook(_post_forward_hook_for_embedding) + for head in self.head: head.register_full_backward_hook(_post_backward_hook_for_head) - # for block in self.FSTP_blocks: - # block.register_forward_pre_hook(_pre_forward_hook_for_block) - # block.register_forward_hook(_post_forward_hook_for_block) - # block.register_full_backward_pre_hook(_pre_backward_hook_for_block) - # block.register_full_backward_hook(_post_backward_hook_for_block) - for out_proj in self.FSTP_outs: out_proj.register_forward_pre_hook(_pre_forward_hook_for_out_proj) - - # for wqkv in self.FSTP_wqkvs: - # wqkv.register_full_backward_pre_hook(_pre_backward_hook_for_wqkv) for module in self.FSTP_modules: module.register_forward_pre_hook(_pre_forward_hook_for_module) module.register_forward_hook(_post_forward_hook_for_module) - # module.register_full_backward_pre_hook(_pre_backward_hook_for_module) 
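            # note: register_full_backward_pre_hook fires just before grads for
            # the module are computed, which is exactly when the prefetched
            # weight gather must be waited on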
module.register_full_backward_pre_hook(_pre_backward_hook_for_module_memory_pool) module.register_full_backward_hook(_post_backward_hook_for_module) diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index b004dffa..0df2b60e 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -78,7 +78,6 @@ def __init__( use_swiglu: bool = True, use_flash_attn: bool = True, tp_mode: str = "origin_tp", - block_idx: int = 0, ): super().__init__() self.checkpoint = checkpoint @@ -104,7 +103,6 @@ def __init__( device=device, dtype=dtype, tp_mode=tp_mode, - block_idx=block_idx, ) self.dropout1 = nn.Dropout(drop_rate) @@ -346,7 +344,6 @@ def __init__( use_swiglu=use_swiglu, use_flash_attn=use_flash_attn, tp_mode=self.tp_mode, - block_idx=lid, ) for lid in range(num_layers) ] diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index 7a0f4ed7..8dcd3f96 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -176,7 +176,6 @@ def __init__( device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, tp_mode: str = "origin_tp", - block_idx: int = 0, ) -> None: factory_kwargs = {"device": device, "dtype": dtype} super().__init__() diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 2667efed..b9c7c03a 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -6,15 +6,15 @@ import fused_dense_lib as fused_dense_cuda import torch import torch.nn.functional as F -from flash_attn.utils.distributed import all_reduce_raw #, reduce_scatter_raw +from flash_attn.utils.distributed import all_reduce_raw # , reduce_scatter_raw from torch import Tensor from torch.cuda.amp import custom_bwd, custom_fwd from torch.distributed import ProcessGroup from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc -from internlm.utils.logger import get_logger from internlm.utils.common import get_current_device +from internlm.utils.logger import get_logger logger = get_logger(__file__) @@ -125,9 +125,20 @@ def all_gather_raw(input_: Tensor, process_group: ProcessGroup, async_op: bool = ) return output, handle -def all_gather_raw_memory_pool(input_: Tensor, process_group: ProcessGroup, async_op: bool = False, gather_dim: int = 0, block_index: int = None, module_name: str = None): + +def all_gather_raw_memory_pool( + input_: Tensor, + process_group: ProcessGroup, + async_op: bool = False, + gather_dim: int = 0, + block_index: int = None, + module_name: str = None, +): handle = torch.distributed.all_gather_into_tensor( - gpc.config.block_memory[block_index % 2][module_name], input_.contiguous(), group=process_group, async_op=async_op + gpc.config.block_memory[block_index % 2][module_name], + input_.contiguous(), + group=process_group, + async_op=async_op, ) return handle @@ -142,23 +153,25 @@ def linear_bias_wgrad_torch(my_input, grad_output, has_d_bias): def reduce_scatter_raw(input_: Tensor, process_group: ProcessGroup, async_op: bool = False): world_size = torch.distributed.get_world_size(process_group) assert input_.shape[0] % world_size == 0 - output = torch.empty(input_.shape[0] // world_size, *input_.shape[1:], - dtype=input_.dtype, device=input_.device).contiguous() - handle = torch.distributed.reduce_scatter_tensor(output, input_.contiguous(), - group=process_group, - async_op=async_op) + output = torch.empty( + input_.shape[0] // world_size, *input_.shape[1:], dtype=input_.dtype, 
device=input_.device + ).contiguous() + handle = torch.distributed.reduce_scatter_tensor( + output, input_.contiguous(), group=process_group, async_op=async_op + ) return output, handle + def reduce_scatter_raw_memory_pool(input_: Tensor, process_group: ProcessGroup, async_op: bool = False): world_size = torch.distributed.get_world_size(process_group) assert input_.shape[0] % world_size == 0 size = (input_.shape[0] // world_size, *input_.shape[1:]) index = check_reduce_scatter_memory_pool(size) - output = gpc.config.reduce_scatter_memory[size]['data'][index] + output = gpc.config.reduce_scatter_memory[size]["data"][index] setattr(output, "index", index) - handle = torch.distributed.reduce_scatter_tensor(output, input_.contiguous(), - group=process_group, - async_op=async_op) + handle = torch.distributed.reduce_scatter_tensor( + output, input_.contiguous(), group=process_group, async_op=async_op + ) return output, handle @@ -313,7 +326,18 @@ class FSTPFusedDenseFunc(torch.autograd.Function): @staticmethod @custom_fwd - def forward(ctx, x, weight, bias, return_residual=False, process_group=None, module=None, all_gather_handler=None, block_index=None, module_name=None): + def forward( + ctx, + x, + weight, + bias, + return_residual=False, + process_group=None, + module=None, + all_gather_handler=None, + block_index=None, + module_name=None, + ): ctx.compute_weight_gradient = weight.requires_grad ctx.return_residual = return_residual ctx.process_group = process_group @@ -329,9 +353,9 @@ def forward(ctx, x, weight, bias, return_residual=False, process_group=None, mod world_size = gpc.get_world_size(ParallelMode.TENSOR) if world_size > 1: # do all_gather for weight and bias before actual computation - if all_gather_handler is not None:# and module in all_gather_handler.FSTP_global_weights: - # total_weight = all_gather_handler.FSTP_global_weights[module] - total_weight = gpc.config.block_memory[block_index % 2][module_name] + if all_gather_handler is not None: # and module in all_gather_handler.FSTP_global_weights: + # total_weight = all_gather_handler.FSTP_global_weights[module] + total_weight = gpc.config.block_memory[block_index % 2][module_name] else: total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) handle_weight.wait() @@ -376,7 +400,7 @@ def backward(ctx, grad_output, *args): module = ctx.module block_index = ctx.block_index module_name = ctx.module_name - + if ctx.compute_weight_gradient: x, weight, bias = ctx.saved_tensors total_x = x @@ -408,32 +432,43 @@ def backward(ctx, grad_output, *args): ) if world_size > 1: if gpc.config.fstp_handler is not None: - # grad_weight_async, handle_grad_weight = all_reduce_raw(grad_weight, process_group, async_op=True) - # assert hasattr(weight, "_fstp_all_reduce_str") - # all_gather_handler.all_reduce_handlers[weight._fstp_all_reduce_str] = (handle_grad_weight, grad_weight_async) - # grad_weight = all_gather_handler.get_zero_by_shape((grad_weight.shape[0]//torch.distributed.get_world_size(process_group), *grad_weight.shape[1:]), dtype=grad_weight.dtype, device=grad_weight.device) - # if grad_bias is not None: - # grad_bias_async, handle_grad_bias = all_reduce_raw(grad_bias, process_group, async_op=True) - # assert hasattr(bias, "_fstp_all_reduce_str") - # all_gather_handler.all_reduce_handlers[bias._fstp_all_reduce_str] = (handle_grad_bias, grad_bias_async) - # grad_bias = all_gather_handler.get_zero_by_shape((grad_weight.shape[0]//torch.distributed.get_world_size(process_group), *grad_weight.shape[1:]), 
dtype=grad_bias.dtype, device=grad_bias.device) - - grad_weight_async, handle_grad_weight = reduce_scatter_raw_memory_pool(grad_weight, process_group, async_op=True) + grad_weight_async, handle_grad_weight = reduce_scatter_raw_memory_pool( + grad_weight, process_group, async_op=True + ) assert hasattr(weight, "_fstp_reduce_scatter_str") - all_gather_handler.reduce_scatter_handlers[weight._fstp_reduce_scatter_str] = (handle_grad_weight, grad_weight_async) - grad_weight = all_gather_handler.get_zero_by_shape((grad_weight.shape[0]//torch.distributed.get_world_size(process_group), *grad_weight.shape[1:]), dtype=grad_weight.dtype, device=grad_weight.device) + all_gather_handler.reduce_scatter_handlers[weight._fstp_reduce_scatter_str] = ( + handle_grad_weight, + grad_weight_async, + ) + grad_weight = all_gather_handler.get_zero_by_shape( + ( + grad_weight.shape[0] // torch.distributed.get_world_size(process_group), + *grad_weight.shape[1:], + ), + dtype=grad_weight.dtype, + device=grad_weight.device, + ) if grad_bias is not None: - grad_bias_async, handle_grad_bias = reduce_scatter_raw_memory_pool(grad_bias, process_group, async_op=True) + grad_bias_async, handle_grad_bias = reduce_scatter_raw_memory_pool( + grad_bias, process_group, async_op=True + ) assert hasattr(bias, "_fstp_reduce_scatter_str") - all_gather_handler.reduce_scatter_handlers[bias._fstp_reduce_scatter_str] = (handle_grad_bias, grad_bias_async) - grad_bias = all_gather_handler.get_zero_by_shape((grad_bias.shape[0]//torch.distributed.get_world_size(process_group), *grad_bias.shape[1:]), dtype=grad_bias.dtype, device=grad_bias.device) + all_gather_handler.reduce_scatter_handlers[bias._fstp_reduce_scatter_str] = ( + handle_grad_bias, + grad_bias_async, + ) + grad_bias = all_gather_handler.get_zero_by_shape( + ( + grad_bias.shape[0] // torch.distributed.get_world_size(process_group), + *grad_bias.shape[1:], + ), + dtype=grad_bias.dtype, + device=grad_bias.device, + ) else: grad_weight, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) if grad_bias is not None: grad_bias, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) - # grad_weight, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) - # if grad_bias is not None: - # grad_bias, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) else: grad_weight = None grad_bias = grad_output if ctx.needs_input_grad[2] else None @@ -489,7 +524,9 @@ def fstp_fused_dense_func( x.dtype == torch.float32 and torch.is_autocast_enabled() ) if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible: - return FSTPFusedDenseFunc.apply(x, weight, bias, return_residual, process_group, module, handler, block_index, module_name) + return FSTPFusedDenseFunc.apply( + x, weight, bias, return_residual, process_group, module, handler, block_index, module_name + ) else: assert process_group is None out = F.linear(x, weight, bias) @@ -536,36 +573,37 @@ def Silu(w1_o, w2_o): Silu = torch.jit.script(Silu) + def check_reduce_scatter_memory_pool(key): - return_idx = 0 - + # if key not in dict if key not in gpc.config.reduce_scatter_memory: - gpc.config.reduce_scatter_memory[key] = {'data': [], 'used': []} - + gpc.config.reduce_scatter_memory[key] = {"data": [], "used": []} + # if the data is empty - if len(gpc.config.reduce_scatter_memory[key]['data']) == 0: - gpc.config.reduce_scatter_memory[key]['data'].append(torch.zeros(key, - dtype=gpc.config.model.get("dtype", 
torch.half), - device=get_current_device()).contiguous()) - gpc.config.reduce_scatter_memory[key]['used'].append(True) + if len(gpc.config.reduce_scatter_memory[key]["data"]) == 0: + gpc.config.reduce_scatter_memory[key]["data"].append( + torch.zeros(key, dtype=gpc.config.model.get("dtype", torch.half), device=get_current_device()).contiguous() + ) + gpc.config.reduce_scatter_memory[key]["used"].append(True) return_idx = 0 return return_idx - else: # if not empty - for index, used in enumerate(gpc.config.reduce_scatter_memory[key]['used']): - if used == False: - gpc.config.reduce_scatter_memory[key]['used'][index] = True + else: # if not empty + for index, used in enumerate(gpc.config.reduce_scatter_memory[key]["used"]): + if used is False: + gpc.config.reduce_scatter_memory[key]["used"][index] = True return_idx = index return return_idx # if the memory pool is all used - length = len(gpc.config.reduce_scatter_memory[key]['data']) - gpc.config.reduce_scatter_memory[key]['data'].append(torch.zeros(key, - dtype=gpc.config.model.get("dtype", torch.half), - device=get_current_device()).contiguous()) - gpc.config.reduce_scatter_memory[key]['used'].append(True) + length = len(gpc.config.reduce_scatter_memory[key]["data"]) + gpc.config.reduce_scatter_memory[key]["data"].append( + torch.zeros(key, dtype=gpc.config.model.get("dtype", torch.half), device=get_current_device()).contiguous() + ) + gpc.config.reduce_scatter_memory[key]["used"].append(True) return_idx = length return return_idx + def release_reduce_scatter_memory_pool(size, index): - gpc.config.reduce_scatter_memory[size]['used'][index] = False \ No newline at end of file + gpc.config.reduce_scatter_memory[size]["used"][index] = False diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 2816da0e..5205ba5b 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -38,7 +38,6 @@ from internlm.model.linear import ( CoarseGrainedFSTPAllGatherSyncHandler, FeedForward, - FSTPAllGatherSyncHandler, RewardModelLinear, ScaleColumnParallelLinear, ) @@ -108,7 +107,7 @@ def initialize_model(): # if fsdp enabled, wrap the model model = wrap_FSDP_model(model) - + gpc.config.fstp_handler = None if gpc.config.parallel["tensor"]["mode"] == "fstp" and gpc.config.parallel["tensor"]["overlap"] == True: @@ -116,40 +115,53 @@ def initialize_model(): # handler = FSTPAllGatherSyncHandler(model, gpc.get_group(ParallelMode.TENSOR)) handler._register_sync_parameters_hook() gpc.config.fstp_handler = handler - + # allocate memory pool - block_memory = {} # containing two groups of block weight + block_memory = {} # containing two groups of block weight hidden_size = gpc.config.HIDDEN_SIZE mlp_ratio = gpc.config.MLP_RATIO mlp_hidden_size = int(hidden_size * mlp_ratio) mlp_hidden_size = 256 * ((mlp_hidden_size + 256 - 1) // 256) world_size = gpc.get_world_size(ParallelMode.TENSOR) - size_key = [(3 * hidden_size // world_size, hidden_size), (mlp_hidden_size // world_size, hidden_size), (hidden_size // world_size, mlp_hidden_size), (hidden_size // world_size, hidden_size)] - module_name = ['Wqkv', 'out_proj', 'w1', 'w2', 'w3'] + size_key = [ + (3 * hidden_size // world_size, hidden_size), + (mlp_hidden_size // world_size, hidden_size), + (hidden_size // world_size, mlp_hidden_size), + (hidden_size // world_size, hidden_size), + ] + module_name = ["Wqkv", "out_proj", "w1", "w2", "w3"] for i in range(2): weight = {} for name in module_name: - if name == 'Wqkv': - weight[name] = torch.zeros((3 * 
hidden_size, hidden_size), - dtype=gpc.config.model.get("dtype", torch.half), - device=get_current_device()).contiguous() - elif name == 'out_proj': - weight[name] = torch.zeros((hidden_size, hidden_size), - dtype=gpc.config.model.get("dtype", torch.half), - device=get_current_device()).contiguous() - elif name == 'w1' or name == 'w2': - weight[name] = torch.zeros((mlp_hidden_size, hidden_size), - dtype=gpc.config.model.get("dtype", torch.half), - device=get_current_device()).contiguous() + if name == "Wqkv": + weight[name] = torch.zeros( + (3 * hidden_size, hidden_size), + dtype=gpc.config.model.get("dtype", torch.half), + device=get_current_device(), + ).contiguous() + elif name == "out_proj": + weight[name] = torch.zeros( + (hidden_size, hidden_size), + dtype=gpc.config.model.get("dtype", torch.half), + device=get_current_device(), + ).contiguous() + elif name == "w1" or name == "w2": + weight[name] = torch.zeros( + (mlp_hidden_size, hidden_size), + dtype=gpc.config.model.get("dtype", torch.half), + device=get_current_device(), + ).contiguous() else: - weight[name] = torch.zeros((hidden_size, mlp_hidden_size), - dtype=gpc.config.model.get("dtype", torch.half), - device=get_current_device()).contiguous() + weight[name] = torch.zeros( + (hidden_size, mlp_hidden_size), + dtype=gpc.config.model.get("dtype", torch.half), + device=get_current_device(), + ).contiguous() block_memory[i] = weight reduce_scatter_memory = {} for key in size_key: - reduce_scatter_memory[key] = {'data': [], 'used': []} - + reduce_scatter_memory[key] = {"data": [], "used": []} + gpc.config.block_memory = block_memory gpc.config.reduce_scatter_memory = reduce_scatter_memory diff --git a/train.py b/train.py index 41ab070d..19a104ba 100644 --- a/train.py +++ b/train.py @@ -296,7 +296,7 @@ def main(args): memory_profiler.step() prof.step() - + if gpc.config.fstp_handler is not None: gpc.config.fstp_handler.zero_const_pool = {} gpc.config.fstp_handler.reduce_scatter_memory = {} From 95488d8e8f1737947c4f9a00f888d9f57e6ea606 Mon Sep 17 00:00:00 2001 From: "chenxun.p" Date: Fri, 20 Oct 2023 15:58:06 +0800 Subject: [PATCH 036/153] update optimizer accumulate grad impl when fstp --- .../core/scheduler/no_pipeline_scheduler.py | 1 - .../solver/optimizer/hybrid_zero_optim.py | 139 +++++++----------- 2 files changed, 54 insertions(+), 86 deletions(-) diff --git a/internlm/core/scheduler/no_pipeline_scheduler.py b/internlm/core/scheduler/no_pipeline_scheduler.py index f0caf05c..56661d8c 100644 --- a/internlm/core/scheduler/no_pipeline_scheduler.py +++ b/internlm/core/scheduler/no_pipeline_scheduler.py @@ -194,7 +194,6 @@ def forward_backward_step( _output, _loss, _moe_loss = self._train_one_batch( _data, _label, engine, forward_only, return_loss, self._grad_accum_size ) - engine.optimizer.reset_reduce_bucket() if return_loss: loss += _loss diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index 96a54c01..2c14c65d 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -2,6 +2,7 @@ # -*- encoding: utf-8 -*- import math +from typing import Optional, List from functools import partial import torch @@ -40,8 +41,20 @@ inf = math.inf logger = get_logger(__file__) + def print_memory(msg): - print(msg, " rank = ", gpc.get_global_rank(), " memory allocated: ", torch.cuda.memory_allocated() / 1024 / 1024 / 1024, " reverved memory: ", torch.cuda.memory_reserved() / 1024 / 1024 / 1024, " max memory: ", 
torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024, flush=True)
+    print(
+        msg,
+        " rank = ",
+        gpc.get_global_rank(),
+        " memory allocated: ",
+        torch.cuda.memory_allocated() / 1024 / 1024 / 1024,
+        " reserved memory: ",
+        torch.cuda.memory_reserved() / 1024 / 1024 / 1024,
+        " max memory: ",
+        torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024,
+        flush=True,
+    )
     print("===========================================")


@@ -69,7 +82,7 @@ def __init__(
         backoff_factor = grad_scal_cfg.backoff_factor
         hysteresis = grad_scal_cfg.hysteresis
         max_scale = grad_scal_cfg.max_scale
-
+
         if gpc.config.parallel["tensor"]["mode"] == "fstp" and gpc.config.parallel["tensor"]["overlap"] == True:
             self._fstp_handler = gpc.config.fstp_handler

@@ -90,8 +103,8 @@ def __init__(
         # it will not manage the tensors used by mixed precision training
         self._param_store = ParameterStore(ParallelMode.ZERO1)
         self._grad_store = GradientStore(ParallelMode.DATA)
-        self._bucket_store = []
-        self._bucket_store_2 = []
+        self._bucket_store: List[BucketStore] = []
+        self._accum_grad_buckets: List[BucketStore] = []
         self._bucket_in_progress = []

         # fp16 and fp32 params for mixed precision training
@@ -160,7 +173,7 @@ def __init__(
             # TODO _broadcast_parallel_mode is not only used in broadcast, maybe can change its name
             self._broadcast_parallel_mode.append(zero_mode)
             self._bucket_store.append(BucketStore(group_id, param_group["dp_mode"]))
-            self._bucket_store_2.append(BucketStore(group_id, param_group["dp_mode"]))
+            self._accum_grad_buckets.append(BucketStore(group_id, param_group["dp_mode"]))

             # assign parameters to ranks the params in the list are sorted
             params_per_rank, no_params_ranks = self._partition_param_list(group_id, param_group)
@@ -306,9 +319,9 @@ def _define_and_attach(param, reduce_rank=None):
                 param=param,
                 reduce_rank=reduce_rank,
             )
-
+
             reduce_scatter_checker = partial(
-                self._wait_reduce_scatter_and_accumulate_grad,
+                self._wait_reduce_scatter_and_accumulate_grads,
                 param=param,
                 reduce_rank=reduce_rank,
             )
@@ -317,7 +330,7 @@ def _define_and_attach(param, reduce_rank=None):
             # NOT IMPORTANT BUT GOOD TO KNOW:
             # args here is not grad, but allow_unreacable and accumulate_grad
             def reduce_grad_hook(*args):  # pylint: disable=W0613
-                if gpc.config.fstp_handler is not None:
+                if self._fstp_handler is not None:
                     reduce_scatter_checker()

                 if self.skip_grad_reduce is False:
@@ -341,84 +354,36 @@ def belongs_to_current_rank(self, param) -> bool:
         group_id = getattr(param, "group_id")
         return tensor_rank == gpc.get_local_rank(self._broadcast_parallel_mode[group_id])

-    def reset_reduce_bucket(self) -> None:
-        for bucket in self._bucket_store_2:
-            for rank, params in bucket._params.items():
-                for _param in params:
-                    if not hasattr(_param, "_fstp_reduce_scatter_str"):
-                        continue
-
-                    key = getattr(_param, "_fstp_reduce_scatter_str")
-                    comm_handle, _grad = self._fstp_handler.reduce_scatter_handlers[key]
-                    comm_handle.wait()
-                    _param.grad.add_(_grad)
-                    # self._fstp_handler.reduce_scatter_handlers[key] = None
-                    # del _grad
-                    release_reduce_scatter_memory_pool(size=tuple(_grad.size()),index=_grad.index)
-                    del self._fstp_handler.reduce_scatter_handlers[key]
-                    self._fstp_handler.reduce_scatter_handlers[key] = None
-                    assert key in self._fstp_handler.reduce_scatter_handlers
-                    # if not hasattr(_param, "_fstp_all_reduce_str"):
-                    #     continue
-
-                    # key = getattr(_param, "_fstp_all_reduce_str")
-                    # comm_handle, _grad = self._fstp_handler.all_reduce_handlers[key]
-                    # comm_handle.wait()
-                    # with torch.no_grad():
-                    #     _grad = split_forward_gather_backward(_grad, ParallelMode.TENSOR, dim=0)
-                    # _param.grad.add_(_grad)
-                    # # self._fstp_handler.reduce_scatter_handlers[key] = None
-                    # del _grad
-                    # del self._fstp_handler.all_reduce_handlers[key]
-                    # self._fstp_handler.all_reduce_handlers[key] = None
-                    # assert key in self._fstp_handler.all_reduce_handlers
-
-            bucket.reset_by_rank(rank)
-
-    def _wait_reduce_scatter_and_accumulate_grad(self, param, reduce_rank=None):
+    def _accum_grads_store_in_bucket(self, bucket: BucketStore, reduce_rank: Optional[int] = None) -> None:
+        for _param in bucket.get_param(reduce_rank):
+            if not hasattr(_param, "_fstp_reduce_scatter_str"):
+                continue
+
+            # wait and accumulate gradient.
+            _key = getattr(_param, "_fstp_reduce_scatter_str")
+            _comm_handle, _grad = self._fstp_handler.reduce_scatter_handlers[_key]
+            _comm_handle.wait()
+            _param.grad.add_(_grad)
+
+            # release CUDA memory.
+            self._fstp_handler.reduce_scatter_handlers[_key] = None
+            _grad = None
+
+        bucket.reset_by_rank(reduce_rank)
+
+    def _wait_reduce_scatter_and_accumulate_grads(self, param, reduce_rank: Optional[int] = None):
         param_size = param.numel()
+        group_id = getattr(param, "group_id")
+        current_bucket = self._accum_grad_buckets[group_id]
+
         # check if the bucket is full
         # if full, will reduce the grads already in the bucket
         # after reduction, the bucket will be empty
-        group_id = getattr(param, "group_id")
-        current_bucket = self._bucket_store_2[group_id]
-
-        if current_bucket.num_elements_in_bucket(reduce_rank) >= 512 * 1024 * 1024:
-            # wait reduce scatter communication
-            params = current_bucket.get_param(reduce_rank)
-            for _param in params:
-                if not hasattr(_param, "_fstp_reduce_scatter_str"):
-                    continue
+        if current_bucket.num_elements_in_bucket(reduce_rank) >= self._reduce_bucket_size:
            self._accum_grads_store_in_bucket(current_bucket, reduce_rank)

-                key = getattr(_param, "_fstp_reduce_scatter_str")
-                comm_handle, _grad = self._fstp_handler.reduce_scatter_handlers[key]
-                comm_handle.wait()
-                _param.grad.add_(_grad)
-                # self._fstp_handler.reduce_scatter_handlers[key] = None
-                # del _grad
-                release_reduce_scatter_memory_pool(size=tuple(_grad.size()),index=_grad.index)
-                del self._fstp_handler.reduce_scatter_handlers[key]
-                self._fstp_handler.reduce_scatter_handlers[key] = None
-                assert key in self._fstp_handler.reduce_scatter_handlers
-
-                # if not hasattr(_param, "_fstp_all_reduce_str"):
-                #     continue
-
-                # key = getattr(_param, "_fstp_all_reduce_str")
-                # comm_handle, _grad = self._fstp_handler.all_reduce_handlers[key]
-                # comm_handle.wait()
-                # with torch.no_grad():
-                #     _grad = split_forward_gather_backward(_grad, ParallelMode.TENSOR, dim=0)
-                # _param.grad.add_(_grad)
-                # # self._fstp_handler.reduce_scatter_handlers[key] = None
-                # del _grad
-                # del self._fstp_handler.all_reduce_handlers[key]
-                # self._fstp_handler.all_reduce_handlers[key] = None
-                # assert key in self._fstp_handler.all_reduce_handlers
-
-            current_bucket.reset_by_rank(reduce_rank)
-
+        # otherwise, add the parameter into the bucket.
         current_bucket.add_num_elements_in_bucket(param_size, reduce_rank)
         current_bucket.add_param(param, reduce_rank)
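`_wait_reduce_scatter_and_accumulate_grads` defers synchronization: parameters queue up until the bucket crosses `self._reduce_bucket_size`, and only then does `_accum_grads_store_in_bucket` wait on each pending reduce-scatter handle and accumulate the gradient shard. A simplified, standalone sketch of that flush discipline (hypothetical bookkeeping, not the optimizer's exact code):

```python
# Sketch of the threshold flush: queue parameters until the bucket is "full",
# then drain every pending async reduce-scatter and accumulate its shard.
from typing import Dict, List, Tuple

from torch import nn


class AccumGradBucket:
    def __init__(self, flush_threshold: int):
        self.flush_threshold = flush_threshold  # elements, e.g. 512 * 1024 * 1024
        self.params: List[nn.Parameter] = []
        self.num_elements = 0

    def add(self, param: nn.Parameter, pending: Dict[str, Tuple]) -> None:
        # pending maps a param key to (async work handle, reduce-scattered grad)
        if self.num_elements >= self.flush_threshold:
            self.flush(pending)
        self.params.append(param)
        self.num_elements += param.numel()

    def flush(self, pending: Dict[str, Tuple]) -> None:
        for p in self.params:
            handle, grad_shard = pending.pop(p._fstp_reduce_scatter_str)
            handle.wait()            # the communication must finish first
            p.grad.add_(grad_shard)  # accumulate this rank's shard
        self.params.clear()
        self.num_elements = 0
```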
@@ -646,6 +611,10 @@ def step(self, closure=None):
         for group_id in range(self.num_param_groups):
             self._reduce_grads_stored_in_bucket(self._bucket_store[group_id], reduce_rank=None, last_bucket=True)

+        # we need to accumulate gradients left in the gradient accumulation bucket
+        for group_id in range(self.num_param_groups):
+            self._accum_grads_store_in_bucket(self._accum_grad_buckets[group_id], reduce_rank=None)
+
         # compute norm for gradients in the before bucket
         groups_norms = []
         for group_id in range(self.num_param_groups):
@@ -685,16 +654,16 @@ def step(self, closure=None):
         timer("sync_grad").start()
         self._sync_grad()
         timer("sync_grad").stop()
-
+
         print_memory("No 4")
-
+
         try:
-            res = self._step(closure=closure, norms=total_norms)
+            res = self._step(closure=closure, norms=total_norms)
         except torch.cuda.OutOfMemoryError as e:
             print(e, flush=True)
             print(torch.cuda.memory_summary(), flush=True)
             torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle")
-
+
         return res

     def _step(self, closure=None, norms=None):
@@ -822,7 +791,7 @@ def _step(self, closure=None, norms=None):
             torch.cuda.synchronize()
             with torch.cuda.stream(self._comm_bcast_stream):
                 self.broadcast_params()
-
+
         timer("step").stop()

         # update gradients may not be needed here, because the sync_params function is used in initialization,

From d91a5d9d9ec8c7b0444b533a6b44be4430c7c199 Mon Sep 17 00:00:00 2001
From: huangting4201 <1538303371@qq.com>
Date: Fri, 20 Oct 2023 15:59:40 +0800
Subject: [PATCH 037/153] feat(initialize/launch.py): refactor config for fstp

---
 configs/7B_sft.py                             | 10 ++---
 internlm/initialize/launch.py                 | 23 ++++++----
 internlm/model/modeling_internlm.py           | 14 +++---
 internlm/model/multi_head_attention.py        |  8 ++--
 .../solver/optimizer/hybrid_zero_optim.py     | 45 ++++++++++++-------
 internlm/train/training_internlm.py           |  3 +-
 internlm/utils/evaluation.py                  |  4 +-
 7 files changed, 63 insertions(+), 44 deletions(-)

diff --git a/configs/7B_sft.py b/configs/7B_sft.py
index 6ea8b96e..c51c8129 100644
--- a/configs/7B_sft.py
+++ b/configs/7B_sft.py
@@ -152,19 +152,19 @@
 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
 tensor parallel (dict):
     1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-        the sequence_parallel should be True.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+        defaults to 'none', which means sequence parallelism is disabled.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using the
+        'intern' sp mode, defaults to False.
 pipeline parallel (dict):
     1. size: int, the size of pipeline parallel.
     2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
        defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
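For reference, the schema change documented above amounts to the following migration (an illustrative sketch; the bare-int form and its normalization come from the launch.py hunk below):

```python
# Old-style vs refactored tensor parallel config (values are examples).
old = dict(tensor=dict(size=8, mode="fstp", overlap=True))         # pre-refactor
new = dict(tensor=dict(size=8, sp="intern", intern_overlap=True))  # post-refactor

# A bare int is still accepted and is expanded to the dict form:
tensor = 8
tensor = dict(size=tensor, sp="none", intern_overlap=False)
```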
""" parallel = dict( zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, mode="fstp", overlap=True), + tensor=dict(size=8, sp="intern", intern_overlap=True), pipeline=dict(size=1, interleaved_overlap=True), - sequence_parallel=True, ) cudnn_deterministic = False diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py index 80611fee..0e74f76b 100644 --- a/internlm/initialize/launch.py +++ b/internlm/initialize/launch.py @@ -306,15 +306,20 @@ def args_sanity_check(): ), "sequence parallel does not support use_flash_attn=False" if isinstance(gpc.config.parallel["tensor"], int): - gpc.config.parallel["tensor"] = dict(size=gpc.config.parallel["tensor"], mode="origin_tp") - - if gpc.config.parallel["tensor"].get("mode", None) is None: - gpc.config.parallel["tensor"]["mode"] = "origin_tp" - - if gpc.config.parallel["tensor"].get("mode", None) == "fstp": - assert ( - gpc.config.parallel.sequence_parallel is True - ), "when the tp_mode is fstp, the sequence_parallel should be True." + gpc.config.parallel["tensor"] = dict(size=gpc.config.parallel["tensor"], sp="none", intern_overlap=False) + if gpc.config.parallel["tensor"].get("sp", None) is None: + gpc.config.parallel["tensor"]["sp"] = "none" + if gpc.config.parallel["tensor"].get("intern_overlap", None) is None: + gpc.config.parallel["tensor"]["intern_overlap"] = False + assert gpc.config.parallel["tensor"].get("sp", None) in [ + "none", + "megatron", + "flash-attn", + "intern", + ], "invalid sp mode, only ['none', 'megatron', 'flash-attn', 'intern'] is supported" + # adapt to old version's sequence parallel config + if gpc.config.parallel["tensor"].get("sp", None) in ["megatron", "flash-attn", "intern"]: + gpc.config.parallel.sequence_parallel = True # currently only interleaved pipeline scheduler with overlap can guarantee loss accuracy if hasattr(gpc.config.model, "num_chunks") and gpc.config.model.num_chunks > 1: diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index 0df2b60e..9b6420d4 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -77,7 +77,7 @@ def __init__( use_scaled_init: bool = True, use_swiglu: bool = True, use_flash_attn: bool = True, - tp_mode: str = "origin_tp", + sp_mode: str = "none", ): super().__init__() self.checkpoint = checkpoint @@ -102,7 +102,7 @@ def __init__( use_flash_attn=use_flash_attn, device=device, dtype=dtype, - tp_mode=tp_mode, + sp_mode=sp_mode, ) self.dropout1 = nn.Dropout(drop_rate) @@ -114,7 +114,7 @@ def __init__( self.norm2 = nn.LayerNorm(hidden_size, eps=layer_norm_epsilon) if use_swiglu: - mlp_cls = FeedForward if tp_mode == "origin_tp" else FSTPFeedForward + mlp_cls = FSTPFeedForward if sp_mode == "intern" else FeedForward self.mlp = mlp_cls( hidden_size, int(hidden_size * mlp_ratio), @@ -297,7 +297,7 @@ def __init__( super().__init__() checkpoint_layer_num = int(num_layers * checkpoint) - self.tp_mode = gpc.config.parallel["tensor"]["mode"] + self.sp_mode = gpc.config.parallel["tensor"]["sp"] if is_reward: head_cls = RewardModelLinear @@ -343,7 +343,7 @@ def __init__( use_scaled_init=use_scaled_init, use_swiglu=use_swiglu, use_flash_attn=use_flash_attn, - tp_mode=self.tp_mode, + sp_mode=self.sp_mode, ) for lid in range(num_layers) ] @@ -389,8 +389,8 @@ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N assert len(indexes) == 1 # The indexes are used to indicate the actual position IDs of each token in the packed input. 
         indexes = indexes[0]
-        # if the tensor parallel mode is 'fstp', the indexes should also be split in sequence dimension.
-        if gpc.config.parallel.sequence_parallel and self.tp_mode == "fstp":
+        # if the sequence parallel mode is 'intern', the indexes should also be split in sequence dimension.
+        if gpc.config.parallel.sequence_parallel and self.sp_mode == "intern":
             indexes = split_forward_gather_backward(indexes, ParallelMode.TENSOR, dim=0)

         max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() if cu_seqlens is not None else None
diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py
index 8dcd3f96..cb0efb85 100644
--- a/internlm/model/multi_head_attention.py
+++ b/internlm/model/multi_head_attention.py
@@ -175,7 +175,7 @@ def __init__(
         use_flash_attn: bool = True,
         device: Optional[torch.device] = None,
         dtype: Optional[torch.dtype] = None,
-        tp_mode: str = "origin_tp",
+        sp_mode: str = "none",
     ) -> None:
         factory_kwargs = {"device": device, "dtype": dtype}
         super().__init__()
@@ -203,7 +203,7 @@ def __init__(
             self.rotary_emb = RotaryEmbedding(self.rotary_emb_dim, scale_base=rotary_emb_scale_base, device=device)

         # notice here should change bias=True
-        Wqkv_cls = ColumnParallelLinearTorch if tp_mode == "origin_tp" else FSTPLinear
+        Wqkv_cls = FSTPLinear if sp_mode == "intern" else ColumnParallelLinearTorch
         self.Wqkv = Wqkv_cls(
             embed_dim,
             3 * embed_dim,
@@ -219,12 +219,12 @@ def __init__(
         self.inner_cross_attn = inner_cross_attn_cls(
             causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout
         )
-        if tp_mode == "fstp":
+        if sp_mode == "intern":
             self.inner_attn = DistributedAttention(self.inner_attn, sequence_process_group=process_group)
             self.inner_cross_attn = DistributedAttention(self.inner_cross_attn, sequence_process_group=process_group)

         # output projection always have the bias (for now)
-        out_proj_cls = RowParallelLinearTorch if tp_mode == "origin_tp" else FSTPLinear
+        out_proj_cls = FSTPLinear if sp_mode == "intern" else RowParallelLinearTorch
         self.out_proj = out_proj_cls(
             embed_dim,
             embed_dim,
diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py
index 96a54c01..a4b31737 100644
--- a/internlm/solver/optimizer/hybrid_zero_optim.py
+++ b/internlm/solver/optimizer/hybrid_zero_optim.py
@@ -10,7 +10,10 @@

 from internlm.core.context import Config, ParallelMode
 from internlm.core.context import global_context as gpc
-from internlm.model.utils import split_forward_gather_backward, release_reduce_scatter_memory_pool
+from internlm.model.utils import (
+    release_reduce_scatter_memory_pool,
+    split_forward_gather_backward,
+)
 from internlm.monitor import send_alert_message
 from internlm.solver.optimizer.store import (
     BucketStore,
@@ -40,8 +43,20 @@

 inf = math.inf
 logger = get_logger(__file__)

+
 def print_memory(msg):
-    print(msg, " rank = ", gpc.get_global_rank(), " memory allocated: ", torch.cuda.memory_allocated() / 1024 / 1024 / 1024, " reverved memory: ", torch.cuda.memory_reserved() / 1024 / 1024 / 1024, " max memory: ", torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024, flush=True)
+    print(
+        msg,
+        " rank = ",
+        gpc.get_global_rank(),
+        " memory allocated: ",
+        torch.cuda.memory_allocated() / 1024 / 1024 / 1024,
+        " reserved memory: ",
+        torch.cuda.memory_reserved() / 1024 / 1024 / 1024,
+        " max memory: ",
+        torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024,
+        flush=True,
+    )
     print("===========================================")


@@ -69,7 +82,7 @@ def __init__(
        backoff_factor =
grad_scal_cfg.backoff_factor hysteresis = grad_scal_cfg.hysteresis max_scale = grad_scal_cfg.max_scale - - if gpc.config.parallel["tensor"]["mode"] == "fstp" and gpc.config.parallel["tensor"]["overlap"] == True: + + if gpc.config.parallel["tensor"]["sp"] == "intern" and gpc.config.parallel["tensor"]["intern_overlap"] is True: self._fstp_handler = gpc.config.fstp_handler # Zero related args @@ -306,7 +321,7 @@ def _define_and_attach(param, reduce_rank=None): param=param, reduce_rank=reduce_rank, ) - + reduce_scatter_checker = partial( self._wait_reduce_scatter_and_accumulate_grad, param=param, @@ -354,7 +369,7 @@ def reset_reduce_bucket(self) -> None: _param.grad.add_(_grad) # self._fstp_handler.reduce_scatter_handlers[key] = None # del _grad - release_reduce_scatter_memory_pool(size=tuple(_grad.size()),index=_grad.index) + release_reduce_scatter_memory_pool(size=tuple(_grad.size()), index=_grad.index) del self._fstp_handler.reduce_scatter_handlers[key] self._fstp_handler.reduce_scatter_handlers[key] = None assert key in self._fstp_handler.reduce_scatter_handlers @@ -374,7 +389,7 @@ def reset_reduce_bucket(self) -> None: # assert key in self._fstp_handler.all_reduce_handlers bucket.reset_by_rank(rank) - + def _wait_reduce_scatter_and_accumulate_grad(self, param, reduce_rank=None): param_size = param.numel() @@ -397,11 +412,11 @@ def _wait_reduce_scatter_and_accumulate_grad(self, param, reduce_rank=None): _param.grad.add_(_grad) # self._fstp_handler.reduce_scatter_handlers[key] = None # del _grad - release_reduce_scatter_memory_pool(size=tuple(_grad.size()),index=_grad.index) + release_reduce_scatter_memory_pool(size=tuple(_grad.size()), index=_grad.index) del self._fstp_handler.reduce_scatter_handlers[key] self._fstp_handler.reduce_scatter_handlers[key] = None assert key in self._fstp_handler.reduce_scatter_handlers - + # if not hasattr(_param, "_fstp_all_reduce_str"): # continue @@ -418,7 +433,7 @@ def _wait_reduce_scatter_and_accumulate_grad(self, param, reduce_rank=None): # assert key in self._fstp_handler.all_reduce_handlers current_bucket.reset_by_rank(reduce_rank) - + current_bucket.add_num_elements_in_bucket(param_size, reduce_rank) current_bucket.add_param(param, reduce_rank) @@ -685,16 +700,16 @@ def step(self, closure=None): timer("sync_grad").start() self._sync_grad() timer("sync_grad").stop() - + print_memory("No 4") - + try: - res = self._step(closure=closure, norms=total_norms) + res = self._step(closure=closure, norms=total_norms) except torch.cuda.OutOfMemoryError as e: print(e, flush=True) print(torch.cuda.memory_summary(), flush=True) torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") - + return res def _step(self, closure=None, norms=None): @@ -822,7 +837,7 @@ def _step(self, closure=None, norms=None): torch.cuda.synchronize() with torch.cuda.stream(self._comm_bcast_stream): self.broadcast_params() - + timer("step").stop() # update gradients may not be needed here, because the sync_params function is used in initialization, diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 5205ba5b..53996b38 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -110,9 +110,8 @@ def initialize_model(): gpc.config.fstp_handler = None - if gpc.config.parallel["tensor"]["mode"] == "fstp" and gpc.config.parallel["tensor"]["overlap"] == True: + if gpc.config.parallel["tensor"]["sp"] == "intern" and gpc.config.parallel["tensor"]["intern_overlap"] is True: handler = 
CoarseGrainedFSTPAllGatherSyncHandler(model, gpc.get_group(ParallelMode.TENSOR)) - # handler = FSTPAllGatherSyncHandler(model, gpc.get_group(ParallelMode.TENSOR)) handler._register_sync_parameters_hook() gpc.config.fstp_handler = handler diff --git a/internlm/utils/evaluation.py b/internlm/utils/evaluation.py index 968a1db1..f708fa78 100644 --- a/internlm/utils/evaluation.py +++ b/internlm/utils/evaluation.py @@ -54,7 +54,7 @@ def switch_evaluation_pipeline_scheduler(trainer, num_microbatches, tensor_shape def switch_sequence_parallel_mode(): prev_mode = gpc.config.parallel.sequence_parallel try: - if gpc.config.parallel["tensor"]["mode"] == "fstp": + if gpc.config.parallel["tensor"]["sp"] == "intern": gpc.config.parallel.sequence_parallel = True else: gpc.config.parallel.sequence_parallel = False @@ -106,7 +106,7 @@ def evaluate_on_val_dls( total_val_bsz = len(batch[1]) assert total_val_bsz % data_cfg.micro_bsz == 0 num_microbatches = total_val_bsz // data_cfg.micro_bsz - if gpc.config.parallel["tensor"]["mode"] == "fstp": + if gpc.config.parallel["tensor"]["sp"] == "intern": sequence_world_size = gpc.get_world_size(ParallelMode.TENSOR) tensor_shape = torch.Size( [ From eac382ad0a0ed6075b31fbdb8a56d42239fa9f4f Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Fri, 20 Oct 2023 16:22:29 +0800 Subject: [PATCH 038/153] feat(optimizer/hybrid_zero_optim.py): fix lint error --- internlm/model/utils.py | 5 ++--- internlm/solver/optimizer/hybrid_zero_optim.py | 5 +---- internlm/solver/optimizer/store.py | 2 +- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/internlm/model/utils.py b/internlm/model/utils.py index b9c7c03a..19531e4a 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -1,12 +1,12 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- -from typing import Any, Optional, Union +from typing import Optional import fused_dense_lib as fused_dense_cuda import torch import torch.nn.functional as F -from flash_attn.utils.distributed import all_reduce_raw # , reduce_scatter_raw +from flash_attn.utils.distributed import all_reduce_raw from torch import Tensor from torch.cuda.amp import custom_bwd, custom_fwd from torch.distributed import ProcessGroup @@ -397,7 +397,6 @@ def backward(ctx, grad_output, *args): grad_input = grad_input.contiguous() process_group = ctx.process_group all_gather_handler = ctx.all_gather_handler - module = ctx.module block_index = ctx.block_index module_name = ctx.module_name diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index d5fec315..cb8aa659 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -11,10 +11,7 @@ from internlm.core.context import Config, ParallelMode from internlm.core.context import global_context as gpc -from internlm.model.utils import ( - release_reduce_scatter_memory_pool, - split_forward_gather_backward, -) +from internlm.model.utils import release_reduce_scatter_memory_pool from internlm.monitor import send_alert_message from internlm.solver.optimizer.store import ( BucketStore, diff --git a/internlm/solver/optimizer/store.py b/internlm/solver/optimizer/store.py index 228045ed..f486ccec 100644 --- a/internlm/solver/optimizer/store.py +++ b/internlm/solver/optimizer/store.py @@ -45,7 +45,7 @@ def __init__(self, group_id, dp_parallel_mode): def num_elements_in_bucket(self, reduce_rank: int = None): return self._num_elements_in_bucket[reduce_rank] - + def 
num_params_in_bucket(self, reduce_rank: int = None): return len(self._params[reduce_rank]) From 2acf9b817f6888e73c3606ddc6549f8c95694b27 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Fri, 20 Oct 2023 16:25:08 +0800 Subject: [PATCH 039/153] feat(utils/gputest.py): fix lint error --- internlm/utils/gputest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internlm/utils/gputest.py b/internlm/utils/gputest.py index 52d96385..bf4cf1c9 100644 --- a/internlm/utils/gputest.py +++ b/internlm/utils/gputest.py @@ -45,7 +45,7 @@ def empty_cache_and_diag(batch_count, interval=50): # # import time # # time.sleep(10) # print(e, "rank = ", gpc.get_global_rank(), flush=True) - # torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") + # torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") # do empty_cache after the bench torch.cuda.empty_cache() From dcd89ed30466b7552f79077af5049e3581d46270 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Fri, 20 Oct 2023 17:50:56 +0800 Subject: [PATCH 040/153] refactor linear --- configs/7B_sft.py | 2 +- internlm/model/linear.py | 350 ++++++++---------- internlm/model/modeling_internlm.py | 24 +- internlm/model/multi_head_attention.py | 12 +- internlm/model/utils.py | 206 +++++++++-- .../solver/optimizer/hybrid_zero_optim.py | 56 +-- internlm/train/training_internlm.py | 3 +- train.py | 4 +- 8 files changed, 357 insertions(+), 300 deletions(-) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 6ea8b96e..0058e04f 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -162,7 +162,7 @@ """ parallel = dict( zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, mode="fstp", overlap=True), + tensor=dict(size=8, sp="intern", intern_overlap=True), pipeline=dict(size=1, interleaved_overlap=True), sequence_parallel=True, ) diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 4f05cd32..8f57a02a 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -19,25 +19,26 @@ all_gather_raw_memory_pool, fstp_fused_dense_func, fused_dense_func_torch, + megatron_fused_dense_func_torch, ) -class ScaleColumnParallelLinear(nn.Linear): +class BaseScaleColumnParallelLinear(nn.Linear): """ - ScaleColumnParallelLinear. - - Args: - in_features (int): size of each input sample - out_features (int): size of each output sample - process_group (Optional[torch.distributed.ProcessGroup]): The group of the current device for `parallel_mode`. - bias (bool): Whether the bias is needed for linears. True by default. But it is typically set to False - in the config. - sequence_parallel (bool): If sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism: - we do an all_gather of x before doing the matmul. - If not, then the input is already gathered. - device (Optional[Union[str, torch.device]]): The device will be used. - dtype (Optional[torch.dtype]): The type of data. - weight_scale (int): For training stability. 1 by default. + Base class for ScaleColumnParallelLinear. + + Args: + in_features (int): size of each input sample + out_features (int): size of each output sample + process_group (Optional[torch.distributed.ProcessGroup]): The group of the current device for `parallel_mode`. + bias (bool): Whether the bias is needed for linears. True by default. But it is typically set to False + in the config. 
+ sequence_parallel (bool): If sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism: + we do an all_gather of x before doing the matmul. + If not, then the input is already gathered. + device (Optional[Union[str, torch.device]]): The device will be used. + dtype (Optional[torch.dtype]): The type of data. + weight_scale (int): For training stability. 1 by default. """ def __init__( @@ -57,6 +58,10 @@ def __init__( self.process_group = process_group self.weight_scale = weight_scale +class ScaleColumnParallelLinear(BaseScaleColumnParallelLinear): + """ + ScaleColumnParallelLinear in flash implementation. + """ def forward(self, input, gather_dim=0): # pylint: disable=W0622 # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism: # we do an all_gather of x before doing the matmul. @@ -74,6 +79,27 @@ def forward(self, input, gather_dim=0): # pylint: disable=W0622 gather_dim=gather_dim, ) +class MegatronScaleColumnParallelLinear(BaseScaleColumnParallelLinear): + """ + ScaleColumnParallelLinear in megatron implementation. + """ + + def forward(self, input, gather_dim=0): # pylint: disable=W0622 + # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism: + # we do an all_gather of x before doing the matmul. + # If not, then the input is already gathered. + if self.weight_scale != 1: + weight = self.weight * self.weight_scale + (1 - self.weight_scale) * self.weight.detach() + else: + weight = self.weight + return megatron_fused_dense_func_torch( + input, + weight, + self.bias, + process_group=self.process_group, + sequence_parallel=gpc.config.parallel.sequence_parallel, + gather_dim=gather_dim, + ) class RewardModelLinear(ScaleColumnParallelLinear): """ @@ -129,7 +155,6 @@ def forward(self, x, gather_dim=0): # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism: # we do an all_gather of x before doing the matmul. # If not, then the input is already gathered. - return fused_dense_func_torch( x, self.weight, @@ -139,6 +164,19 @@ def forward(self, x, gather_dim=0): gather_dim=gather_dim, ) +class MegatronColumnParallelLinearTorch(ColumnParallelLinear): + def forward(self, x, gather_dim=0): + # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism: + # we do an all_gather of x before doing the matmul. + # If not, then the input is already gathered. + return megatron_fused_dense_func_torch( + x, + self.weight, + self.bias, + process_group=self.process_group, + sequence_parallel=self.sequence_parallel, + gather_dim=gather_dim, + ) class RowParallelLinearTorch(RowParallelLinear): def forward(self, x): @@ -150,10 +188,20 @@ def forward(self, x): reduce_fn = reduce_scatter if self.sequence_parallel else all_reduce return reduce_fn(out, self.process_group) +class MegatronRowParallelLinearTorch(RowParallelLinear): + def forward(self, x): + """ + We're doing Tensor Parallel with sequence parallelism: we do the matmul and then + a reduce_scatter of the result. + """ + out = megatron_fused_dense_func_torch(x, self.weight, self.bias) + reduce_fn = reduce_scatter if self.sequence_parallel else all_reduce + return reduce_fn(out, self.process_group) + -class FeedForward(nn.Module): +class BaseFeedForward(nn.Module): """ - FeedForward. + Base FeedForward in flash implementation. 
Args: in_features (int): size of each input sample @@ -177,13 +225,13 @@ def __init__( device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, multiple_of: int = 256, - block_idx: int = 0, + colum_cls = None, + row_cls = None, ): super().__init__() - hidden_features = multiple_of * ((hidden_features + multiple_of - 1) // multiple_of) - self.w1 = ColumnParallelLinearTorch( + self.w1 = colum_cls( in_features, hidden_features, process_group, @@ -192,7 +240,7 @@ def __init__( device=device, dtype=dtype, ) - self.w2 = ColumnParallelLinearTorch( + self.w2 = colum_cls( in_features, hidden_features, process_group, @@ -201,7 +249,7 @@ def __init__( device=device, dtype=dtype, ) - self.w3 = RowParallelLinearTorch( + self.w3 = row_cls( hidden_features, out_features, process_group, @@ -217,6 +265,66 @@ def forward(self, x): out = self.w3(Silu(w1_o, w2_o)) return out +class FeedForward(BaseFeedForward): + """ + FeedForward in flash implementation. + + Args: + in_features (int): size of each input sample + hidden_features (int): size of hidden state of FFN + out_features (int): size of each output sample + process_group (Optional[torch.distributed.ProcessGroup]): The group of the current device for `parallel_mode`. + bias (bool): Whether the bias is needed for linears. True by default. But it is typically set to False + in the config. + device (Optional[Union[str, torch.device]]): The device will be used. + dtype (Optional[torch.dtype]): The type of data. + multiple_of (int): For efficient training. Reset the size of hidden feature. 256 by default. + """ + + def __init__( + self, + in_features: int, + hidden_features: int, + out_features: int = None, + process_group: Optional[torch.distributed.ProcessGroup] = None, + bias: bool = True, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + multiple_of: int = 256, + ): + super().__init__(in_features, hidden_features, out_features, process_group, bias, device, + dtype, multiple_of, ColumnParallelLinearTorch, RowParallelLinearTorch) + + +class MegatronFeedForward(BaseFeedForward): + """ + FeedForward in megatron implementation. + + Args: + in_features (int): size of each input sample + hidden_features (int): size of hidden state of FFN + out_features (int): size of each output sample + process_group (Optional[torch.distributed.ProcessGroup]): The group of the current device for `parallel_mode`. + bias (bool): Whether the bias is needed for linears. True by default. But it is typically set to False + in the config. + device (Optional[Union[str, torch.device]]): The device will be used. + dtype (Optional[torch.dtype]): The type of data. + multiple_of (int): For efficient training. Reset the size of hidden feature. 256 by default. + """ + + def __init__( + self, + in_features: int, + hidden_features: int, + out_features: int = None, + process_group: Optional[torch.distributed.ProcessGroup] = None, + bias: bool = True, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + multiple_of: int = 256, + ): + super().__init__(in_features, hidden_features, out_features, process_group, bias, device, + dtype, multiple_of, MegatronColumnParallelLinearTorch, MegatronRowParallelLinearTorch) class FSTPLinear(ColumnParallelLinear): def forward(self, x): @@ -228,10 +336,9 @@ def forward(self, x): module=self, handler=gpc.config.fstp_handler, block_index=block_index, module_name=name ) - -class FSTPFeedForward(nn.Module): +class FSTPFeedForward(BaseFeedForward): """ - FeedForward. + FeedForward in FSTP. 
Args: in_features (int): size of each input sample @@ -255,169 +362,35 @@ def __init__( device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, multiple_of: int = 256, - block_idx: int = 0, ): - super().__init__() - - hidden_features = multiple_of * ((hidden_features + multiple_of - 1) // multiple_of) - - self.w1 = FSTPLinear( - in_features, - hidden_features, - process_group, - bias, - sequence_parallel=gpc.config.parallel.sequence_parallel, - device=device, - dtype=dtype, - ) - self.w2 = FSTPLinear( - in_features, - hidden_features, - process_group, - bias, - sequence_parallel=gpc.config.parallel.sequence_parallel, - device=device, - dtype=dtype, - ) - self.w3 = FSTPLinear( - hidden_features, - out_features, - process_group, - bias=bias, - sequence_parallel=gpc.config.parallel.sequence_parallel, - device=device, - dtype=dtype, - ) - - def forward(self, x): - w1_o = self.w1(x) - w2_o = self.w2(x) - out = self.w3(F.silu(w1_o) * w2_o) - return out - - -class FSTPAllGatherSyncHandler: - """ - All-gather handler for overlapping the all-gather in adjcent FSTP linear. - """ - - def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> None: - # import pdb; pdb.set_trace() - self.process_group = process_group - self.FSTP_modules = [] - self.module_name = ["Wqkv", "out_proj", "w1", "w2", "w3"] - self.FSTP_global_weights = dict() # key: FSTP module; value: module global weight for forward - self.module_handler = dict() # key: FSTP module; value: all-gather handler - self.module_block = dict() # key: FSTP module; value: transformer block index - self.block_module = dict() # key: transformer block index; value: {name_index: FSTP module} - self.module_name_index = dict() # key: FSTP module; value: the name in index in self.module_name - - self.reduce_scatter_handlers = {} - self.all_reduce_handlers = {} - - # just want to share same for loop for ModuleList and Module - if not isinstance(model, nn.ModuleList): - model = [model] - - for _chunk in model: - if isinstance(_chunk, NaiveAMPModel): - _chunk = _chunk.model - - for _chunk_name, children in _chunk.named_children(): - if isinstance(children, nn.ModuleList): - for idx, block in enumerate(children): - index = 0 - self.block_module[idx] = {} - for _sub_name, sub in block.named_children(): - sub_modules = list(sub.children()) - if len(sub_modules) > 0: - for name, child in sub.named_children(): - if isinstance(child, FSTPLinear): - - _full_name = f"{_chunk_name}.{idx}.{_sub_name}.{name}" - setattr(child.weight, "_fstp_reduce_scatter_str", f"{_full_name}.weight") - if child.bias is not None: - setattr(child.bias, "_fstp_reduce_scatter_str", f"{_full_name}.bias") - - self.FSTP_modules.append(child) - self.module_block[child] = idx - self.block_module[idx][index] = child - self.module_name_index[child] = index - index = index + 1 - else: - continue - - def _register_sync_parameters_hook(self) -> None: - """ - register pre_forward_hook and pre_backward_hook for FSTPLinear. 
- """ - - def _pre_forward_hook(module: nn.Module, inputs: Any): - block_index = self.module_block[module] - name_index = self.module_name_index[module] - if name_index == 0: - total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) - weight_handler.wait() - self.FSTP_global_weights[module] = total_weight - - # start the all-gather for next module - next_module = self.block_module[block_index][name_index + 1] - self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( - next_module.weight, self.process_group, async_op=True - ) - self.module_handler[next_module] = weights_handler - else: - handler = self.module_handler[module] - handler.wait() - if name_index != 4: - next_module = self.block_module[block_index][name_index + 1] - self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( - next_module.weight, self.process_group, async_op=True - ) - self.module_handler[next_module] = weights_handler - - def _post_forward_hook(module: nn.Module, input, output): - if module in self.FSTP_global_weights: - del self.FSTP_global_weights[module] - if module in self.module_handler: - del self.module_handler[module] - - def _pre_backward_hook(module: nn.Module, grad_output): - block_index = self.module_block[module] - name_index = self.module_name_index[module] - if name_index == 4: - total_weight, weight_handler = all_gather_raw(module.weight, self.process_group, async_op=True) - weight_handler.wait() - self.FSTP_global_weights[module] = total_weight - - # start the all-gather for next module - next_module = self.block_module[block_index][name_index - 1] - self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( - next_module.weight, self.process_group, async_op=True - ) - self.module_handler[next_module] = weights_handler - else: - handler = self.module_handler[module] - handler.wait() - if name_index != 0: - next_module = self.block_module[block_index][name_index - 1] - self.FSTP_global_weights[next_module], weights_handler = all_gather_raw( - next_module.weight, self.process_group, async_op=True - ) - self.module_handler[next_module] = weights_handler - - def _post_backward_hook(module, grad_input, grad_output): - del self.FSTP_global_weights[module] - - for module in self.FSTP_modules: - # import pdb; pdb.set_trace() - module.register_forward_pre_hook(_pre_forward_hook) - module.register_forward_hook(_post_forward_hook) - # module.register_backward_pre_hook(_pre_backward_hook) - # module.register_backward_hook(_post_backward_hook) - module.register_full_backward_pre_hook(_pre_backward_hook) - module.register_full_backward_hook(_post_backward_hook) - + super().__init__(in_features, hidden_features, out_features, process_group, bias, device, + dtype, multiple_of, FSTPLinear, FSTPLinear) + +def get_mlp_cls(sp_mode: str): + if sp_mode in ["none", "flash-attn"]: + mlp_cls = FeedForward + elif sp_mode == "megatron": + mlp_cls = MegatronFeedForward + else: + mlp_cls = FSTPFeedForward + return mlp_cls + +def get_linear_cls(sp_mode: str, parallel_mode: str): + if parallel_mode == "column": + if sp_mode in ["none", "flash-attn"]: + cls = ColumnParallelLinearTorch + elif sp_mode == "megatron": + cls = MegatronColumnParallelLinearTorch + else: + cls = FSTPLinear + elif parallel_mode == 'row': + if sp_mode in ["none", "flash-attn"]: + cls = RowParallelLinearTorch + elif sp_mode == "megatron": + cls = MegatronRowParallelLinearTorch + else: + cls = FSTPLinear + return cls class CoarseGrainedFSTPAllGatherSyncHandler: """ @@ 
-468,7 +441,6 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non sub_modules = list(sub.children()) if len(sub_modules) > 0: for name, child in sub.named_children(): - # print(f"name: {name}", flush=True) if name == "out_proj": self.FSTP_outs.append(child) self.module_to_index[child] = idx diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index b004dffa..99d540fd 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -15,9 +15,12 @@ from internlm.model.embedding import Embedding1D from internlm.model.linear import ( FeedForward, + MegatronFeedForward, FSTPFeedForward, RewardModelLinear, ScaleColumnParallelLinear, + MegatronScaleColumnParallelLinear, + get_mlp_cls, ) from internlm.model.multi_head_attention import MHA from internlm.model.utils import ( @@ -77,8 +80,7 @@ def __init__( use_scaled_init: bool = True, use_swiglu: bool = True, use_flash_attn: bool = True, - tp_mode: str = "origin_tp", - block_idx: int = 0, + sp_mode: str = "none", ): super().__init__() self.checkpoint = checkpoint @@ -103,8 +105,7 @@ def __init__( use_flash_attn=use_flash_attn, device=device, dtype=dtype, - tp_mode=tp_mode, - block_idx=block_idx, + sp_mode=sp_mode, ) self.dropout1 = nn.Dropout(drop_rate) @@ -116,7 +117,7 @@ def __init__( self.norm2 = nn.LayerNorm(hidden_size, eps=layer_norm_epsilon) if use_swiglu: - mlp_cls = FeedForward if tp_mode == "origin_tp" else FSTPFeedForward + mlp_cls = get_mlp_cls(sp_mode) self.mlp = mlp_cls( hidden_size, int(hidden_size * mlp_ratio), @@ -299,12 +300,16 @@ def __init__( super().__init__() checkpoint_layer_num = int(num_layers * checkpoint) - self.tp_mode = gpc.config.parallel["tensor"]["mode"] + self.sp_mode = gpc.config.parallel["tensor"]["sp"] + if self.sp_mode == "none": + gpc.config.parallel.sequence_parallel = False + else: + gpc.config.parallel.sequence_parallel = True if is_reward: head_cls = RewardModelLinear else: - head_cls = ScaleColumnParallelLinear + head_cls = ScaleColumnParallelLinear if self.sp_mode in ["flash-attn", "none", "intern"] else MegatronScaleColumnParallelLinear if first: if embed_split_hidden: self.embedding = Embedding1D(num_embeddings=vocab_size, embedding_dim=hidden_size) @@ -345,8 +350,7 @@ def __init__( use_scaled_init=use_scaled_init, use_swiglu=use_swiglu, use_flash_attn=use_flash_attn, - tp_mode=self.tp_mode, - block_idx=lid, + sp_mode=self.sp_mode, ) for lid in range(num_layers) ] @@ -393,7 +397,7 @@ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N # The indexes are used to indicate the actual position IDs of each token in the packed input. indexes = indexes[0] # if the tensor parallel mode is 'fstp', the indexes should also be split in sequence dimension. 
-            if gpc.config.parallel.sequence_parallel and self.tp_mode == "fstp":
+            if gpc.config.parallel.sequence_parallel and self.sp_mode == "intern":
                 indexes = split_forward_gather_backward(indexes, ParallelMode.TENSOR, dim=0)
 
         max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() if cu_seqlens is not None else None
diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py
index 7a0f4ed7..8ba49edd 100644
--- a/internlm/model/multi_head_attention.py
+++ b/internlm/model/multi_head_attention.py
@@ -42,6 +42,9 @@
     ColumnParallelLinearTorch,
     FSTPLinear,
     RowParallelLinearTorch,
+    MegatronColumnParallelLinearTorch,
+    MegatronRowParallelLinearTorch,
+    get_linear_cls,
 )
@@ -175,8 +178,7 @@ def __init__(
         use_flash_attn: bool = True,
         device: Optional[torch.device] = None,
         dtype: Optional[torch.dtype] = None,
-        tp_mode: str = "origin_tp",
-        block_idx: int = 0,
+        sp_mode: str = "none",
     ) -> None:
         factory_kwargs = {"device": device, "dtype": dtype}
         super().__init__()
@@ -204,7 +206,7 @@ def __init__(
             self.rotary_emb = RotaryEmbedding(self.rotary_emb_dim, scale_base=rotary_emb_scale_base, device=device)
 
         # notice here should change bias=True
-        Wqkv_cls = ColumnParallelLinearTorch if tp_mode == "origin_tp" else FSTPLinear
+        Wqkv_cls = get_linear_cls(sp_mode, "column")
         self.Wqkv = Wqkv_cls(
             embed_dim,
             3 * embed_dim,
@@ -220,12 +222,12 @@ def __init__(
         self.inner_cross_attn = inner_cross_attn_cls(
             causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout
         )
-        if tp_mode == "fstp":
+        if sp_mode == "intern":
             self.inner_attn = DistributedAttention(self.inner_attn, sequence_process_group=process_group)
             self.inner_cross_attn = DistributedAttention(self.inner_cross_attn, sequence_process_group=process_group)
 
         # output projection always have the bias (for now)
-        out_proj_cls = RowParallelLinearTorch if tp_mode == "origin_tp" else FSTPLinear
+        out_proj_cls = get_linear_cls(sp_mode, 'row')
         self.out_proj = out_proj_cls(
             embed_dim,
             embed_dim,
diff --git a/internlm/model/utils.py b/internlm/model/utils.py
index 2667efed..6757906c 100644
--- a/internlm/model/utils.py
+++ b/internlm/model/utils.py
@@ -164,7 +164,7 @@ def reduce_scatter_raw_memory_pool(input_: Tensor, process_group: ProcessGroup,
 
 # adpated from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/ops/fused_dense.py
 class FusedDenseFunc(torch.autograd.Function):
-    "tp fused dense function"
+    "FusedDenseFunc for tensor parallel in flash-attn implementation."
 
     @staticmethod
     @custom_fwd
@@ -255,9 +255,96 @@ def backward(ctx, grad_output, *args):
         return grad_input, grad_weight, grad_bias, None, None, None, None
 
 
+class MegatronFusedDenseFunc(torch.autograd.Function):
+    '''
+    FusedDenseFunc for tensor parallel in megatron implementation.
+    The difference between the implementation of flash-attn and megatron is that the total_x could be saved for backward in megatron,
+    so that the all-gather in backward is omitted.
+    '''
+
+    @staticmethod
+    @custom_fwd
+    def forward(ctx, x, weight, bias, return_residual=False, process_group=None, sequence_parallel=True, gather_dim=0):
+        """
+        If process_group is not None and sequence_parallel=True, we're doing Tensor Parallel
+        with sequence parallelism: we do an all_gather_raw of x before doing the matmul.
+ """ + ctx.compute_weight_gradient = weight.requires_grad + ctx.return_residual = return_residual + ctx.process_group = process_group + ctx.sequence_parallel = sequence_parallel + + if torch.is_autocast_enabled(): + x = x.to(dtype=torch.get_autocast_gpu_dtype()) + x = x.contiguous() + if process_group is not None and sequence_parallel: + # We want to kick off the all_gather early, before weight dtype conversion + total_x, handle_x = all_gather_raw(x, process_group, async_op=True, gather_dim=gather_dim) + else: + total_x = x + + if torch.is_autocast_enabled(): + weight = weight.to(dtype=torch.get_autocast_gpu_dtype()) + bias = bias.to(dtype=torch.get_autocast_gpu_dtype()) if bias is not None else None + weight = weight.contiguous() + if process_group is not None and sequence_parallel: + handle_x.wait() + batch_shape, n = total_x.shape[:-1], total_x.shape[-1] + batch_dim = batch_shape.numel() + # https://github.com/pytorch/pytorch/blob/5b51849b48a7dbccd297286cc0110def4706f9e7/aten/src/ATen/native/cuda/Blas.cpp#L174 + if min(batch_dim, n, *weight.shape) > 65535 * 32: + raise RuntimeError("fused_dense only supports matrix dims <= 2M") + output = F.linear(total_x, weight, bias) + if ctx.compute_weight_gradient: + ctx.save_for_backward(total_x, weight) + else: + ctx.save_for_backward(weight) + return output if not return_residual else (output, x) + + @staticmethod + @custom_bwd + def backward(ctx, grad_output, *args): + grad_output = grad_output.contiguous() + if ctx.return_residual: + (grad_input,) = args + grad_input = grad_input.contiguous() + process_group = ctx.process_group + sequence_parallel = ctx.sequence_parallel + + if ctx.compute_weight_gradient: + total_x, weight = ctx.saved_tensors + else: + (weight,) = ctx.saved_tensors + total_x = None + batch_shape = grad_output.shape[:-1] + batch_dim = batch_shape.numel() + grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1]) + if ctx.needs_input_grad[0]: + if not ctx.return_residual: + grad_input = F.linear(grad_output, weight.t()) + else: + grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), grad_output, weight) + grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) + if process_group is not None: + reduce_fn = reduce_scatter_raw if sequence_parallel else all_reduce_raw + grad_input, handle_grad_input = reduce_fn(grad_input, process_group, async_op=True) + else: + grad_input = None + if ctx.needs_input_grad[1]: + assert ctx.compute_weight_gradient + grad_weight, grad_bias = fused_dense_cuda.linear_bias_wgrad( + total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2] + ) + else: + grad_weight = None + grad_bias = grad_output if ctx.needs_input_grad[2] else None + if process_group is not None and ctx.needs_input_grad[0]: + handle_grad_input.wait() + return grad_input, grad_weight, grad_bias, None, None, None, None + # adpated from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/ops/fused_dense.py class FusedDenseFuncTorch(FusedDenseFunc): - """A custom PyTorch module extending FusedDenseFunc.""" + '''FusedDenseFunc in flash implementation for supporting torch.float32''' @staticmethod @custom_bwd @@ -307,17 +394,61 @@ def backward(ctx, grad_output, *args): handle_grad_input.wait() return grad_input, grad_weight, grad_bias, None, None, None, None +class MegatronFusedDenseFuncTorch(FusedDenseFunc): + '''FusedDenseFunc in megatron implementation for supporting torch.float32''' + + @staticmethod + @custom_bwd + def backward(ctx, grad_output, 
*args):
+        grad_output = grad_output.contiguous()
+        if ctx.return_residual:
+            (grad_input,) = args
+            grad_input = grad_input.contiguous()
+        process_group = ctx.process_group
+        sequence_parallel = ctx.sequence_parallel
+        gather_dim = ctx.gather_dim
+        if ctx.compute_weight_gradient:
+            total_x, weight = ctx.saved_tensors
+        else:
+            (weight,) = ctx.saved_tensors
+            total_x = None
+        batch_shape = grad_output.shape[:-1]
+        batch_dim = batch_shape.numel()
+        grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1])
+        if ctx.needs_input_grad[0]:
+            if not ctx.return_residual:
+                grad_input = F.linear(grad_output, weight.t())
+            else:
+                grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), grad_output, weight)
+            grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1])
+            if process_group is not None:
+                reduce_fn = reduce_scatter_raw if sequence_parallel else all_reduce_raw
+                grad_input, handle_grad_input = reduce_fn(grad_input, process_group, async_op=True)
+        else:
+            grad_input = None
+        if ctx.needs_input_grad[1]:
+            assert ctx.compute_weight_gradient
+            # we remove the cuda dependence here, which is different from flash_attn.
+            grad_weight, grad_bias = linear_bias_wgrad_torch(
+                total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2]
+            )
+        else:
+            grad_weight = None
+            grad_bias = grad_output if ctx.needs_input_grad[2] else None
+        if process_group is not None and ctx.needs_input_grad[0]:
+            handle_grad_input.wait()
+        return grad_input, grad_weight, grad_bias, None, None, None, None
 
 class FSTPFusedDenseFunc(torch.autograd.Function):
-    "FSTP fused dense function"
+    "FusedDenseFunc for FSTP, which is optimized based on flash implementation."
 
     @staticmethod
     @custom_fwd
-    def forward(ctx, x, weight, bias, return_residual=False, process_group=None, module=None, all_gather_handler=None, block_index=None, module_name=None):
+    def forward(ctx, x, weight, bias, return_residual=False, process_group=None, module=None, overlap_handler=None, block_index=None, module_name=None):
         ctx.compute_weight_gradient = weight.requires_grad
         ctx.return_residual = return_residual
         ctx.process_group = process_group
-        ctx.all_gather_handler = all_gather_handler
+        ctx.overlap_handler = overlap_handler
         ctx.module = module
         ctx.block_index = block_index
         ctx.module_name = module_name
@@ -329,13 +460,12 @@ def forward(ctx, x, weight, bias, return_residual=False, process_group=None, mod
         world_size = gpc.get_world_size(ParallelMode.TENSOR)
         if world_size > 1:
             # do all_gather for weight and bias before actual computation
-            if all_gather_handler is not None:# and module in all_gather_handler.FSTP_global_weights:
-                # total_weight = all_gather_handler.FSTP_global_weights[module]
-                total_weight = gpc.config.block_memory[block_index % 2][module_name]
+            if overlap_handler is not None:
+                total_weight = gpc.config.block_memory[block_index % 2][module_name]
             else:
                 total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True)
                 handle_weight.wait()
-
+            # TODO memory pool for bias
             if bias is not None:
                 total_bias, handle_bias = all_gather_raw(bias, process_group, async_op=True)
                 handle_bias.wait()
@@ -356,6 +486,7 @@ def forward(ctx, x, weight, bias, return_residual=False, process_group=None, mod
         if min(batch_dim, n, *total_weight.shape) > 65535 * 32:
             raise RuntimeError("fused_dense only supports matrix dims <= 2M")
         output = F.linear(total_x, total_weight, total_bias)
+        # release memory
         del total_weight
         del total_bias
         if ctx.compute_weight_gradient:
@@ -372,8 +503,7 @@ def
backward(ctx, grad_output, *args): (grad_input,) = args grad_input = grad_input.contiguous() process_group = ctx.process_group - all_gather_handler = ctx.all_gather_handler - module = ctx.module + overlap_handler = ctx.overlap_handler block_index = ctx.block_index module_name = ctx.module_name @@ -389,51 +519,35 @@ def backward(ctx, grad_output, *args): world_size = gpc.get_world_size(ParallelMode.TENSOR) if world_size > 1: - total_weight = gpc.config.block_memory[block_index % 2][module_name] - # # do all-gather for weight before backward - # if module in all_gather_handler.FSTP_global_weights: - # total_weight = all_gather_handler.FSTP_global_weights[module] - # else: - # total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) - # handle_weight.wait() + if overlap_handler is not None: + total_weight = gpc.config.block_memory[block_index % 2][module_name] + else: + total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) + handle_weight.wait() else: total_weight = weight # compute weight grad if ctx.needs_input_grad[1]: assert ctx.compute_weight_gradient - grad_weight, grad_bias = fused_dense_cuda.linear_bias_wgrad( total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2] ) if world_size > 1: - if gpc.config.fstp_handler is not None: - # grad_weight_async, handle_grad_weight = all_reduce_raw(grad_weight, process_group, async_op=True) - # assert hasattr(weight, "_fstp_all_reduce_str") - # all_gather_handler.all_reduce_handlers[weight._fstp_all_reduce_str] = (handle_grad_weight, grad_weight_async) - # grad_weight = all_gather_handler.get_zero_by_shape((grad_weight.shape[0]//torch.distributed.get_world_size(process_group), *grad_weight.shape[1:]), dtype=grad_weight.dtype, device=grad_weight.device) - # if grad_bias is not None: - # grad_bias_async, handle_grad_bias = all_reduce_raw(grad_bias, process_group, async_op=True) - # assert hasattr(bias, "_fstp_all_reduce_str") - # all_gather_handler.all_reduce_handlers[bias._fstp_all_reduce_str] = (handle_grad_bias, grad_bias_async) - # grad_bias = all_gather_handler.get_zero_by_shape((grad_weight.shape[0]//torch.distributed.get_world_size(process_group), *grad_weight.shape[1:]), dtype=grad_bias.dtype, device=grad_bias.device) - + if overlap_handler is not None: grad_weight_async, handle_grad_weight = reduce_scatter_raw_memory_pool(grad_weight, process_group, async_op=True) assert hasattr(weight, "_fstp_reduce_scatter_str") - all_gather_handler.reduce_scatter_handlers[weight._fstp_reduce_scatter_str] = (handle_grad_weight, grad_weight_async) - grad_weight = all_gather_handler.get_zero_by_shape((grad_weight.shape[0]//torch.distributed.get_world_size(process_group), *grad_weight.shape[1:]), dtype=grad_weight.dtype, device=grad_weight.device) + overlap_handler.reduce_scatter_handlers[weight._fstp_reduce_scatter_str] = (handle_grad_weight, grad_weight_async) + grad_weight = overlap_handler.get_zero_by_shape((grad_weight.shape[0]//torch.distributed.get_world_size(process_group), *grad_weight.shape[1:]), dtype=grad_weight.dtype, device=grad_weight.device) if grad_bias is not None: grad_bias_async, handle_grad_bias = reduce_scatter_raw_memory_pool(grad_bias, process_group, async_op=True) assert hasattr(bias, "_fstp_reduce_scatter_str") - all_gather_handler.reduce_scatter_handlers[bias._fstp_reduce_scatter_str] = (handle_grad_bias, grad_bias_async) - grad_bias = all_gather_handler.get_zero_by_shape((grad_bias.shape[0]//torch.distributed.get_world_size(process_group), 
*grad_bias.shape[1:]), dtype=grad_bias.dtype, device=grad_bias.device) + overlap_handler.reduce_scatter_handlers[bias._fstp_reduce_scatter_str] = (handle_grad_bias, grad_bias_async) + grad_bias = overlap_handler.get_zero_by_shape((grad_bias.shape[0]//torch.distributed.get_world_size(process_group), *grad_bias.shape[1:]), dtype=grad_bias.dtype, device=grad_bias.device) else: grad_weight, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) if grad_bias is not None: grad_bias, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) - # grad_weight, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) - # if grad_bias is not None: - # grad_bias, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) else: grad_weight = None grad_bias = grad_output if ctx.needs_input_grad[2] else None @@ -449,7 +563,7 @@ def backward(ctx, grad_output, *args): del total_weight if ctx.needs_input_grad[1]: - if world_size > 1 and gpc.config.fstp_handler is None: + if world_size > 1 and overlap_handler is None: handle_grad_weight.wait() if grad_bias is not None: handle_grad_bias.wait() @@ -473,6 +587,22 @@ def fused_dense_func_torch( else: return FusedDenseFuncTorch.apply(x, weight, bias, return_residual, process_group, sequence_parallel, gather_dim) +def megatron_fused_dense_func_torch( + x: Tensor, + weight: Tensor, + bias: Optional[Tensor] = None, + return_residual: bool = False, + process_group: Optional[ProcessGroup] = None, + sequence_parallel: bool = True, + gather_dim: int = 0, +): + dtype_eligible = x.dtype in [torch.float16, torch.bfloat16] or ( + x.dtype == torch.float32 and torch.is_autocast_enabled() + ) + if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible: + return MegatronFusedDenseFunc.apply(x, weight, bias, return_residual, process_group, sequence_parallel, gather_dim) + else: + return MegatronFusedDenseFuncTorch.apply(x, weight, bias, return_residual, process_group, sequence_parallel, gather_dim) def fstp_fused_dense_func( x: Tensor, diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index 96a54c01..4de5c7cd 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -40,11 +40,6 @@ inf = math.inf logger = get_logger(__file__) -def print_memory(msg): - print(msg, " rank = ", gpc.get_global_rank(), " memory allocated: ", torch.cuda.memory_allocated() / 1024 / 1024 / 1024, " reverved memory: ", torch.cuda.memory_reserved() / 1024 / 1024 / 1024, " max memory: ", torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024, flush=True) - print("===========================================") - - class HybridZeroOptimizer(BaseOptimizer): """ Hybrid Zero Optimizer. 
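The pattern above is worth spelling out: backward launches the gradient reduce-scatter asynchronously, stores the (handle, result) pair on the handler keyed by `_fstp_reduce_scatter_str`, and hands autograd a zero tensor of the sharded shape; the optimizer below waits on the handle and accumulates the real gradient before reducing buckets. What follows is a minimal sketch of that handoff, with `handlers` and `key` as illustrative stand-ins and without the patch's buffer pooling.

import torch
import torch.distributed as dist


def async_weight_grad_reduce_scatter(grad_weight, process_group, handlers, key):
    # Launch the reduce-scatter without blocking the rest of backward.
    world_size = dist.get_world_size(process_group)
    shard_shape = (grad_weight.shape[0] // world_size, *grad_weight.shape[1:])
    grad_shard = torch.empty(shard_shape, dtype=grad_weight.dtype, device=grad_weight.device)
    handle = dist.reduce_scatter_tensor(
        grad_shard, grad_weight.contiguous(), group=process_group, async_op=True
    )
    # Stash the in-flight communication for the optimizer to consume later.
    handlers[key] = (handle, grad_shard)
    # Autograd needs a tensor of the sharded shape right now, so return zeros;
    # the real values are added in once the optimizer waits on the handle.
    return torch.zeros(shard_shape, dtype=grad_weight.dtype, device=grad_weight.device)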
@@ -70,7 +65,7 @@ def __init__( hysteresis = grad_scal_cfg.hysteresis max_scale = grad_scal_cfg.max_scale - if gpc.config.parallel["tensor"]["mode"] == "fstp" and gpc.config.parallel["tensor"]["overlap"] == True: + if gpc.config.parallel["tensor"]["sp"] == "intern" and gpc.config.parallel["tensor"]["intern_overlap"] == True: self._fstp_handler = gpc.config.fstp_handler # Zero related args @@ -358,20 +353,7 @@ def reset_reduce_bucket(self) -> None: del self._fstp_handler.reduce_scatter_handlers[key] self._fstp_handler.reduce_scatter_handlers[key] = None assert key in self._fstp_handler.reduce_scatter_handlers - # if not hasattr(_param, "_fstp_all_reduce_str"): - # continue - - # key = getattr(_param, "_fstp_all_reduce_str") - # comm_handle, _grad = self._fstp_handler.all_reduce_handlers[key] - # comm_handle.wait() - # with torch.no_grad(): - # _grad = split_forward_gather_backward(_grad, ParallelMode.TENSOR, dim=0) - # _param.grad.add_(_grad) - # # self._fstp_handler.reduce_scatter_handlers[key] = None - # del _grad - # del self._fstp_handler.all_reduce_handlers[key] - # self._fstp_handler.all_reduce_handlers[key] = None - # assert key in self._fstp_handler.all_reduce_handlers + bucket.reset_by_rank(rank) @@ -401,21 +383,6 @@ def _wait_reduce_scatter_and_accumulate_grad(self, param, reduce_rank=None): del self._fstp_handler.reduce_scatter_handlers[key] self._fstp_handler.reduce_scatter_handlers[key] = None assert key in self._fstp_handler.reduce_scatter_handlers - - # if not hasattr(_param, "_fstp_all_reduce_str"): - # continue - - # key = getattr(_param, "_fstp_all_reduce_str") - # comm_handle, _grad = self._fstp_handler.all_reduce_handlers[key] - # comm_handle.wait() - # with torch.no_grad(): - # _grad = split_forward_gather_backward(_grad, ParallelMode.TENSOR, dim=0) - # _param.grad.add_(_grad) - # # self._fstp_handler.reduce_scatter_handlers[key] = None - # del _grad - # del self._fstp_handler.all_reduce_handlers[key] - # self._fstp_handler.all_reduce_handlers[key] = None - # assert key in self._fstp_handler.all_reduce_handlers current_bucket.reset_by_rank(reduce_rank) @@ -634,7 +601,6 @@ def step(self, closure=None): # if not overlapping communication (no reduction hook is attached) # we need to manually reduce these gradients - print_memory("No 1") if not self._overlap_sync_grad: for group_id in range(len(self._fp16_param_groups)): for param in self._fp16_param_groups[group_id]: @@ -659,7 +625,6 @@ def step(self, closure=None): bucket.empty() self._bucket_in_progress = [] self._param_store.clear_grads_of_previous_reduced_params() - print_memory("No 2") # compute norm for gradients in the last bucket total_norms = {} for group_id in range(self.num_param_groups): @@ -681,19 +646,11 @@ def step(self, closure=None): scaled_norm_tensor = torch.tensor(scaled_norm, device=get_current_device(), dtype=torch.float) dist.all_reduce(scaled_norm_tensor, group=pg) total_norms[group_name] = scaled_norm_tensor.item() - print_memory("No 3") timer("sync_grad").start() self._sync_grad() timer("sync_grad").stop() - print_memory("No 4") - - try: - res = self._step(closure=closure, norms=total_norms) - except torch.cuda.OutOfMemoryError as e: - print(e, flush=True) - print(torch.cuda.memory_summary(), flush=True) - torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") + res = self._step(closure=closure, norms=total_norms) return res @@ -740,7 +697,6 @@ def _step(self, closure=None, norms=None): self._grad_store._averaged_gradients = dict() self.zero_grad() return False, norms - 
print_memory("No 5") # copy the grad of fp16 param to fp32 param single_grad_partition_groups = [] for group_id in range(self.num_param_groups): @@ -781,7 +737,6 @@ def _step(self, closure=None, norms=None): single_grad_partition_groups.append(flat_fp32_avg_grads) device = self._fp32_flat_param_groups_of_current_rank[group_id].device self._fp32_flat_param_groups_of_current_rank[group_id].grad = flat_fp32_avg_grads.to(device) - print_memory("No 6") # unscale and clip grads # get the global norm global_norm_groups = {} @@ -804,12 +759,9 @@ def _step(self, closure=None, norms=None): # For those ranks that are not assigned parameters, we just wait for other ranks # to send them updated their own parameters. if self.has_params: - print_memory("No 7") self.optim.step() - print_memory("No 8") # release the fp32 grad release_param_grad(self._fp32_flat_param_groups_of_current_rank.values()) - print_memory("No 9") # update fp16 partition updated by the current rank for group_id in range(len(self._fp16_param_groups)): if self.param_group_has_params[group_id]: @@ -818,7 +770,6 @@ def _step(self, closure=None, norms=None): ) fp32_param = self._fp32_flat_param_groups_of_current_rank[group_id] fp16_param.data.copy_(fp32_param) - print_memory("No 10") torch.cuda.synchronize() with torch.cuda.stream(self._comm_bcast_stream): self.broadcast_params() @@ -829,7 +780,6 @@ def _step(self, closure=None, norms=None): # so synchronization is maintained for group_name, global_norm in global_norm_groups.items(): global_norm_groups[group_name] = global_norm / loss_scale - print_memory("No 11") return True, global_norm_groups def broadcast_params(self): diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 2816da0e..20592c26 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -38,7 +38,6 @@ from internlm.model.linear import ( CoarseGrainedFSTPAllGatherSyncHandler, FeedForward, - FSTPAllGatherSyncHandler, RewardModelLinear, ScaleColumnParallelLinear, ) @@ -111,7 +110,7 @@ def initialize_model(): gpc.config.fstp_handler = None - if gpc.config.parallel["tensor"]["mode"] == "fstp" and gpc.config.parallel["tensor"]["overlap"] == True: + if gpc.config.parallel["tensor"]["sp"] == "intern" and gpc.config.parallel["tensor"]["intern_overlap"] == True: handler = CoarseGrainedFSTPAllGatherSyncHandler(model, gpc.get_group(ParallelMode.TENSOR)) # handler = FSTPAllGatherSyncHandler(model, gpc.get_group(ParallelMode.TENSOR)) handler._register_sync_parameters_hook() diff --git a/train.py b/train.py index 41ab070d..a917d121 100644 --- a/train.py +++ b/train.py @@ -195,7 +195,7 @@ def main(args): # start iterating the train data and begin training for batch_count in range(train_state.batch_count, total_steps): empty_cache_and_diag(batch_count, interval=gpc.config.data.empty_cache_and_diag_interval) - torch.cuda.memory._record_memory_history() + # torch.cuda.memory._record_memory_history() start_time = time.time() timer("one-batch").start() @@ -300,7 +300,7 @@ def main(args): if gpc.config.fstp_handler is not None: gpc.config.fstp_handler.zero_const_pool = {} gpc.config.fstp_handler.reduce_scatter_memory = {} - torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") + # torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") torch.cuda.reset_peak_memory_stats() ckpt_manager.wait_async_upload_finish() From 85ad917ae430c2e89cf4444221c2ced9223d3552 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> 
Date: Fri, 20 Oct 2023 21:50:32 +0800 Subject: [PATCH 041/153] feat(model/overlap_handler.py): refactor overlap hook handle --- configs/7B_sft.py | 2 +- internlm/model/linear.py | 298 +++++------------- internlm/model/modeling_internlm.py | 11 +- internlm/model/multi_head_attention.py | 11 +- internlm/model/overlap_handler.py | 253 +++++++++++++++ internlm/model/utils.py | 98 +++--- .../solver/optimizer/hybrid_zero_optim.py | 12 +- internlm/train/training_internlm.py | 56 +--- train.py | 2 +- 9 files changed, 393 insertions(+), 350 deletions(-) create mode 100644 internlm/model/overlap_handler.py diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 09af7f45..c51c8129 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -163,7 +163,7 @@ """ parallel = dict( zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=True), + tensor=dict(size=8, sp="intern", intern_overlap=True), pipeline=dict(size=1, interleaved_overlap=True), ) diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 2bbb9416..6cd3b9c8 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -1,22 +1,17 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- -from typing import Any, Optional, Union +from typing import Optional import torch -import torch.nn.functional as F from flash_attn.ops.fused_dense import ColumnParallelLinear, RowParallelLinear from flash_attn.utils.distributed import all_reduce, reduce_scatter from torch import nn from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc -from internlm.core.naive_amp import NaiveAMPModel -from internlm.model.embedding import Embedding1D from internlm.model.utils import ( Silu, - all_gather_raw, - all_gather_raw_memory_pool, fstp_fused_dense_func, fused_dense_func_torch, megatron_fused_dense_func_torch, @@ -25,20 +20,20 @@ class BaseScaleColumnParallelLinear(nn.Linear): """ - Base class for ScaleColumnParallelLinear. - - Args: - in_features (int): size of each input sample - out_features (int): size of each output sample - process_group (Optional[torch.distributed.ProcessGroup]): The group of the current device for `parallel_mode`. - bias (bool): Whether the bias is needed for linears. True by default. But it is typically set to False - in the config. - sequence_parallel (bool): If sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism: - we do an all_gather of x before doing the matmul. - If not, then the input is already gathered. - device (Optional[Union[str, torch.device]]): The device will be used. - dtype (Optional[torch.dtype]): The type of data. - weight_scale (int): For training stability. 1 by default. + Base class for ScaleColumnParallelLinear. + + Args: + in_features (int): size of each input sample + out_features (int): size of each output sample + process_group (Optional[torch.distributed.ProcessGroup]): The group of the current device for `parallel_mode`. + bias (bool): Whether the bias is needed for linears. True by default. But it is typically set to False + in the config. + sequence_parallel (bool): If sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism: + we do an all_gather of x before doing the matmul. + If not, then the input is already gathered. + device (Optional[Union[str, torch.device]]): The device will be used. + dtype (Optional[torch.dtype]): The type of data. + weight_scale (int): For training stability. 1 by default. 
""" def __init__( @@ -58,10 +53,12 @@ def __init__( self.process_group = process_group self.weight_scale = weight_scale + class ScaleColumnParallelLinear(BaseScaleColumnParallelLinear): """ ScaleColumnParallelLinear in flash implementation. """ + def forward(self, input, gather_dim=0): # pylint: disable=W0622 # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism: # we do an all_gather of x before doing the matmul. @@ -79,6 +76,7 @@ def forward(self, input, gather_dim=0): # pylint: disable=W0622 gather_dim=gather_dim, ) + class MegatronScaleColumnParallelLinear(BaseScaleColumnParallelLinear): """ ScaleColumnParallelLinear in megatron implementation. @@ -101,6 +99,7 @@ def forward(self, input, gather_dim=0): # pylint: disable=W0622 gather_dim=gather_dim, ) + class RewardModelLinear(ScaleColumnParallelLinear): """ RewardModelLinear. @@ -164,6 +163,7 @@ def forward(self, x, gather_dim=0): gather_dim=gather_dim, ) + class MegatronColumnParallelLinearTorch(ColumnParallelLinear): def forward(self, x, gather_dim=0): # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism: @@ -178,6 +178,7 @@ def forward(self, x, gather_dim=0): gather_dim=gather_dim, ) + class RowParallelLinearTorch(RowParallelLinear): def forward(self, x): """ @@ -188,6 +189,7 @@ def forward(self, x): reduce_fn = reduce_scatter if self.sequence_parallel else all_reduce return reduce_fn(out, self.process_group) + class MegatronRowParallelLinearTorch(RowParallelLinear): def forward(self, x): """ @@ -225,8 +227,8 @@ def __init__( device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, multiple_of: int = 256, - colum_cls = None, - row_cls = None, + colum_cls=None, + row_cls=None, ): super().__init__() hidden_features = multiple_of * ((hidden_features + multiple_of - 1) // multiple_of) @@ -265,6 +267,7 @@ def forward(self, x): out = self.w3(Silu(w1_o, w2_o)) return out + class FeedForward(BaseFeedForward): """ FeedForward in flash implementation. 
@@ -292,9 +295,19 @@ def __init__( dtype: Optional[torch.dtype] = None, multiple_of: int = 256, ): - super().__init__(in_features, hidden_features, out_features, process_group, bias, device, - dtype, multiple_of, ColumnParallelLinearTorch, RowParallelLinearTorch) - + super().__init__( + in_features, + hidden_features, + out_features, + process_group, + bias, + device, + dtype, + multiple_of, + ColumnParallelLinearTorch, + RowParallelLinearTorch, + ) + class MegatronFeedForward(BaseFeedForward): """ @@ -323,19 +336,35 @@ def __init__( dtype: Optional[torch.dtype] = None, multiple_of: int = 256, ): - super().__init__(in_features, hidden_features, out_features, process_group, bias, device, - dtype, multiple_of, MegatronColumnParallelLinearTorch, MegatronRowParallelLinearTorch) + super().__init__( + in_features, + hidden_features, + out_features, + process_group, + bias, + device, + dtype, + multiple_of, + MegatronColumnParallelLinearTorch, + MegatronRowParallelLinearTorch, + ) + class FSTPLinear(ColumnParallelLinear): def forward(self, x): block_index = gpc.config.fstp_handler.module_to_index[self] - name_index = gpc.config.fstp_handler.module_name_index[self] - name = gpc.config.fstp_handler.module_name[name_index] return fstp_fused_dense_func( - x, self.weight, self.bias, process_group=self.process_group, - module=self, handler=gpc.config.fstp_handler, block_index=block_index, module_name=name + x, + self.weight, + self.bias, + process_group=self.process_group, + module=self, + handler=gpc.config.fstp_handler, + block_index=block_index, + module_name=self._fstp_name, ) + class FSTPFeedForward(BaseFeedForward): """ FeedForward in FSTP. @@ -363,8 +392,19 @@ def __init__( dtype: Optional[torch.dtype] = None, multiple_of: int = 256, ): - super().__init__(in_features, hidden_features, out_features, process_group, bias, device, - dtype, multiple_of, FSTPLinear, FSTPLinear) + super().__init__( + in_features, + hidden_features, + out_features, + process_group, + bias, + device, + dtype, + multiple_of, + FSTPLinear, + FSTPLinear, + ) + def get_mlp_cls(sp_mode: str): if sp_mode in ["none", "flash-attn"]: @@ -375,6 +415,7 @@ def get_mlp_cls(sp_mode: str): mlp_cls = FSTPFeedForward return mlp_cls + def get_linear_cls(sp_mode: str, parallel_mode: str): if parallel_mode == "column": if sp_mode in ["none", "flash-attn"]: @@ -383,7 +424,7 @@ def get_linear_cls(sp_mode: str, parallel_mode: str): cls = MegatronColumnParallelLinearTorch else: cls = FSTPLinear - elif parallel_mode == 'row': + elif parallel_mode == "row": if sp_mode in ["none", "flash-attn"]: cls = RowParallelLinearTorch elif sp_mode == "megatron": @@ -391,192 +432,3 @@ def get_linear_cls(sp_mode: str, parallel_mode: str): else: cls = FSTPLinear return cls - -class CoarseGrainedFSTPAllGatherSyncHandler: - """ - All-gather handler for overlapping the all-gather in adjcent FSTP block. 
- """ - - def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> None: - # import pdb; pdb.set_trace() - self.process_group = process_group - self.FSTP_blocks = [] - self.FSTP_outs = [] - self.FSTP_modules = [] - self.module_name = ["Wqkv", "out_proj", "w1", "w2", "w3"] - self.FSTP_global_handle = dict() # key: FSTP module; value: module global all-gather op handle - self.FSTP_global_weights = dict() # key: FSTP module; value: module global weight for forward - self.block_handles = dict() # key: transformer block; value: all-gather handles - self.module_to_index = dict() # key: FSTP module; value: transformer block index - self.block_to_index = dict() # key: transformer block; value: transformer block index - self.index_to_block = dict() # key: transformer block index; value: transformer block - self.index_to_fsdp_modules = dict() # key: transformer block index; value: fsdp modules - self.module_name_index = dict() # key: FSTP module; value: the name in index in self.module_name - self.block_module = dict() # key: transformer block index; value: {name_index: FSTP module} - self.head = [] - self.embedding = [] - - self.reduce_scatter_handlers = {} - self.all_reduce_handlers = {} - self.zero_const_pool = {} - - # just want to share same for loop for ModuleList and Module - if not isinstance(model, nn.ModuleList): - model = [model] - - for _chunk in model: - if isinstance(_chunk, NaiveAMPModel): - _chunk = _chunk.model - - for _chunk_name, children in _chunk.named_children(): - if isinstance(children, nn.ModuleList): - for idx, block in enumerate(children): - index = 0 - self.block_module[idx] = {} - self.FSTP_blocks.append(block) - self.block_to_index[block] = idx - self.index_to_block[idx] = block - self.index_to_fsdp_modules[idx] = [] - for _sub_name, sub in block.named_children(): - sub_modules = list(sub.children()) - if len(sub_modules) > 0: - for name, child in sub.named_children(): - if name == "out_proj": - self.FSTP_outs.append(child) - self.module_to_index[child] = idx - if isinstance(child, FSTPLinear): - self.module_to_index[child] = idx - self.block_module[idx][index] = child - self.FSTP_modules.append(child) - self.index_to_fsdp_modules[idx].append(child) - self.module_name_index[child] = index - index = index + 1 - - _full_name = f"{_chunk_name}.{idx}.{_sub_name}.{name}" - setattr(child.weight, "_fstp_reduce_scatter_str", f"{_full_name}.weight") - if child.bias is not None: - setattr(child.bias, "_fstp_reduce_scatter_str", f"{_full_name}.bias") - else: - continue - elif isinstance(children, ScaleColumnParallelLinear): - self.head.append(children) - elif isinstance(children, Embedding1D): - self.embedding.append(children) - - def get_zero_by_shape(self, size: tuple, dtype, device) -> torch.Tensor: - if size not in self.zero_const_pool: - self.zero_const_pool[size] = torch.zeros(*size, dtype=dtype, device=device).contiguous() - - return self.zero_const_pool[size] - - def _all_gather_block_weight_memory_pool(self, block_index: int): - fsdp_modules = self.index_to_fsdp_modules[block_index] - for module in fsdp_modules: - module_index = self.module_name_index[module] - name = self.module_name[module_index] - weight_handle = all_gather_raw_memory_pool( - module.weight, self.process_group, async_op=True, block_index=block_index, module_name=name - ) - self.FSTP_global_handle[module] = weight_handle - - def _register_sync_parameters_hook(self) -> None: - """ - register pre_forward_hook and pre_backward_hook for FSTP block. 
- - Notice that next block's all_gather op should be after current block's all_to_all op, so we - 1. register pre_forward_hook @out_proj module to prefetch for next block - 2. register pre_forward_hook @block module to wait handles for next block - 3. register pre_backward_hook @wqkv module to prefetch for next block - 4. register pre_backward_hook @block module to wait handles for next block - """ - - def _pre_forward_hook_for_out_proj(module: nn.Module, inputs: Any): - block_index = self.module_to_index[module] - # start the all-gather for next block - if block_index + 1 < gpc.config.NUM_LAYER: - self._all_gather_block_weight_memory_pool(block_index + 1) - - def _post_forward_hook_for_embedding(module: nn.Module, inputs: Any, output): - self._all_gather_block_weight_memory_pool(0) - - def _pre_forward_hook_for_module(module: nn.Module, inputs: Any): - handle = self.FSTP_global_handle[module] - handle.wait() - - def _post_forward_hook_for_module(module: nn.Module, input, output): - if module in self.FSTP_global_weights: - del self.FSTP_global_weights[module] - if module in self.FSTP_global_handle: - del self.FSTP_global_handle[module] - - def _post_backward_hook_for_head(module: nn.Module, grad_input, grad_output): - first_module = self.block_module[gpc.config.NUM_LAYER - 1][4] - total_weight, weight_handler = all_gather_raw(first_module.weight, self.process_group, async_op=True) - self.FSTP_global_handle[first_module] = weight_handler - self.FSTP_global_weights[first_module] = total_weight - - def _pre_backward_hook_for_module_memory_pool(module: nn.Module, grad_output): - block_index = self.module_to_index[module] - name_index = self.module_name_index[module] - - if name_index == 4 and block_index == gpc.config.NUM_LAYER - 1: - weight_handler = self.FSTP_global_handle[module] - weight_handler.wait() - - # start the all-gather for next module - next_module = self.block_module[block_index][name_index - 1] - next_name = self.module_name[name_index - 1] - weights_handler = all_gather_raw_memory_pool( - next_module.weight, - self.process_group, - async_op=True, - block_index=block_index, - module_name=next_name, - ) - self.FSTP_global_handle[next_module] = weights_handler - elif name_index == 0: - handler = self.FSTP_global_handle[module] - handler.wait() - - if block_index - 1 >= 0: - next_module = self.block_module[block_index - 1][4] - name = self.module_name[4] - weights_handler = all_gather_raw_memory_pool( - next_module.weight, - self.process_group, - async_op=True, - block_index=block_index - 1, - module_name=name, - ) - self.FSTP_global_handle[next_module] = weights_handler - else: - handler = self.FSTP_global_handle[module] - handler.wait() - if name_index != 0: - next_module = self.block_module[block_index][name_index - 1] - name = self.module_name[name_index - 1] - weights_handler = all_gather_raw_memory_pool( - next_module.weight, self.process_group, async_op=True, block_index=block_index, module_name=name - ) - self.FSTP_global_handle[next_module] = weights_handler - - def _post_backward_hook_for_module(module, grad_input, grad_output): - if module in self.FSTP_global_weights: - del self.FSTP_global_weights[module] - if module in self.FSTP_global_handle: - del self.FSTP_global_handle[module] - - for embedding in self.embedding: - embedding.register_forward_hook(_post_forward_hook_for_embedding) - - for head in self.head: - head.register_full_backward_hook(_post_backward_hook_for_head) - - for out_proj in self.FSTP_outs: - 
out_proj.register_forward_pre_hook(_pre_forward_hook_for_out_proj) - - for module in self.FSTP_modules: - module.register_forward_pre_hook(_pre_forward_hook_for_module) - module.register_forward_hook(_post_forward_hook_for_module) - module.register_full_backward_pre_hook(_pre_backward_hook_for_module_memory_pool) - module.register_full_backward_hook(_post_backward_hook_for_module) diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index 3ed78d79..228e1e1c 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -14,12 +14,9 @@ from internlm.initialize.initialize_tensor import normal_, scaled_init_method_normal from internlm.model.embedding import Embedding1D from internlm.model.linear import ( - FeedForward, - MegatronFeedForward, - FSTPFeedForward, + MegatronScaleColumnParallelLinear, RewardModelLinear, ScaleColumnParallelLinear, - MegatronScaleColumnParallelLinear, get_mlp_cls, ) from internlm.model.multi_head_attention import MHA @@ -309,7 +306,11 @@ def __init__( if is_reward: head_cls = RewardModelLinear else: - head_cls = ScaleColumnParallelLinear if self.sp_mode in ["flash-attn", "none", "intern"] else MegatronScaleColumnParallelLinear + head_cls = ( + ScaleColumnParallelLinear + if self.sp_mode in ["flash-attn", "none", "intern"] + else MegatronScaleColumnParallelLinear + ) if first: if embed_split_hidden: self.embedding = Embedding1D(num_embeddings=vocab_size, embedding_dim=hidden_size) diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index 8ba49edd..93dbf010 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -38,14 +38,7 @@ from internlm.core.context import IS_TENSOR_PARALLEL, ParallelMode from internlm.core.context import global_context as gpc from internlm.model.embedding import DynamicNTKScalingRotaryEmbedding, RotaryEmbedding -from internlm.model.linear import ( - ColumnParallelLinearTorch, - FSTPLinear, - RowParallelLinearTorch, - MegatronColumnParallelLinearTorch, - MegatronRowParallelLinearTorch, - get_linear_cls, -) +from internlm.model.linear import get_linear_cls # adpated from https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/sequence/layer.py @@ -227,7 +220,7 @@ def __init__( self.inner_cross_attn = DistributedAttention(self.inner_cross_attn, sequence_process_group=process_group) # output projection always have the bias (for now) - out_proj_cls = get_linear_cls(sp_mode, 'row') + out_proj_cls = get_linear_cls(sp_mode, "row") self.out_proj = out_proj_cls( embed_dim, embed_dim, diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py new file mode 100644 index 00000000..cafb8183 --- /dev/null +++ b/internlm/model/overlap_handler.py @@ -0,0 +1,253 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- + +from typing import Any, Union + +import torch +from torch import nn + +from internlm.core.context import global_context as gpc +from internlm.core.naive_amp import NaiveAMPModel +from internlm.model.embedding import Embedding1D +from internlm.model.linear import FSTPLinear, ScaleColumnParallelLinear +from internlm.model.utils import all_gather_raw_memory_pool +from internlm.utils.common import get_current_device + + +class FSTPOverlapHandler: + """ + FSTP overlap handler for managing the all-gather and reduce_scatter overlapping. 
+ """ + + def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> None: + self.process_group = process_group + self.fstp_outs = [] + self.fstp_modules = [] + self.module_name = ["Wqkv", "out_proj", "w1", "w2", "w3"] + self.fstp_global_handle = dict() # key: fstp module; value: module global all-gather op handle + self.module_to_index = dict() # key: fstp module; value: transformer block index + self.index_to_fstp_modules = dict() # key: transformer block index; value: fsdp modules + self.head = [] + self.embedding = [] + + self.reduce_scatter_handlers = {} + self.zero_const_pool = {} + + # just want to share same for loop for ModuleList and Module + if not isinstance(model, nn.ModuleList): + model = [model] + + for _chunk in model: + if isinstance(_chunk, NaiveAMPModel): + _chunk = _chunk.model + + for _chunk_name, children in _chunk.named_children(): + if isinstance(children, ScaleColumnParallelLinear): + self.head.append(children) + elif isinstance(children, Embedding1D): + self.embedding.append(children) + elif isinstance(children, nn.ModuleList): + for idx, block in enumerate(children): + self.index_to_fstp_modules[idx] = [] + for _sub_name, sub in block.named_children(): + sub_modules = list(sub.children()) + if len(sub_modules) > 0: + for name, child in sub.named_children(): + if name == "out_proj": + self.fstp_outs.append(child) + self.module_to_index[child] = idx + if isinstance(child, FSTPLinear): + self.module_to_index[child] = idx + self.fstp_modules.append(child) + self.index_to_fstp_modules[idx].append(child) + + setattr(child, "_fstp_name", name) + + _full_name = f"{_chunk_name}.{idx}.{_sub_name}.{name}" + setattr(child.weight, "_fstp_reduce_scatter_str", f"{_full_name}.weight") + if child.bias is not None: + setattr(child.bias, "_fstp_reduce_scatter_str", f"{_full_name}.bias") + + self._initialize_memory_pool() + self._register_sync_parameters_hook() + + def get_zero_by_shape(self, size: tuple, dtype, device) -> torch.Tensor: + if size not in self.zero_const_pool: + self.zero_const_pool[size] = torch.zeros(*size, dtype=dtype, device=device).contiguous() + + return self.zero_const_pool[size] + + def _initialize_memory_pool(self) -> None: + # allocate memory pool + hidden_size = gpc.config.HIDDEN_SIZE + mlp_ratio = gpc.config.MLP_RATIO + mlp_hidden_size = int(hidden_size * mlp_ratio) + mlp_hidden_size = 256 * ((mlp_hidden_size + 256 - 1) // 256) + self.all_gather_memory_pool = [] + self.reduce_scatter_memory_pool = {} + + for _ in range(2): + weight = {} + for name in self.module_name: + if name == "Wqkv": + weight[name] = torch.zeros( + (3 * hidden_size, hidden_size), + dtype=gpc.config.model.get("dtype", torch.half), + device=get_current_device(), + ).contiguous() + elif name == "out_proj": + weight[name] = torch.zeros( + (hidden_size, hidden_size), + dtype=gpc.config.model.get("dtype", torch.half), + device=get_current_device(), + ).contiguous() + elif name == "w1" or name == "w2": + weight[name] = torch.zeros( + (mlp_hidden_size, hidden_size), + dtype=gpc.config.model.get("dtype", torch.half), + device=get_current_device(), + ).contiguous() + else: + weight[name] = torch.zeros( + (hidden_size, mlp_hidden_size), + dtype=gpc.config.model.get("dtype", torch.half), + device=get_current_device(), + ).contiguous() + + self.all_gather_memory_pool.append(weight) # containing two groups of block weight + + def get_all_gather_memory(self, index, module_name): + return self.all_gather_memory_pool[index % 2][module_name] + + def get_reduce_scatter_memory(self, 
key): + return_idx = 0 + + # if key not in dict + if key not in self.reduce_scatter_memory_pool: + self.reduce_scatter_memory_pool[key] = {"data": [], "used": []} + + # if the data is empty + if len(self.reduce_scatter_memory_pool[key]["data"]) == 0: + self.reduce_scatter_memory_pool[key]["data"].append( + torch.zeros( + key, dtype=gpc.config.model.get("dtype", torch.half), device=get_current_device() + ).contiguous() + ) + self.reduce_scatter_memory_pool[key]["used"].append(True) + return_idx = 0 + return return_idx + else: # if not empty + for index, used in enumerate(self.reduce_scatter_memory_pool[key]["used"]): + if used is False: + self.reduce_scatter_memory_pool[key]["used"][index] = True + return_idx = index + return return_idx + # if the memory pool is all used + length = len(self.reduce_scatter_memory_pool[key]["data"]) + self.reduce_scatter_memory_pool[key]["data"].append( + torch.zeros( + key, dtype=gpc.config.model.get("dtype", torch.half), device=get_current_device() + ).contiguous() + ) + self.reduce_scatter_memory_pool[key]["used"].append(True) + return_idx = length + return return_idx + + def release_reduce_scatter_memory(self, size, index): + self.reduce_scatter_memory_pool[size]["used"][index] = False + + def _all_gather_block_weight_memory_pool(self, block_index: int): + fstp_modules = self.index_to_fstp_modules[block_index] + for module in fstp_modules: + weight_handle = all_gather_raw_memory_pool( + module.weight, + self.process_group, + async_op=True, + block_index=block_index, + module_name=getattr(module, "_fstp_name"), + ) + self.fstp_global_handle[module] = weight_handle + + def _register_sync_parameters_hook(self) -> None: + """ + register forward hooks and backward hooks for fstp modules. + """ + + def _post_forward_hook_for_embedding(module: nn.Module, inputs: Any, output: Any): + self._all_gather_block_weight_memory_pool(0) + + def _pre_forward_hook_for_out_proj(module: nn.Module, inputs: Any): + block_index = self.module_to_index[module] + # start the all-gather for next block + if block_index + 1 < gpc.config.NUM_LAYER: + self._all_gather_block_weight_memory_pool(block_index + 1) + + def _pre_forward_hook_for_module(module: nn.Module, inputs: Any): + handle = self.fstp_global_handle[module] + handle.wait() + + def _post_forward_hook_for_module(module: nn.Module, inputs: Any, output: Any): + if module in self.fstp_global_handle: + del self.fstp_global_handle[module] + + def _post_backward_hook_for_head(module: nn.Module, grad_input, grad_output): + first_backward_module = self.fstp_modules[-1] + block_index = self.module_to_index[first_backward_module] + weight_handle = all_gather_raw_memory_pool( + first_backward_module.weight, + self.process_group, + async_op=True, + block_index=block_index, + module_name=getattr(first_backward_module, "_fstp_name"), + ) + self.fstp_global_handle[first_backward_module] = weight_handle + + def _pre_backward_hook_for_module(module: nn.Module, grad_output): + # wait handle for current module + weight_handle = self.fstp_global_handle[module] + weight_handle.wait() + + # start the all-gather for next module + module_index = self.fstp_modules.index(module) + if module_index - 1 >= 0: + next_module = self.fstp_modules[module_index - 1] + block_index = self.module_to_index[next_module] + weight_handle = all_gather_raw_memory_pool( + next_module.weight, + self.process_group, + async_op=True, + block_index=block_index, + module_name=getattr(next_module, "_fstp_name"), + ) + self.fstp_global_handle[next_module] = weight_handle + + 
def _post_backward_hook_for_module(module, grad_input, grad_output):
+            if module in self.fstp_global_handle:
+                del self.fstp_global_handle[module]
+
+        # register forward hooks
+        # 1. register post_forward_hook @embedding module to prefetch for block 0
+        # 2. register pre_forward_hook @out_proj module to prefetch for next block,
+        # notice that next block's all_gather op should be after current block's all_to_all op
+        # 3. register pre_forward_hook @fstp_module to wait handle for current module
+        # 4. register post_forward_hook @fstp_module to release resource
+        for embedding in self.embedding:
+            embedding.register_forward_hook(_post_forward_hook_for_embedding)
+
+        for out_proj in self.fstp_outs:
+            out_proj.register_forward_pre_hook(_pre_forward_hook_for_out_proj)
+
+        for module in self.fstp_modules:
+            module.register_forward_pre_hook(_pre_forward_hook_for_module)
+            module.register_forward_hook(_post_forward_hook_for_module)
+
+        # register backward hooks
+        # 1. register post_backward_hook @head module to prefetch for the last block's last module
+        # 2. register pre_backward_hook @fstp_module to wait handle for current module and to prefetch for next module
+        # 3. register post_backward_hook @fstp_module to release resource
+        for head in self.head:
+            head.register_full_backward_hook(_post_backward_hook_for_head)
+
+        for module in self.fstp_modules:
+            module.register_full_backward_pre_hook(_pre_backward_hook_for_module)
+            module.register_full_backward_hook(_post_backward_hook_for_module)
diff --git a/internlm/model/utils.py b/internlm/model/utils.py
index b1894e9f..ccdca481 100644
--- a/internlm/model/utils.py
+++ b/internlm/model/utils.py
@@ -135,7 +135,7 @@ def all_gather_raw_memory_pool(
     module_name: str = None,
 ):
     handle = torch.distributed.all_gather_into_tensor(
-        gpc.config.block_memory[block_index % 2][module_name],
+        gpc.config.fstp_handler.get_all_gather_memory(block_index, module_name),
         input_.contiguous(),
         group=process_group,
         async_op=async_op,
@@ -166,8 +166,8 @@ def reduce_scatter_raw_memory_pool(input_: Tensor, process_group: ProcessGroup,
     world_size = torch.distributed.get_world_size(process_group)
     assert input_.shape[0] % world_size == 0
     size = (input_.shape[0] // world_size, *input_.shape[1:])
-    index = check_reduce_scatter_memory_pool(size)
-    output = gpc.config.reduce_scatter_memory[size]["data"][index]
+    index = gpc.config.fstp_handler.get_reduce_scatter_memory(size)
+    output = gpc.config.fstp_handler.reduce_scatter_memory_pool[size]["data"][index]
     setattr(output, "index", index)
     handle = torch.distributed.reduce_scatter_tensor(
         output, input_.contiguous(), group=process_group, async_op=async_op
@@ -269,11 +269,11 @@
 class MegatronFusedDenseFunc(torch.autograd.Function):
-    '''
+    """
     FusedDenseFunc for tensor parallel in megatron implementation.
     The difference between the implementation of flash-attn and megatron is that the total_x could be saved for backward in megatron,
     so that the all-gather in backward is omitted.
- ''' + """ @staticmethod @custom_fwd @@ -355,9 +355,10 @@ def backward(ctx, grad_output, *args): handle_grad_input.wait() return grad_input, grad_weight, grad_bias, None, None, None, None + # adpated from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/ops/fused_dense.py class FusedDenseFuncTorch(FusedDenseFunc): - '''FusedDenseFunc in flash implementation for supporting torch.float32''' + """FusedDenseFunc in flash implementation for supporting torch.float32""" @staticmethod @custom_bwd @@ -407,8 +408,9 @@ def backward(ctx, grad_output, *args): handle_grad_input.wait() return grad_input, grad_weight, grad_bias, None, None, None, None + class MegatronFusedDenseFuncTorch(FusedDenseFunc): - '''FusedDenseFunc in megatron implementation for supporting torch.float32''' + """FusedDenseFunc in megatron implementation for supporting torch.float32""" @staticmethod @custom_bwd @@ -452,6 +454,7 @@ def backward(ctx, grad_output, *args): handle_grad_input.wait() return grad_input, grad_weight, grad_bias, None, None, None, None + class FSTPFusedDenseFunc(torch.autograd.Function): "FusedDenseFunc for FSTP, which is optimized based on flash implementation." @@ -485,7 +488,7 @@ def forward( if world_size > 1: # do all_gather for weight and bias before actual computation if overlap_handler is not None: - total_weight = gpc.config.block_memory[block_index % 2][module_name] + total_weight = gpc.config.fstp_handler.get_all_gather_memory(block_index, module_name) else: total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) handle_weight.wait() @@ -544,7 +547,7 @@ def backward(ctx, grad_output, *args): world_size = gpc.get_world_size(ParallelMode.TENSOR) if world_size > 1: if overlap_handler is not None: - total_weight = gpc.config.block_memory[block_index % 2][module_name] + total_weight = gpc.config.fstp_handler.get_all_gather_memory(block_index, module_name) else: total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) handle_weight.wait() @@ -559,17 +562,39 @@ def backward(ctx, grad_output, *args): ) if world_size > 1: if overlap_handler is not None: - grad_weight_async, handle_grad_weight = reduce_scatter_raw_memory_pool(grad_weight, process_group, async_op=True) + grad_weight_async, handle_grad_weight = reduce_scatter_raw_memory_pool( + grad_weight, process_group, async_op=True + ) assert hasattr(weight, "_fstp_reduce_scatter_str") - overlap_handler.reduce_scatter_handlers[weight._fstp_reduce_scatter_str] = (handle_grad_weight, grad_weight_async) - grad_weight = overlap_handler.get_zero_by_shape((grad_weight.shape[0]//torch.distributed.get_world_size(process_group), *grad_weight.shape[1:]), dtype=grad_weight.dtype, device=grad_weight.device) + overlap_handler.reduce_scatter_handlers[weight._fstp_reduce_scatter_str] = ( + handle_grad_weight, + grad_weight_async, + ) + grad_weight = overlap_handler.get_zero_by_shape( + ( + grad_weight.shape[0] // torch.distributed.get_world_size(process_group), + *grad_weight.shape[1:], + ), + dtype=grad_weight.dtype, + device=grad_weight.device, + ) if grad_bias is not None: grad_bias_async, handle_grad_bias = reduce_scatter_raw_memory_pool( grad_bias, process_group, async_op=True ) assert hasattr(bias, "_fstp_reduce_scatter_str") - overlap_handler.reduce_scatter_handlers[bias._fstp_reduce_scatter_str] = (handle_grad_bias, grad_bias_async) - grad_bias = overlap_handler.get_zero_by_shape((grad_bias.shape[0]//torch.distributed.get_world_size(process_group), *grad_bias.shape[1:]), dtype=grad_bias.dtype, 
device=grad_bias.device) + overlap_handler.reduce_scatter_handlers[bias._fstp_reduce_scatter_str] = ( + handle_grad_bias, + grad_bias_async, + ) + grad_bias = overlap_handler.get_zero_by_shape( + ( + grad_bias.shape[0] // torch.distributed.get_world_size(process_group), + *grad_bias.shape[1:], + ), + dtype=grad_bias.dtype, + device=grad_bias.device, + ) else: grad_weight, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) if grad_bias is not None: @@ -613,6 +638,7 @@ def fused_dense_func_torch( else: return FusedDenseFuncTorch.apply(x, weight, bias, return_residual, process_group, sequence_parallel, gather_dim) + def megatron_fused_dense_func_torch( x: Tensor, weight: Tensor, @@ -626,9 +652,14 @@ def megatron_fused_dense_func_torch( x.dtype == torch.float32 and torch.is_autocast_enabled() ) if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible: - return MegatronFusedDenseFunc.apply(x, weight, bias, return_residual, process_group, sequence_parallel, gather_dim) + return MegatronFusedDenseFunc.apply( + x, weight, bias, return_residual, process_group, sequence_parallel, gather_dim + ) else: - return MegatronFusedDenseFuncTorch.apply(x, weight, bias, return_residual, process_group, sequence_parallel, gather_dim) + return MegatronFusedDenseFuncTorch.apply( + x, weight, bias, return_residual, process_group, sequence_parallel, gather_dim + ) + def fstp_fused_dense_func( x: Tensor, @@ -693,38 +724,3 @@ def Silu(w1_o, w2_o): Silu = torch.jit.script(Silu) - - -def check_reduce_scatter_memory_pool(key): - return_idx = 0 - - # if key not in dict - if key not in gpc.config.reduce_scatter_memory: - gpc.config.reduce_scatter_memory[key] = {"data": [], "used": []} - - # if the data is empty - if len(gpc.config.reduce_scatter_memory[key]["data"]) == 0: - gpc.config.reduce_scatter_memory[key]["data"].append( - torch.zeros(key, dtype=gpc.config.model.get("dtype", torch.half), device=get_current_device()).contiguous() - ) - gpc.config.reduce_scatter_memory[key]["used"].append(True) - return_idx = 0 - return return_idx - else: # if not empty - for index, used in enumerate(gpc.config.reduce_scatter_memory[key]["used"]): - if used is False: - gpc.config.reduce_scatter_memory[key]["used"][index] = True - return_idx = index - return return_idx - # if the memory pool is all used - length = len(gpc.config.reduce_scatter_memory[key]["data"]) - gpc.config.reduce_scatter_memory[key]["data"].append( - torch.zeros(key, dtype=gpc.config.model.get("dtype", torch.half), device=get_current_device()).contiguous() - ) - gpc.config.reduce_scatter_memory[key]["used"].append(True) - return_idx = length - return return_idx - - -def release_reduce_scatter_memory_pool(size, index): - gpc.config.reduce_scatter_memory[size]["used"][index] = False diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index 0f536ec5..e2ec7efd 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -11,7 +11,6 @@ from internlm.core.context import Config, ParallelMode from internlm.core.context import global_context as gpc -from internlm.model.utils import release_reduce_scatter_memory_pool from internlm.monitor import send_alert_message from internlm.solver.optimizer.store import ( BucketStore, @@ -41,6 +40,7 @@ inf = math.inf logger = get_logger(__file__) + class HybridZeroOptimizer(BaseOptimizer): """ Hybrid Zero Optimizer. 
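For reference, the reduce-scatter memory pool that FSTPOverlapHandler now owns is a per-shape free list: get_reduce_scatter_memory returns the index of a free buffer (growing the pool when every buffer of that shape is busy), and the optimizer below releases the buffer once the gradient has been accumulated. A stripped-down sketch of the same protocol, with dtype/device as illustrative defaults:

import torch


class ReduceScatterPool:
    """Per-shape free list of reusable output buffers."""

    def __init__(self, dtype=torch.float16, device="cpu"):
        self.dtype, self.device = dtype, device
        self.pool = {}  # shape -> {"data": [tensors], "used": [flags]}

    def acquire(self, size: tuple) -> int:
        entry = self.pool.setdefault(size, {"data": [], "used": []})
        for idx, used in enumerate(entry["used"]):
            if not used:
                # Reuse the first free buffer of this shape.
                entry["used"][idx] = True
                return idx
        # First call, or all buffers of this shape are in flight: grow by one.
        entry["data"].append(torch.zeros(size, dtype=self.dtype, device=self.device))
        entry["used"].append(True)
        return len(entry["data"]) - 1

    def release(self, size: tuple, index: int) -> None:
        # Mark the buffer free for the next reduce-scatter of the same shape.
        self.pool[size]["used"][index] = False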
@@ -65,7 +65,7 @@ def __init__( backoff_factor = grad_scal_cfg.backoff_factor hysteresis = grad_scal_cfg.hysteresis max_scale = grad_scal_cfg.max_scale - + self._fstp_handler = None if gpc.config.parallel["tensor"]["sp"] == "intern" and gpc.config.parallel["tensor"]["intern_overlap"] is True: self._fstp_handler = gpc.config.fstp_handler @@ -350,7 +350,7 @@ def _accum_grads_store_in_bucket(self, bucket: BucketStore, reduce_rank: Optiona _param.grad.add_(_grad) # release cuda memory. - release_reduce_scatter_memory_pool(size=tuple(_grad.size()), index=_grad.index) + gpc.config.fstp_handler.release_reduce_scatter_memory(size=tuple(_grad.size()), index=_grad.index) self._fstp_handler.reduce_scatter_handlers[_key] = None bucket.reset_by_rank(reduce_rank) @@ -635,9 +635,9 @@ def step(self, closure=None): timer("sync_grad").start() self._sync_grad() timer("sync_grad").stop() - - res = self._step(closure=closure, norms=total_norms) - + + res = self._step(closure=closure, norms=total_norms) + return res def _step(self, closure=None, norms=None): diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 53996b38..cabb7ebd 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -36,12 +36,12 @@ from internlm.data.utils import DATASET_TYPE_IDS_MAP, unpack_data from internlm.model.embedding import Embedding1D from internlm.model.linear import ( - CoarseGrainedFSTPAllGatherSyncHandler, FeedForward, RewardModelLinear, ScaleColumnParallelLinear, ) from internlm.model.multi_head_attention import MHA +from internlm.model.overlap_handler import FSTPOverlapHandler from internlm.model.utils import try_import_RMSNorm from internlm.monitor import send_heartbeat, set_env_var from internlm.monitor.monitor import monitor_manager as mm @@ -109,60 +109,8 @@ def initialize_model(): model = wrap_FSDP_model(model) gpc.config.fstp_handler = None - if gpc.config.parallel["tensor"]["sp"] == "intern" and gpc.config.parallel["tensor"]["intern_overlap"] is True: - handler = CoarseGrainedFSTPAllGatherSyncHandler(model, gpc.get_group(ParallelMode.TENSOR)) - handler._register_sync_parameters_hook() - gpc.config.fstp_handler = handler - - # allocate memory pool - block_memory = {} # containing two groups of block weight - hidden_size = gpc.config.HIDDEN_SIZE - mlp_ratio = gpc.config.MLP_RATIO - mlp_hidden_size = int(hidden_size * mlp_ratio) - mlp_hidden_size = 256 * ((mlp_hidden_size + 256 - 1) // 256) - world_size = gpc.get_world_size(ParallelMode.TENSOR) - size_key = [ - (3 * hidden_size // world_size, hidden_size), - (mlp_hidden_size // world_size, hidden_size), - (hidden_size // world_size, mlp_hidden_size), - (hidden_size // world_size, hidden_size), - ] - module_name = ["Wqkv", "out_proj", "w1", "w2", "w3"] - for i in range(2): - weight = {} - for name in module_name: - if name == "Wqkv": - weight[name] = torch.zeros( - (3 * hidden_size, hidden_size), - dtype=gpc.config.model.get("dtype", torch.half), - device=get_current_device(), - ).contiguous() - elif name == "out_proj": - weight[name] = torch.zeros( - (hidden_size, hidden_size), - dtype=gpc.config.model.get("dtype", torch.half), - device=get_current_device(), - ).contiguous() - elif name == "w1" or name == "w2": - weight[name] = torch.zeros( - (mlp_hidden_size, hidden_size), - dtype=gpc.config.model.get("dtype", torch.half), - device=get_current_device(), - ).contiguous() - else: - weight[name] = torch.zeros( - (hidden_size, mlp_hidden_size), - dtype=gpc.config.model.get("dtype", torch.half), - 
device=get_current_device(), - ).contiguous() - block_memory[i] = weight - reduce_scatter_memory = {} - for key in size_key: - reduce_scatter_memory[key] = {"data": [], "used": []} - - gpc.config.block_memory = block_memory - gpc.config.reduce_scatter_memory = reduce_scatter_memory + gpc.config.fstp_handler = FSTPOverlapHandler(model, gpc.get_group(ParallelMode.TENSOR)) return model diff --git a/train.py b/train.py index 02f28028..5066960e 100644 --- a/train.py +++ b/train.py @@ -299,7 +299,7 @@ def main(args): if gpc.config.fstp_handler is not None: gpc.config.fstp_handler.zero_const_pool = {} - gpc.config.fstp_handler.reduce_scatter_memory = {} + gpc.config.fstp_handler.reduce_scatter_memory_pool = {} # torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") torch.cuda.reset_peak_memory_stats() From b20f47a1fe5fb446f2d9df5a83b31cb6033579f0 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 23 Oct 2023 12:02:32 +0800 Subject: [PATCH 042/153] feat(model/overlap_handler.py): move handler to gpc --- internlm/model/linear.py | 5 +--- internlm/model/overlap_handler.py | 16 ++++------ internlm/model/utils.py | 29 ++++++------------- .../solver/optimizer/hybrid_zero_optim.py | 4 +-- internlm/train/training_internlm.py | 4 +-- train.py | 6 ++-- 6 files changed, 23 insertions(+), 41 deletions(-) diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 6cd3b9c8..b92b2ee5 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -352,16 +352,13 @@ def __init__( class FSTPLinear(ColumnParallelLinear): def forward(self, x): - block_index = gpc.config.fstp_handler.module_to_index[self] return fstp_fused_dense_func( x, self.weight, self.bias, process_group=self.process_group, module=self, - handler=gpc.config.fstp_handler, - block_index=block_index, - module_name=self._fstp_name, + handler=gpc.fstp_handler, ) diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index cafb8183..b6877234 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -116,8 +116,9 @@ def _initialize_memory_pool(self) -> None: self.all_gather_memory_pool.append(weight) # containing two groups of block weight - def get_all_gather_memory(self, index, module_name): - return self.all_gather_memory_pool[index % 2][module_name] + def get_all_gather_memory(self, module): + block_index = self.module_to_index[module] + return self.all_gather_memory_pool[block_index % 2][module._fstp_name] def get_reduce_scatter_memory(self, key): return_idx = 0 @@ -163,8 +164,7 @@ def _all_gather_block_weight_memory_pool(self, block_index: int): module.weight, self.process_group, async_op=True, - block_index=block_index, - module_name=getattr(module, "_fstp_name"), + module=module, ) self.fstp_global_handle[module] = weight_handle @@ -192,13 +192,11 @@ def _post_forward_hook_for_module(module: nn.Module, inputs: Any, output: Any): def _post_backward_hook_for_head(module: nn.Module, grad_input, grad_output): first_backward_module = self.fstp_modules[-1] - block_index = self.module_to_index[first_backward_module] weight_handle = all_gather_raw_memory_pool( first_backward_module.weight, self.process_group, async_op=True, - block_index=block_index, - module_name=getattr(first_backward_module, "_fstp_name"), + module=first_backward_module, ) self.fstp_global_handle[first_backward_module] = weight_handle @@ -211,13 +209,11 @@ def _pre_backward_hook_for_module(module: nn.Module, grad_output): module_index = 
self.fstp_modules.index(module) if module_index - 1 >= 0: next_module = self.fstp_modules[module_index - 1] - block_index = self.module_to_index[next_module] weight_handle = all_gather_raw_memory_pool( next_module.weight, self.process_group, async_op=True, - block_index=block_index, - module_name=getattr(next_module, "_fstp_name"), + module=next_module, ) self.fstp_global_handle[next_module] = weight_handle diff --git a/internlm/model/utils.py b/internlm/model/utils.py index ccdca481..cdbed954 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -7,13 +7,12 @@ import torch import torch.nn.functional as F from flash_attn.utils.distributed import all_reduce_raw -from torch import Tensor +from torch import Tensor, nn from torch.cuda.amp import custom_bwd, custom_fwd from torch.distributed import ProcessGroup from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc -from internlm.utils.common import get_current_device from internlm.utils.logger import get_logger logger = get_logger(__file__) @@ -131,11 +130,10 @@ def all_gather_raw_memory_pool( process_group: ProcessGroup, async_op: bool = False, gather_dim: int = 0, - block_index: int = None, - module_name: str = None, + module: nn.Module = None, ): handle = torch.distributed.all_gather_into_tensor( - gpc.config.fstp_handler.get_all_gather_memory(block_index, module_name), + gpc.fstp_handler.get_all_gather_memory(module=module), input_.contiguous(), group=process_group, async_op=async_op, @@ -166,8 +164,8 @@ def reduce_scatter_raw_memory_pool(input_: Tensor, process_group: ProcessGroup, world_size = torch.distributed.get_world_size(process_group) assert input_.shape[0] % world_size == 0 size = (input_.shape[0] // world_size, *input_.shape[1:]) - index = gpc.config.fstp_handler.get_reduce_scatter_memory(size) - output = gpc.config.fstp_handler.reduce_scatter_memory_pool[size]["data"][index] + index = gpc.fstp_handler.get_reduce_scatter_memory(size) + output = gpc.fstp_handler.reduce_scatter_memory_pool[size]["data"][index] setattr(output, "index", index) handle = torch.distributed.reduce_scatter_tensor( output, input_.contiguous(), group=process_group, async_op=async_op @@ -469,16 +467,12 @@ def forward( process_group=None, module=None, overlap_handler=None, - block_index=None, - module_name=None, ): ctx.compute_weight_gradient = weight.requires_grad ctx.return_residual = return_residual ctx.process_group = process_group ctx.overlap_handler = overlap_handler ctx.module = module - ctx.block_index = block_index - ctx.module_name = module_name if torch.is_autocast_enabled(): x = x.to(dtype=torch.get_autocast_gpu_dtype()) @@ -488,7 +482,7 @@ def forward( if world_size > 1: # do all_gather for weight and bias before actual computation if overlap_handler is not None: - total_weight = gpc.config.fstp_handler.get_all_gather_memory(block_index, module_name) + total_weight = gpc.fstp_handler.get_all_gather_memory(module=module) else: total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) handle_weight.wait() @@ -531,8 +525,7 @@ def backward(ctx, grad_output, *args): grad_input = grad_input.contiguous() process_group = ctx.process_group overlap_handler = ctx.overlap_handler - block_index = ctx.block_index - module_name = ctx.module_name + module = ctx.module if ctx.compute_weight_gradient: x, weight, bias = ctx.saved_tensors @@ -547,7 +540,7 @@ def backward(ctx, grad_output, *args): world_size = gpc.get_world_size(ParallelMode.TENSOR) if world_size > 1: if 
overlap_handler is not None: - total_weight = gpc.config.fstp_handler.get_all_gather_memory(block_index, module_name) + total_weight = gpc.fstp_handler.get_all_gather_memory(module=module) else: total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) handle_weight.wait() @@ -669,16 +662,12 @@ def fstp_fused_dense_func( process_group=None, module=None, handler=None, - block_index=None, - module_name=None, ): dtype_eligible = x.dtype in [torch.float16, torch.bfloat16] or ( x.dtype == torch.float32 and torch.is_autocast_enabled() ) if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible: - return FSTPFusedDenseFunc.apply( - x, weight, bias, return_residual, process_group, module, handler, block_index, module_name - ) + return FSTPFusedDenseFunc.apply(x, weight, bias, return_residual, process_group, module, handler) else: assert process_group is None out = F.linear(x, weight, bias) diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index e2ec7efd..08d97229 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -68,7 +68,7 @@ def __init__( self._fstp_handler = None if gpc.config.parallel["tensor"]["sp"] == "intern" and gpc.config.parallel["tensor"]["intern_overlap"] is True: - self._fstp_handler = gpc.config.fstp_handler + self._fstp_handler = gpc.fstp_handler # Zero related args reduce_bucket_size = zero_cfg.reduce_bucket_size @@ -350,7 +350,7 @@ def _accum_grads_store_in_bucket(self, bucket: BucketStore, reduce_rank: Optiona _param.grad.add_(_grad) # release cuda memory. - gpc.config.fstp_handler.release_reduce_scatter_memory(size=tuple(_grad.size()), index=_grad.index) + gpc.fstp_handler.release_reduce_scatter_memory(size=tuple(_grad.size()), index=_grad.index) self._fstp_handler.reduce_scatter_handlers[_key] = None bucket.reset_by_rank(reduce_rank) diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index cabb7ebd..b05611bc 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -108,9 +108,9 @@ def initialize_model(): # if fsdp enabled, wrap the model model = wrap_FSDP_model(model) - gpc.config.fstp_handler = None + gpc.fstp_handler = None if gpc.config.parallel["tensor"]["sp"] == "intern" and gpc.config.parallel["tensor"]["intern_overlap"] is True: - gpc.config.fstp_handler = FSTPOverlapHandler(model, gpc.get_group(ParallelMode.TENSOR)) + gpc.fstp_handler = FSTPOverlapHandler(model, gpc.get_group(ParallelMode.TENSOR)) return model diff --git a/train.py b/train.py index 5066960e..96dc24d1 100644 --- a/train.py +++ b/train.py @@ -297,9 +297,9 @@ def main(args): prof.step() - if gpc.config.fstp_handler is not None: - gpc.config.fstp_handler.zero_const_pool = {} - gpc.config.fstp_handler.reduce_scatter_memory_pool = {} + if gpc.fstp_handler is not None: + gpc.fstp_handler.zero_const_pool = {} + gpc.fstp_handler.reduce_scatter_memory_pool = {} # torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") torch.cuda.reset_peak_memory_stats()
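With this patch the overlap handler hangs off gpc directly, and every memory-pool lookup is keyed by the module object itself instead of a (block_index, module_name) pair threaded through the autograd function. A minimal runnable sketch of the double-buffering rule behind get_all_gather_memory, with toy stand-ins rather than the project's classes:

class ToyModule:
    def __init__(self, name):
        self._fstp_name = name  # mirrors the handler's per-module name tag

# Two pre-allocated buffer groups; blocks alternate between them, so
# prefetching weights for block i+1 never clobbers buffers block i still reads.
pool = [{"Wqkv": "buffer_A"}, {"Wqkv": "buffer_B"}]
modules = [ToyModule("Wqkv") for _ in range(4)]
module_to_index = {m: i for i, m in enumerate(modules)}  # module -> block index

def get_all_gather_memory(module):
    return pool[module_to_index[module] % 2][module._fstp_name]

assert get_all_gather_memory(modules[0]) != get_all_gather_memory(modules[1])
assert get_all_gather_memory(modules[0]) == get_all_gather_memory(modules[2])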
From e7f9f1d20853e856f175d178bf94350871744b67 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 23 Oct 2023 13:31:23 +0800 Subject: [PATCH 043/153] feat(model/overlap_handler.py): optimize reduce scatter mem pool --- internlm/model/overlap_handler.py | 35 ++++++++++--------- internlm/model/utils.py | 4 +-- .../solver/optimizer/hybrid_zero_optim.py | 2 +- 3 files changed, 20 insertions(+), 21 deletions(-) diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index b6877234..b3c8b8b0 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -125,37 +125,38 @@ def get_reduce_scatter_memory(self, key): # if key not in dict if key not in self.reduce_scatter_memory_pool: - self.reduce_scatter_memory_pool[key] = {"data": [], "used": []} + self.reduce_scatter_memory_pool[key] = [] # if the data is empty - if len(self.reduce_scatter_memory_pool[key]["data"]) == 0: - self.reduce_scatter_memory_pool[key]["data"].append( + if len(self.reduce_scatter_memory_pool[key]) == 0: + self.reduce_scatter_memory_pool[key].append( torch.zeros( key, dtype=gpc.config.model.get("dtype", torch.half), device=get_current_device() ).contiguous() ) - self.reduce_scatter_memory_pool[key]["used"].append(True) - return_idx = 0 - return return_idx + setattr(self.reduce_scatter_memory_pool[key][return_idx], "idle", False) + setattr(self.reduce_scatter_memory_pool[key][return_idx], "index", return_idx) + return self.reduce_scatter_memory_pool[key][return_idx] else: # if not empty - for index, used in enumerate(self.reduce_scatter_memory_pool[key]["used"]): - if used is False: - self.reduce_scatter_memory_pool[key]["used"][index] = True + for index, mem_item in enumerate(self.reduce_scatter_memory_pool[key]): + if mem_item.idle is True: + self.reduce_scatter_memory_pool[key][index].idle = False return_idx = index - return return_idx + return self.reduce_scatter_memory_pool[key][return_idx] # if the memory pool is all used - length = len(self.reduce_scatter_memory_pool[key]["data"]) - self.reduce_scatter_memory_pool[key]["data"].append( + cur_len = len(self.reduce_scatter_memory_pool[key]) + self.reduce_scatter_memory_pool[key].append( torch.zeros( key, dtype=gpc.config.model.get("dtype", torch.half), device=get_current_device() ).contiguous() ) - self.reduce_scatter_memory_pool[key]["used"].append(True) - return_idx = length - return return_idx + setattr(self.reduce_scatter_memory_pool[key][cur_len], "idle", False) + return_idx = cur_len + setattr(self.reduce_scatter_memory_pool[key][return_idx], "index", return_idx) + return self.reduce_scatter_memory_pool[key][return_idx] - def release_reduce_scatter_memory(self, size, index): - self.reduce_scatter_memory_pool[size]["used"][index] = False + def release_reduce_scatter_memory(self, key, index): + self.reduce_scatter_memory_pool[key][index].idle = True def _all_gather_block_weight_memory_pool(self, block_index: int): fstp_modules = self.index_to_fstp_modules[block_index] diff --git a/internlm/model/utils.py b/internlm/model/utils.py index cdbed954..8070cbdc 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -164,9 +164,7 @@ def reduce_scatter_raw_memory_pool(input_: Tensor, process_group: ProcessGroup, world_size = torch.distributed.get_world_size(process_group) assert input_.shape[0] % world_size == 0 size = (input_.shape[0] // world_size, *input_.shape[1:]) - index = gpc.fstp_handler.get_reduce_scatter_memory(size) - output = gpc.fstp_handler.reduce_scatter_memory_pool[size]["data"][index] - setattr(output, "index", index) + output = gpc.fstp_handler.get_reduce_scatter_memory(size) handle = torch.distributed.reduce_scatter_tensor( output, input_.contiguous(), group=process_group, async_op=async_op ) diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index 08d97229..0d0c8a3b 100644 ---
a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -350,7 +350,7 @@ def _accum_grads_store_in_bucket(self, bucket: BucketStore, reduce_rank: Optiona _param.grad.add_(_grad) # release cuda memory. - gpc.fstp_handler.release_reduce_scatter_memory(size=tuple(_grad.size()), index=_grad.index) + gpc.fstp_handler.release_reduce_scatter_memory(key=tuple(_grad.size()), index=_grad.index) self._fstp_handler.reduce_scatter_handlers[_key] = None bucket.reset_by_rank(reduce_rank)
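The rewritten pool above replaces the parallel "data"/"used" lists with a flat list of buffers that carry their own idle flag and index, which is what lets reduce_scatter_raw_memory_pool hand the buffer straight to the collective and lets the optimizer release it later by (key, index). A minimal sketch of that get/release protocol, with plain objects standing in for CUDA tensors (illustrative, not the project's API):

class Buf:
    def __init__(self, index):
        self.idle = False   # handed out as busy, exactly like the real pool
        self.index = index

pool = {}

def get_reduce_scatter_memory(key):
    items = pool.setdefault(key, [])
    for item in items:              # reuse the first idle buffer of this shape
        if item.idle:
            item.idle = False
            return item
    items.append(Buf(len(items)))   # all busy: grow the pool by one buffer
    return items[-1]

def release_reduce_scatter_memory(key, index):
    pool[key][index].idle = True    # reusable once the gradient is consumed

a = get_reduce_scatter_memory((4, 2))
b = get_reduce_scatter_memory((4, 2))
release_reduce_scatter_memory((4, 2), a.index)
assert get_reduce_scatter_memory((4, 2)) is a   # recycled, pool stays at two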
From f6a5086fe4203727ed96ce4444493a080d91b74d Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Mon, 23 Oct 2023 14:51:27 +0800 Subject: [PATCH 044/153] support bias --- internlm/model/overlap_handler.py | 85 ++++++++++++++++++++----------- internlm/model/utils.py | 22 +++++++- 2 files changed, 75 insertions(+), 32 deletions(-) diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index b3c8b8b0..f7132c3b 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -10,7 +10,7 @@ from internlm.core.naive_amp import NaiveAMPModel from internlm.model.embedding import Embedding1D from internlm.model.linear import FSTPLinear, ScaleColumnParallelLinear -from internlm.model.utils import all_gather_raw_memory_pool +from internlm.model.utils import all_gather_raw_memory_pool, all_gather_raw_bias_memory_pool from internlm.utils.common import get_current_device @@ -25,6 +25,7 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non self.fstp_modules = [] self.module_name = ["Wqkv", "out_proj", "w1", "w2", "w3"] self.fstp_global_handle = dict() # key: fstp module; value: module global all-gather op handle + self.bias_global_handle = dict() # key: fstp module; value: module bias global all-gather op handle self.module_to_index = dict() # key: fstp module; value: transformer block index self.index_to_fstp_modules = dict() # key: transformer block index; value: fsdp modules self.head = [] @@ -76,49 +77,61 @@ def get_zero_by_shape(self, size: tuple, dtype, device) -> torch.Tensor: self.zero_const_pool[size] = torch.zeros(*size, dtype=dtype, device=device).contiguous() return self.zero_const_pool[size] - - def _initialize_memory_pool(self) -> None: - # allocate memory pool + + def _initialize_module_shape(self): hidden_size = gpc.config.HIDDEN_SIZE mlp_ratio = gpc.config.MLP_RATIO mlp_hidden_size = int(hidden_size * mlp_ratio) mlp_hidden_size = 256 * ((mlp_hidden_size + 256 - 1) // 256) + + self.module_shape["Wqkv"] = (3 * hidden_size, hidden_size) + self.module_shape["out_proj"] = (hidden_size, hidden_size) + self.module_shape["w1"] = (mlp_hidden_size, hidden_size) + self.module_shape["w2"] = (mlp_hidden_size, hidden_size) + self.module_shape["w3"] = (hidden_size, mlp_hidden_size) + + def _initialize_memory_pool(self) -> None: + # allocate memory pool self.all_gather_memory_pool = [] + self.all_gather_bias_memory_pool = [] self.reduce_scatter_memory_pool = {} + self.module_shape = {} + + self._initialize_module_shape() + dtype = gpc.config.model.get("dtype", torch.half) + device = get_current_device() for _ in range(2): weight = {} for name in self.module_name: - if name == "Wqkv": - weight[name] = torch.zeros( - (3 * hidden_size, hidden_size), - dtype=gpc.config.model.get("dtype", torch.half), - device=get_current_device(), - ).contiguous() - elif name == "out_proj": - weight[name] = torch.zeros( - (hidden_size, hidden_size), - dtype=gpc.config.model.get("dtype", torch.half), - device=get_current_device(), - ).contiguous() - elif name == "w1" or name == "w2": - weight[name] = torch.zeros( - (mlp_hidden_size, hidden_size), - dtype=gpc.config.model.get("dtype", torch.half), - device=get_current_device(), - ).contiguous() - else: - weight[name] = torch.zeros( - (hidden_size, mlp_hidden_size), - dtype=gpc.config.model.get("dtype", torch.half), - device=get_current_device(), - ).contiguous() - + weight[name] = torch.zeros(self.module_shape[name], dtype=dtype, device=device).contiguous() self.all_gather_memory_pool.append(weight) # containing two groups of block weight def get_all_gather_memory(self, module): block_index = self.module_to_index[module] return self.all_gather_memory_pool[block_index % 2][module._fstp_name] + + def get_bias_memory(self, module: nn.Module): + block_index = self.module_to_index[module] + # if the bias memory pool is empty or module has not been allocated memory + # import pdb; pdb.set_trace() + if len(self.all_gather_bias_memory_pool) == 0: + for _ in range(2): + weight = {} + weight[module._fstp_name] = torch.zeros( + self.module_shape[module._fstp_name][0], + dtype=gpc.config.model.get("dtype", torch.half), + device=get_current_device()).contiguous() + self.all_gather_bias_memory_pool.append(weight) + elif module._fstp_name not in self.all_gather_bias_memory_pool[0]: + for i in range(2): + self.all_gather_bias_memory_pool[i][module._fstp_name] = torch.zeros( + self.module_shape[module._fstp_name][0], + dtype=gpc.config.model.get("dtype", torch.half), + device=get_current_device()).contiguous() + + return self.all_gather_bias_memory_pool[block_index % 2][module._fstp_name] + def get_reduce_scatter_memory(self, key): return_idx = 0 @@ -157,10 +170,19 @@ def get_reduce_scatter_memory(self, key): def release_reduce_scatter_memory(self, key, index): self.reduce_scatter_memory_pool[key][index].idle = True - + def _all_gather_block_weight_memory_pool(self, block_index: int): fstp_modules = self.index_to_fstp_modules[block_index] for module in fstp_modules: + if module.bias is not None: + bias_handle = all_gather_raw_bias_memory_pool( + module.bias, + self.process_group, + async_op=True, + module=module, + ) + self.bias_global_handle[module] = bias_handle + weight_handle = all_gather_raw_memory_pool( module.weight, self.process_group, @@ -186,6 +208,9 @@ def _pre_forward_hook_for_module(module: nn.Module, inputs: Any): handle = self.fstp_global_handle[module] handle.wait() + if module.bias is not None: + bias_handle = self.bias_global_handle[module] + bias_handle.wait() def _post_forward_hook_for_module(module: nn.Module, inputs: Any, output: Any): if module in self.fstp_global_handle: diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 8070cbdc..8a1281e8 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -140,6 +140,21 @@ def all_gather_raw_memory_pool( ) return handle +def all_gather_raw_bias_memory_pool( + input_: Tensor, + process_group: ProcessGroup, + async_op: bool = False, + gather_dim: int = 0, + module: nn.Module = None, +): + handle = torch.distributed.all_gather_into_tensor( + gpc.fstp_handler.get_bias_memory(module=module), + input_.contiguous(), + group=process_group, + async_op=async_op, + ) + return handle + def linear_bias_wgrad_torch(my_input, grad_output, has_d_bias): assert my_input.dtype == grad_output.dtype @@ -486,8 +501,11 @@ def forward( handle_weight.wait() # TODO memory pool for bias if bias is not None: - total_bias,
handle_bias = all_gather_raw(bias, process_group, async_op=True) - handle_bias.wait() + if overlap_handler is not None: + total_bias = gpc.fstp_handler.get_bias_memory(module=module) + else: + total_bias, handle_bias = all_gather_raw(bias, process_group, async_op=True) + handle_bias.wait() else: total_bias = bias else:
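Bias tensors are small and not every FSTP linear has one, so unlike the weight pool the bias buffers are allocated lazily on first request, still in two groups so the same block-parity rule applies. A minimal sketch of get_bias_memory's lazy scheme, using plain Python stand-ins for the hypothetical buffers:

bias_pool = []   # lazily becomes two dicts: module name -> buffer

def get_bias_memory(name, block_index):
    if not bias_pool:
        bias_pool.extend([{}, {}])        # create both groups on first use
    if name not in bias_pool[0]:
        for group in bias_pool:           # register the name in both groups
            group[name] = bytearray(8)    # stand-in for a torch.zeros buffer
    return bias_pool[block_index % 2][name]

b0 = get_bias_memory("Wqkv", 0)
b2 = get_bias_memory("Wqkv", 2)
assert b0 is b2   # blocks with the same parity share one bias buffer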
From 0d693cf3a182b34cc9af7b6ef640f250ff7abbda Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 23 Oct 2023 15:22:03 +0800 Subject: [PATCH 045/153] feat(model/overlap_handler.py): fix lint error --- internlm/model/moe.py | 1 - internlm/model/overlap_handler.py | 40 ++++++++++++++++------- internlm/model/utils.py | 1 + train.py | 3 +-- 4 files changed, 26 insertions(+), 19 deletions(-) diff --git a/internlm/model/moe.py b/internlm/model/moe.py index 28e5ae6e..0865097f 100644 --- a/internlm/model/moe.py +++ b/internlm/model/moe.py @@ -53,7 +53,6 @@ def __init__( device=None, dtype=None, ): - super().__init__() assert ( diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index f7132c3b..3f7ee055 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -10,7 +10,10 @@ from internlm.core.naive_amp import NaiveAMPModel from internlm.model.embedding import Embedding1D from internlm.model.linear import FSTPLinear, ScaleColumnParallelLinear -from internlm.model.utils import all_gather_raw_memory_pool, all_gather_raw_bias_memory_pool +from internlm.model.utils import ( + all_gather_raw_bias_memory_pool, + all_gather_raw_memory_pool, +) from internlm.utils.common import get_current_device @@ -25,7 +28,7 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non self.fstp_modules = [] self.module_name = ["Wqkv", "out_proj", "w1", "w2", "w3"] self.fstp_global_handle = dict() # key: fstp module; value: module global all-gather op handle - self.bias_global_handle = dict() # key: fstp module; value: module bias global all-gather op handle + self.bias_global_handle = dict() # key: fstp module; value: module bias global all-gather op handle self.module_to_index = dict() # key: fstp module; value: transformer block index self.index_to_fstp_modules = dict() # key: transformer block index; value: fsdp modules self.head = [] @@ -77,13 +80,13 @@ def get_zero_by_shape(self, size: tuple, dtype, device) -> torch.Tensor: self.zero_const_pool[size] = torch.zeros(*size, dtype=dtype, device=device).contiguous() return self.zero_const_pool[size] - + def _initialize_module_shape(self): hidden_size = gpc.config.HIDDEN_SIZE mlp_ratio = gpc.config.MLP_RATIO mlp_hidden_size = int(hidden_size * mlp_ratio) mlp_hidden_size = 256 * ((mlp_hidden_size + 256 - 1) // 256) - + self.module_shape["Wqkv"] = (3 * hidden_size, hidden_size) self.module_shape["out_proj"] = (hidden_size, hidden_size) self.module_shape["w1"] = (mlp_hidden_size, hidden_size) @@ -96,7 +99,7 @@ def _initialize_memory_pool(self) -> None: self.all_gather_bias_memory_pool = [] self.reduce_scatter_memory_pool = {} self.module_shape = {} - + self._initialize_module_shape() dtype = gpc.config.model.get("dtype", torch.half) device = get_current_device() @@ -107,10 +110,14 @@ def _initialize_memory_pool(self) -> None: weight[name] = torch.zeros(self.module_shape[name], dtype=dtype, device=device).contiguous() self.all_gather_memory_pool.append(weight) # containing two groups of block weight + def clear_memory_pool(self) -> None: + self.zero_const_pool = {} + self.reduce_scatter_memory_pool = {} + def get_all_gather_memory(self, module): block_index = self.module_to_index[module] return self.all_gather_memory_pool[block_index % 2][module._fstp_name] - + def get_bias_memory(self, module: nn.Module): block_index = self.module_to_index[module] # if the bias memory pool is empty or module has not been allocated memory @@ -119,19 +126,20 @@ def get_bias_memory(self, module: nn.Module): for _ in range(2): weight = {} weight[module._fstp_name] = torch.zeros( - self.module_shape[module._fstp_name][0], - dtype=gpc.config.model.get("dtype", torch.half), - device=get_current_device()).contiguous() + self.module_shape[module._fstp_name][0], + dtype=gpc.config.model.get("dtype", torch.half), + device=get_current_device(), + ).contiguous() self.all_gather_bias_memory_pool.append(weight) elif module._fstp_name not in self.all_gather_bias_memory_pool[0]: for i in range(2): self.all_gather_bias_memory_pool[i][module._fstp_name] = torch.zeros( - self.module_shape[module._fstp_name][0], - dtype=gpc.config.model.get("dtype", torch.half), - device=get_current_device()).contiguous() - + self.module_shape[module._fstp_name][0], + dtype=gpc.config.model.get("dtype", torch.half), + device=get_current_device(), + ).contiguous() + return self.all_gather_bias_memory_pool[block_index % 2][module._fstp_name] - def get_reduce_scatter_memory(self, key): return_idx = 0 @@ -157,10 +170,19 @@ def get_reduce_scatter_memory(self, key): def release_reduce_scatter_memory(self, key, index): self.reduce_scatter_memory_pool[key][index].idle = True - + def _all_gather_block_weight_memory_pool(self, block_index: int): fstp_modules = self.index_to_fstp_modules[block_index] for module in fstp_modules: @@ -182,7 +190,7 @@ def _all_gather_block_weight_memory_pool(self, block_index: int): module=module, ) self.bias_global_handle[module] = bias_handle - + weight_handle = all_gather_raw_memory_pool( module.weight, self.process_group, diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 8a1281e8..42a84003 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -140,6 +140,7 @@ def all_gather_raw_memory_pool( ) return handle + def all_gather_raw_bias_memory_pool( input_: Tensor, process_group: ProcessGroup, diff --git a/train.py b/train.py index 96dc24d1..b4f2a6d2 100644 --- a/train.py +++ b/train.py @@ -298,8 +298,7 @@ def main(args): prof.step() if gpc.fstp_handler is not None: - gpc.fstp_handler.zero_const_pool = {} - gpc.fstp_handler.reduce_scatter_memory_pool = {} + gpc.fstp_handler.clear_memory_pool() # torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") torch.cuda.reset_peak_memory_stats() From 03cc7f9b80bc94c4b3234da8d32674189c66aa5f Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 23 Oct 2023 15:28:34 +0800 Subject: [PATCH 046/153] feat(model/overlap_handler.py): fix lint error --- internlm/model/overlap_handler.py | 14 +++++++------- internlm/model/utils.py | 7 ++----- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index 3f7ee055..6870fe68 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -204,27 +204,27 @@ def _register_sync_parameters_hook(self) -> None: register forward hooks and backward hooks for fstp modules.
""" - def _post_forward_hook_for_embedding(module: nn.Module, inputs: Any, output: Any): + def _post_forward_hook_for_embedding(module: nn.Module, inputs: Any, output: Any): # pylint: disable=W0613 self._all_gather_block_weight_memory_pool(0) - def _pre_forward_hook_for_out_proj(module: nn.Module, inputs: Any): + def _pre_forward_hook_for_out_proj(module: nn.Module, inputs: Any): # pylint: disable=W0613 block_index = self.module_to_index[module] # start the all-gather for next block if block_index + 1 < gpc.config.NUM_LAYER: self._all_gather_block_weight_memory_pool(block_index + 1) - def _pre_forward_hook_for_module(module: nn.Module, inputs: Any): + def _pre_forward_hook_for_module(module: nn.Module, inputs: Any): # pylint: disable=W0613 handle = self.fstp_global_handle[module] handle.wait() if module.bias is not None: bias_handle = self.bias_global_handle[module] bias_handle.wait() - def _post_forward_hook_for_module(module: nn.Module, inputs: Any, output: Any): + def _post_forward_hook_for_module(module: nn.Module, inputs: Any, output: Any): # pylint: disable=W0613 if module in self.fstp_global_handle: del self.fstp_global_handle[module] - def _post_backward_hook_for_head(module: nn.Module, grad_input, grad_output): + def _post_backward_hook_for_head(module: nn.Module, grad_input, grad_output): # pylint: disable=W0613 first_backward_module = self.fstp_modules[-1] weight_handle = all_gather_raw_memory_pool( first_backward_module.weight, @@ -234,7 +234,7 @@ def _post_backward_hook_for_head(module: nn.Module, grad_input, grad_output): ) self.fstp_global_handle[first_backward_module] = weight_handle - def _pre_backward_hook_for_module(module: nn.Module, grad_output): + def _pre_backward_hook_for_module(module: nn.Module, grad_output): # pylint: disable=W0613 # wait handle for current module weight_handle = self.fstp_global_handle[module] weight_handle.wait() @@ -251,7 +251,7 @@ def _pre_backward_hook_for_module(module: nn.Module, grad_output): ) self.fstp_global_handle[next_module] = weight_handle - def _post_backward_hook_for_module(module, grad_input, grad_output): + def _post_backward_hook_for_module(module, grad_input, grad_output): # pylint: disable=W0613 if module in self.fstp_global_handle: del self.fstp_global_handle[module] diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 42a84003..982c0e08 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -129,7 +129,6 @@ def all_gather_raw_memory_pool( input_: Tensor, process_group: ProcessGroup, async_op: bool = False, - gather_dim: int = 0, module: nn.Module = None, ): handle = torch.distributed.all_gather_into_tensor( @@ -145,7 +144,6 @@ def all_gather_raw_bias_memory_pool( input_: Tensor, process_group: ProcessGroup, async_op: bool = False, - gather_dim: int = 0, module: nn.Module = None, ): handle = torch.distributed.all_gather_into_tensor( @@ -283,8 +281,8 @@ def backward(ctx, grad_output, *args): class MegatronFusedDenseFunc(torch.autograd.Function): """ FusedDenseFunc for tensor parallel in megatron implementation. - The diffenrence between the implementation of flash-attn and megatron is that the total_x could be saved for backward in megatron, - so that the all-gather in backward is ommited. + The diffenrence between the implementation of flash-attn and megatron is that the total_x could be + saved for backward in megatron, so that the all-gather in backward is ommited. 
""" @staticmethod @@ -433,7 +431,6 @@ def backward(ctx, grad_output, *args): grad_input = grad_input.contiguous() process_group = ctx.process_group sequence_parallel = ctx.sequence_parallel - gather_dim = ctx.gather_dim if ctx.compute_weight_gradient: total_x, weight = ctx.saved_tensors else: From 9cf1ff0f6e8a3db1dd1e61fd7b91a056b13041ef Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 23 Oct 2023 15:31:41 +0800 Subject: [PATCH 047/153] feat(solver/optimizer/hybrid_zero_optim.py): minor update --- internlm/solver/optimizer/hybrid_zero_optim.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index 0d0c8a3b..d2c894c9 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -350,7 +350,7 @@ def _accum_grads_store_in_bucket(self, bucket: BucketStore, reduce_rank: Optiona _param.grad.add_(_grad) # release cuda memory. - gpc.fstp_handler.release_reduce_scatter_memory(key=tuple(_grad.size()), index=_grad.index) + self._fstp_handler.release_reduce_scatter_memory(key=tuple(_grad.size()), index=_grad.index) self._fstp_handler.reduce_scatter_handlers[_key] = None bucket.reset_by_rank(reduce_rank) From b2c1a70477bff8e266dcb3155c2f794dfd7cbf5f Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 23 Oct 2023 15:34:24 +0800 Subject: [PATCH 048/153] feat(train/training_internlm.py): fix lint error --- internlm/train/training_internlm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index b05611bc..5e874d39 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -50,7 +50,7 @@ from internlm.solver.optimizer import FSDPadaptOptimizer, HybridZeroOptimizer from internlm.solver.optimizer.utils import ParamBcastSyncHandler from internlm.train.utils import create_param_groups -from internlm.utils.common import DummyProfile, get_current_device +from internlm.utils.common import DummyProfile from internlm.utils.logger import get_logger from internlm.utils.megatron_timers import megatron_timer as timer from internlm.utils.parallel import sync_model_param, sync_model_param_within_tp From 0996c47e49bf967aeff2aa83326a77fdcbdd9b64 Mon Sep 17 00:00:00 2001 From: "chenxun.p" Date: Mon, 23 Oct 2023 16:17:57 +0800 Subject: [PATCH 049/153] fix accumulate grads bug --- internlm/solver/optimizer/hybrid_zero_optim.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index d2c894c9..247f8212 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -590,14 +590,14 @@ def step(self, closure=None): if param.grad is not None: self._store_and_try_reduce_grads_by_bucket(param) - # we need to reduce the gradients left in the communication bucket - for group_id in range(self.num_param_groups): - self._reduce_grads_stored_in_bucket(self._bucket_store[group_id], reduce_rank=None, last_bucket=True) - # we need to accumulate gradients left in the accumulate gardient bucket for group_id in range(self.num_param_groups): self._accum_grads_store_in_bucket(self._accum_grad_buckets[group_id], reduce_rank=None) + # we need to reduce the gradients left in the communication bucket + for group_id in 
range(self.num_param_groups): + self._reduce_grads_stored_in_bucket(self._bucket_store[group_id], reduce_rank=None, last_bucket=True) + # compute norm for gradients in the before bucket groups_norms = [] for group_id in range(self.num_param_groups):
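The reordering in this fix matters because, with gradient accumulation, leftover accumulated gradients have to be folded into param.grad before the final bucket reduction ships anything. A toy, non-distributed illustration of the invariant (hypothetical names, not the optimizer's real data structures):

param_grads = {"w": 2.0}    # gradient from the current backward pass
accum_bucket = {"w": 3.0}   # gradient still parked for accumulation

for name, grad in accum_bucket.items():   # step 1: drain accumulation buckets
    param_grads[name] += grad
accum_bucket.clear()

reduced = dict(param_grads)   # step 2: only now run the last reduction
assert reduced["w"] == 5.0    # reducing first would have shipped only 2.0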
- """ - - def __init__(self, metric: Optional[Callable] = None, skip: bool = False) -> None: - self._post_func = metric - self._skip = skip - - def before_forward(self, scheduler, inputs) -> None: - if not self._skip: - timer("fwd").start() - - def after_forward(self, scheduler, outputs) -> None: - if not self._skip: - timer("fwd").stop() - - def before_criterion(self, scheduler, outputs, label) -> None: - if not self._skip: - timer("cal_loss").start() - - def after_criterion(self, scheduler, loss) -> None: - if not self._skip: - timer("cal_loss").stop() - - def before_backward(self, scheduler, outputs, outputs_grad) -> None: - if not self._skip: - timer("bwd").start() - - def after_backward(self, scheduler, inputs_grad) -> None: - if not self._skip: - timer("bwd").stop() - - def post_helper_func(self, scheduler, outputs, label) -> None: - if self._post_func is not None: - self._post_func(outputs, label) diff --git a/internlm/model/metrics.py b/internlm/model/metrics.py index 3a77f8b8..c32d8294 100644 --- a/internlm/model/metrics.py +++ b/internlm/model/metrics.py @@ -1,4 +1,4 @@ -from typing import List +from typing import Callable, List, Optional import torch from flash_attn.losses.cross_entropy import CrossEntropyLoss as FlashCrossEntropyLoss @@ -6,6 +6,8 @@ from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc +from internlm.core.scheduler import SchedulerHook +from internlm.utils.megatron_timers import megatron_timer as timer class AccPerplex: @@ -260,3 +262,41 @@ def get_metric(self, reset=True): self.ds_token_num.fill_(0.0) return res + + +class SchedulerMetricHook(SchedulerHook): + """ + Scheduler Metric Hook. + """ + + def __init__(self, metric: Optional[Callable] = None, skip: bool = False) -> None: + self._post_func = metric + self._skip = skip + + def before_forward(self, scheduler, inputs) -> None: + if not self._skip: + timer("fwd").start() + + def after_forward(self, scheduler, outputs) -> None: + if not self._skip: + timer("fwd").stop() + + def before_criterion(self, scheduler, outputs, label) -> None: + if not self._skip: + timer("cal_loss").start() + + def after_criterion(self, scheduler, loss) -> None: + if not self._skip: + timer("cal_loss").stop() + + def before_backward(self, scheduler, outputs, outputs_grad) -> None: + if not self._skip: + timer("bwd").start() + + def after_backward(self, scheduler, inputs_grad) -> None: + if not self._skip: + timer("bwd").stop() + + def post_helper_func(self, scheduler, outputs, label) -> None: + if self._post_func is not None: + self._post_func(outputs, label) diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index 6870fe68..098fc8c8 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -8,6 +8,7 @@ from internlm.core.context import global_context as gpc from internlm.core.naive_amp import NaiveAMPModel +from internlm.core.scheduler import SchedulerHook from internlm.model.embedding import Embedding1D from internlm.model.linear import FSTPLinear, ScaleColumnParallelLinear from internlm.model.utils import ( @@ -33,6 +34,8 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non self.index_to_fstp_modules = dict() # key: transformer block index; value: fsdp modules self.head = [] self.embedding = [] + self.model_checkpoint = gpc.config.model.checkpoint + self.is_forward = True self.reduce_scatter_handlers = {} self.zero_const_pool = {} @@ -81,6 +84,9 @@ def get_zero_by_shape(self, size: 
tuple, dtype, device) -> torch.Tensor: return self.zero_const_pool[size] + def set_forward_mode(self, flag): + self.is_forward = flag + def _initialize_module_shape(self): hidden_size = gpc.config.HIDDEN_SIZE mlp_ratio = gpc.config.MLP_RATIO @@ -121,7 +127,6 @@ def get_all_gather_memory(self, module): def get_bias_memory(self, module: nn.Module): block_index = self.module_to_index[module] # if the bias memory pool is empty or module has not been allocated memory - # import pdb; pdb.set_trace() if len(self.all_gather_bias_memory_pool) == 0: for _ in range(2): weight = {} @@ -209,9 +214,13 @@ def _post_forward_hook_for_embedding(module: nn.Module, inputs: Any, output: Any def _pre_forward_hook_for_out_proj(module: nn.Module, inputs: Any): # pylint: disable=W0613 block_index = self.module_to_index[module] - # start the all-gather for next block - if block_index + 1 < gpc.config.NUM_LAYER: - self._all_gather_block_weight_memory_pool(block_index + 1) + if self.model_checkpoint and self.is_forward is False: + if block_index - 1 >= 0: + self._all_gather_block_weight_memory_pool(block_index - 1) + else: + # start the all-gather for next block + if block_index + 1 < gpc.config.NUM_LAYER: + self._all_gather_block_weight_memory_pool(block_index + 1) def _pre_forward_hook_for_module(module: nn.Module, inputs: Any): # pylint: disable=W0613 handle = self.fstp_global_handle[module] @@ -234,6 +243,9 @@ def _post_backward_hook_for_head(module: nn.Module, grad_input, grad_output): # ) self.fstp_global_handle[first_backward_module] = weight_handle + def _pre_backward_hook_for_head(module: nn.Module, grad_output): + self._all_gather_block_weight_memory_pool(gpc.config.NUM_LAYER - 1) + def _pre_backward_hook_for_module(module: nn.Module, grad_output): # pylint: disable=W0613 # wait handle for current module weight_handle = self.fstp_global_handle[module] @@ -264,6 +276,10 @@ def _post_backward_hook_for_module(module, grad_input, grad_output): # pylint: for embedding in self.embedding: embedding.register_forward_hook(_post_forward_hook_for_embedding) + if self.model_checkpoint and self.is_forward is False: + for head in self.head: + head.register_full_backward_pre_hook(_pre_backward_hook_for_head) + for out_proj in self.fstp_outs: out_proj.register_forward_pre_hook(_pre_forward_hook_for_out_proj) @@ -275,9 +291,42 @@ def _post_backward_hook_for_module(module, grad_input, grad_output): # pylint: # 1. register post_backward_hook @head module to prefetch for the last block's last module # 2. register pre_backward_hook @fstp_module to wait handle for current module and to prefetch for next module # 3.
register post_backward_hook @fstp_module to release resource - for head in self.head: - head.register_full_backward_hook(_post_backward_hook_for_head) + if gpc.config.model.checkpoint is False: + for head in self.head: + head.register_full_backward_hook(_post_backward_hook_for_head) - for module in self.fstp_modules: - module.register_full_backward_pre_hook(_pre_backward_hook_for_module) - module.register_full_backward_hook(_post_backward_hook_for_module) + for module in self.fstp_modules: + module.register_full_backward_pre_hook(_pre_backward_hook_for_module) + module.register_full_backward_hook(_post_backward_hook_for_module) + + +class FSTPOverlapSchedulerHook(SchedulerHook): + """ + SchedulerHook for fstp overlap handler + """ + + def __init__(self, overlap_handler: FSTPOverlapHandler) -> None: + super().__init__() + + self._overlap_handler = overlap_handler + + def before_forward(self, scheduler, inputs) -> None: + self._overlap_handler.set_forward_mode(True) + + def after_forward(self, scheduler, outputs) -> None: + pass + + def before_criterion(self, scheduler, outputs, label) -> None: + pass + + def after_criterion(self, scheduler, loss) -> None: + pass + + def before_backward(self, scheduler, outputs, outputs_grad) -> None: + self._overlap_handler.set_forward_mode(False) + + def after_backward(self, scheduler, inputs_grad) -> None: + pass + + def post_helper_func(self, scheduler, outputs, label) -> None: + pass diff --git a/internlm/utils/evaluation.py b/internlm/utils/evaluation.py index f708fa78..c6e27a68 100644 --- a/internlm/utils/evaluation.py +++ b/internlm/utils/evaluation.py @@ -6,8 +6,7 @@ from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc -from internlm.core.scheduler import SchedulerMetricHook -from internlm.model.metrics import AccPerplex +from internlm.model.metrics import AccPerplex, SchedulerMetricHook @contextmanager
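Before the train.py wiring below, it is worth spelling out why the hook pair above exists: with activation checkpointing, the forward hooks fire again during backward-time recomputation, so the handler has to know which direction to prefetch in. A minimal sketch of the mode-flag pattern (toy class, not the project's API):

class ToyHandler:
    def __init__(self):
        self.is_forward = True

    def set_forward_mode(self, flag):
        self.is_forward = flag   # toggled by the scheduler hook

    def prefetch_target(self, block_index, num_layers):
        # true forward pass: prefetch the next block; recomputation during
        # backward: prefetch the previous block instead
        if self.is_forward:
            return block_index + 1 if block_index + 1 < num_layers else None
        return block_index - 1 if block_index - 1 >= 0 else None

handler = ToyHandler()
handler.set_forward_mode(False)            # as in before_backward
assert handler.prefetch_target(3, 8) == 2  # recomputation walks backwards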
diff --git a/train.py b/train.py index b4f2a6d2..ae867287 100644 --- a/train.py +++ b/train.py @@ -5,6 +5,7 @@ import time import traceback from functools import partial +from typing import List, Optional import torch import torch.distributed as dist @@ -12,11 +13,12 @@ import internlm from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc -from internlm.core.scheduler import SchedulerMetricHook +from internlm.core.scheduler import SchedulerHook from internlm.core.trainer import TrainState from internlm.initialize import initialize_distributed_env from internlm.model.loss import FlashGPTLMLoss -from internlm.model.metrics import AccPerplex +from internlm.model.metrics import AccPerplex, SchedulerMetricHook +from internlm.model.overlap_handler import FSTPOverlapSchedulerHook from internlm.monitor import initialize_monitor_manager, send_alert_message from internlm.monitor.monitor import monitor_manager as mm from internlm.train import ( @@ -67,6 +69,30 @@ def initialize_llm_logger(start_time: str): return uniscale_logger +def get_scheduler_hooks( + metric: Optional[AccPerplex] = None, activation_checkpoint: bool = False +) -> List[SchedulerHook]: + scheduler_hooks: List[SchedulerHook] = [] + + if metric is not None: + scheduler_hooks.append( + SchedulerMetricHook( + metric=metric, + skip=( + gpc.is_using_pp() + and hasattr(gpc.config.model, "num_chunks") + and gpc.config.model.num_chunks > 1 + and gpc.config.parallel["pipeline"].get("interleaved_overlap", False) + ), + ), + ) + + if activation_checkpoint: + scheduler_hooks.append(FSTPOverlapSchedulerHook(gpc.fstp_handler)) + + return scheduler_hooks + + def main(args): # init setting skip_batches = gpc.config.data.skip_batches @@ -149,17 +175,6 @@ def main(args): ) # initialize trainer - scheduler_hooks = [ - SchedulerMetricHook( - metric=metric, - skip=( - gpc.is_using_pp() - and hasattr(gpc.config.model, "num_chunks") - and gpc.config.model.num_chunks > 1 - and gpc.config.parallel["pipeline"].get("interleaved_overlap", False) - ), - ), - ] trainer, train_dl, _, _ = internlm.initialize_trainer( model=model, @@ -168,7 +183,7 @@ def main(args): train_dataloader=train_dl, lr_scheduler=lr_scheduler, beta2_scheduler=beta2_scheduler, - scheduler_hooks=scheduler_hooks, + scheduler_hooks=get_scheduler_hooks(metric, gpc.config.model.checkpoint), ) # initialize simple memory profiler From 5d8313693b01769a4239d7938667c3d01a5a3d90 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Tue, 24 Oct 2023 17:29:09 +0800 Subject: [PATCH 051/153] feat(model/overlap_handler.py): fix head post backward hook when activation --- internlm/model/overlap_handler.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index 098fc8c8..5cef92f9 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -244,7 +244,8 @@ def _post_backward_hook_for_head(module: nn.Module, grad_input, grad_output): # self.fstp_global_handle[first_backward_module] = weight_handle def _pre_backward_hook_for_head(module: nn.Module, grad_output): - self._all_gather_block_weight_memory_pool(gpc.config.NUM_LAYER - 1) + if self.is_forward is False: + self._all_gather_block_weight_memory_pool(gpc.config.NUM_LAYER - 1) def _pre_backward_hook_for_module(module: nn.Module, grad_output): # pylint: disable=W0613 # wait handle for current module @@ -276,7 +277,7 @@ def _post_backward_hook_for_module(module, grad_input, grad_output): # pylint: for embedding in self.embedding: embedding.register_forward_hook(_post_forward_hook_for_embedding) - if self.model_checkpoint and self.is_forward is False: + if self.model_checkpoint: for head in self.head: head.register_full_backward_pre_hook(_pre_backward_hook_for_head) @@ -291,7 +292,7 @@ def _post_backward_hook_for_module(module, grad_input, grad_output): # pylint: # 1. register post_backward_hook @head module to prefetch for the last block's last module # 2. register pre_backward_hook @fstp_module to wait handle for current module and to prefetch for next module # 3.
register post_backward_hook @fstp_module to release resource - if gpc.config.model.checkpoint is False: + if self.model_checkpoint is False: for head in self.head: head.register_full_backward_hook(_post_backward_hook_for_head) From 262de4b796104253139c8082f6f51402618a425e Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Tue, 24 Oct 2023 17:54:26 +0800 Subject: [PATCH 052/153] support tflops computation and generate test py files --- .gitignore | 6 + configs/13B_template.py | 180 +++++++++++++++++ .../13B_train/131072_flash-attn_ckpt_False.py | 180 +++++++++++++++++ .../13B_train/131072_flash-attn_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/131072_flash_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/131072_flash_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/131072_intern_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/131072_intern_ckpt_True.py | 180 +++++++++++++++++ .../13B_train/131072_megatron_ckpt_False.py | 180 +++++++++++++++++ .../13B_train/131072_megatron_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/131072_none_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/131072_none_ckpt_True.py | 180 +++++++++++++++++ .../13B_train/16384_flash-attn_ckpt_False.py | 180 +++++++++++++++++ .../13B_train/16384_flash-attn_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/16384_flash_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/16384_flash_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/16384_intern_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/16384_intern_ckpt_True.py | 180 +++++++++++++++++ .../13B_train/16384_megatron_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/16384_megatron_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/16384_none_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/16384_none_ckpt_True.py | 180 +++++++++++++++++ .../13B_train/262144_flash-attn_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/262144_flash_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/262144_flash_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/262144_intern_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/262144_intern_ckpt_True.py | 180 +++++++++++++++++ .../13B_train/262144_megatron_ckpt_False.py | 180 +++++++++++++++++ .../13B_train/262144_megatron_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/262144_none_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/262144_none_ckpt_True.py | 180 +++++++++++++++++ .../13B_train/32768_flash-attn_ckpt_False.py | 180 +++++++++++++++++ .../13B_train/32768_flash-attn_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/32768_flash_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/32768_flash_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/32768_intern_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/32768_intern_ckpt_True.py | 180 +++++++++++++++++ .../13B_train/32768_megatron_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/32768_megatron_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/32768_none_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/32768_none_ckpt_True.py | 180 +++++++++++++++++ .../13B_train/4096_flash-attn_ckpt_False.py | 180 +++++++++++++++++ .../13B_train/4096_flash-attn_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/4096_flash_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/4096_flash_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/4096_intern_ckpt_False.py | 180 +++++++++++++++++ 
configs/13B_train/4096_intern_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/4096_megatron_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/4096_megatron_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/4096_none_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/4096_none_ckpt_True.py | 180 +++++++++++++++++ .../13B_train/65536_flash-attn_ckpt_False.py | 180 +++++++++++++++++ .../13B_train/65536_flash-attn_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/65536_flash_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/65536_flash_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/65536_intern_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/65536_intern_ckpt_True.py | 180 +++++++++++++++++ .../13B_train/65536_megatron_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/65536_megatron_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/65536_none_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/65536_none_ckpt_True.py | 180 +++++++++++++++++ .../13B_train/8192_flash-attn_ckpt_False.py | 180 +++++++++++++++++ .../13B_train/8192_flash-attn_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/8192_flash_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/8192_flash_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/8192_intern_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/8192_intern_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/8192_megatron_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/8192_megatron_ckpt_True.py | 180 +++++++++++++++++ configs/13B_train/8192_none_ckpt_False.py | 180 +++++++++++++++++ configs/13B_train/8192_none_ckpt_True.py | 180 +++++++++++++++++ configs/30B_template.py | 180 +++++++++++++++++ configs/30B_train/131072_flash_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/131072_flash_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/131072_intern_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/131072_intern_ckpt_True.py | 180 +++++++++++++++++ .../30B_train/131072_megatron_ckpt_False.py | 180 +++++++++++++++++ .../30B_train/131072_megatron_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/131072_none_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/131072_none_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/16384_flash_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/16384_flash_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/16384_intern_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/16384_intern_ckpt_True.py | 180 +++++++++++++++++ .../30B_train/16384_megatron_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/16384_megatron_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/16384_none_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/16384_none_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/262144_flash_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/262144_flash_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/262144_intern_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/262144_intern_ckpt_True.py | 180 +++++++++++++++++ .../30B_train/262144_megatron_ckpt_False.py | 180 +++++++++++++++++ .../30B_train/262144_megatron_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/262144_none_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/262144_none_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/32768_flash_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/32768_flash_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/32768_intern_ckpt_False.py | 
180 +++++++++++++++++ configs/30B_train/32768_intern_ckpt_True.py | 180 +++++++++++++++++ .../30B_train/32768_megatron_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/32768_megatron_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/32768_none_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/32768_none_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/4096_flash_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/4096_flash_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/4096_intern_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/4096_intern_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/4096_megatron_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/4096_megatron_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/4096_none_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/4096_none_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/65536_flash_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/65536_flash_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/65536_intern_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/65536_intern_ckpt_True.py | 180 +++++++++++++++++ .../30B_train/65536_megatron_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/65536_megatron_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/65536_none_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/65536_none_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/8192_flash_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/8192_flash_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/8192_intern_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/8192_intern_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/8192_megatron_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/8192_megatron_ckpt_True.py | 180 +++++++++++++++++ configs/30B_train/8192_none_ckpt_False.py | 180 +++++++++++++++++ configs/30B_train/8192_none_ckpt_True.py | 180 +++++++++++++++++ configs/7B_sft.py | 12 +- configs/7B_template.py | 181 ++++++++++++++++++ .../7B_train/131072_flash-attn_ckpt_False.py | 181 ++++++++++++++++++ .../7B_train/131072_flash-attn_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/131072_flash_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/131072_flash_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/131072_intern_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/131072_intern_ckpt_True.py | 181 ++++++++++++++++++ .../7B_train/131072_megatron_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/131072_megatron_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/131072_none_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/131072_none_ckpt_True.py | 181 ++++++++++++++++++ .../7B_train/16384_flash-attn_ckpt_False.py | 181 ++++++++++++++++++ .../7B_train/16384_flash-attn_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/16384_flash_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/16384_flash_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/16384_intern_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/16384_intern_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/16384_megatron_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/16384_megatron_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/16384_none_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/16384_none_ckpt_True.py | 181 ++++++++++++++++++ .../7B_train/262144_flash-attn_ckpt_False.py | 181 ++++++++++++++++++ .../7B_train/262144_flash-attn_ckpt_True.py | 181 
++++++++++++++++++ configs/7B_train/262144_flash_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/262144_flash_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/262144_intern_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/262144_intern_ckpt_True.py | 181 ++++++++++++++++++ .../7B_train/262144_megatron_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/262144_megatron_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/262144_none_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/262144_none_ckpt_True.py | 181 ++++++++++++++++++ .../7B_train/32768_flash-attn_ckpt_False.py | 181 ++++++++++++++++++ .../7B_train/32768_flash-attn_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/32768_flash_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/32768_flash_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/32768_intern_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/32768_intern_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/32768_megatron_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/32768_megatron_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/32768_none_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/32768_none_ckpt_True.py | 181 ++++++++++++++++++ .../7B_train/4096_flash-attn_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/4096_flash-attn_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/4096_flash_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/4096_flash_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/4096_intern_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/4096_intern_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/4096_megatron_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/4096_megatron_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/4096_none_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/4096_none_ckpt_True.py | 181 ++++++++++++++++++ .../7B_train/65536_flash-attn_ckpt_False.py | 181 ++++++++++++++++++ .../7B_train/65536_flash-attn_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/65536_flash_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/65536_flash_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/65536_intern_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/65536_intern_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/65536_megatron_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/65536_megatron_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/65536_none_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/65536_none_ckpt_True.py | 181 ++++++++++++++++++ .../7B_train/8192_flash-attn_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/8192_flash-attn_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/8192_flash_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/8192_flash_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/8192_intern_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/8192_intern_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/8192_megatron_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/8192_megatron_ckpt_True.py | 181 ++++++++++++++++++ configs/7B_train/8192_none_ckpt_False.py | 181 ++++++++++++++++++ configs/7B_train/8192_none_ckpt_True.py | 181 ++++++++++++++++++ configs/generate.py | 44 +++++ internlm/train/training_internlm.py | 8 + 202 files changed, 35775 insertions(+), 6 deletions(-) create mode 100644 configs/13B_template.py create mode 100644 configs/13B_train/131072_flash-attn_ckpt_False.py create mode 100644 
configs/13B_train/131072_flash-attn_ckpt_True.py create mode 100644 configs/13B_train/131072_flash_ckpt_False.py create mode 100644 configs/13B_train/131072_flash_ckpt_True.py create mode 100644 configs/13B_train/131072_intern_ckpt_False.py create mode 100644 configs/13B_train/131072_intern_ckpt_True.py create mode 100644 configs/13B_train/131072_megatron_ckpt_False.py create mode 100644 configs/13B_train/131072_megatron_ckpt_True.py create mode 100644 configs/13B_train/131072_none_ckpt_False.py create mode 100644 configs/13B_train/131072_none_ckpt_True.py create mode 100644 configs/13B_train/16384_flash-attn_ckpt_False.py create mode 100644 configs/13B_train/16384_flash-attn_ckpt_True.py create mode 100644 configs/13B_train/16384_flash_ckpt_False.py create mode 100644 configs/13B_train/16384_flash_ckpt_True.py create mode 100644 configs/13B_train/16384_intern_ckpt_False.py create mode 100644 configs/13B_train/16384_intern_ckpt_True.py create mode 100644 configs/13B_train/16384_megatron_ckpt_False.py create mode 100644 configs/13B_train/16384_megatron_ckpt_True.py create mode 100644 configs/13B_train/16384_none_ckpt_False.py create mode 100644 configs/13B_train/16384_none_ckpt_True.py create mode 100644 configs/13B_train/262144_flash-attn_ckpt_False.py create mode 100644 configs/13B_train/262144_flash_ckpt_False.py create mode 100644 configs/13B_train/262144_flash_ckpt_True.py create mode 100644 configs/13B_train/262144_intern_ckpt_False.py create mode 100644 configs/13B_train/262144_intern_ckpt_True.py create mode 100644 configs/13B_train/262144_megatron_ckpt_False.py create mode 100644 configs/13B_train/262144_megatron_ckpt_True.py create mode 100644 configs/13B_train/262144_none_ckpt_False.py create mode 100644 configs/13B_train/262144_none_ckpt_True.py create mode 100644 configs/13B_train/32768_flash-attn_ckpt_False.py create mode 100644 configs/13B_train/32768_flash-attn_ckpt_True.py create mode 100644 configs/13B_train/32768_flash_ckpt_False.py create mode 100644 configs/13B_train/32768_flash_ckpt_True.py create mode 100644 configs/13B_train/32768_intern_ckpt_False.py create mode 100644 configs/13B_train/32768_intern_ckpt_True.py create mode 100644 configs/13B_train/32768_megatron_ckpt_False.py create mode 100644 configs/13B_train/32768_megatron_ckpt_True.py create mode 100644 configs/13B_train/32768_none_ckpt_False.py create mode 100644 configs/13B_train/32768_none_ckpt_True.py create mode 100644 configs/13B_train/4096_flash-attn_ckpt_False.py create mode 100644 configs/13B_train/4096_flash-attn_ckpt_True.py create mode 100644 configs/13B_train/4096_flash_ckpt_False.py create mode 100644 configs/13B_train/4096_flash_ckpt_True.py create mode 100644 configs/13B_train/4096_intern_ckpt_False.py create mode 100644 configs/13B_train/4096_intern_ckpt_True.py create mode 100644 configs/13B_train/4096_megatron_ckpt_False.py create mode 100644 configs/13B_train/4096_megatron_ckpt_True.py create mode 100644 configs/13B_train/4096_none_ckpt_False.py create mode 100644 configs/13B_train/4096_none_ckpt_True.py create mode 100644 configs/13B_train/65536_flash-attn_ckpt_False.py create mode 100644 configs/13B_train/65536_flash-attn_ckpt_True.py create mode 100644 configs/13B_train/65536_flash_ckpt_False.py create mode 100644 configs/13B_train/65536_flash_ckpt_True.py create mode 100644 configs/13B_train/65536_intern_ckpt_False.py create mode 100644 configs/13B_train/65536_intern_ckpt_True.py create mode 100644 configs/13B_train/65536_megatron_ckpt_False.py create mode 100644 
configs/13B_train/65536_megatron_ckpt_True.py create mode 100644 configs/13B_train/65536_none_ckpt_False.py create mode 100644 configs/13B_train/65536_none_ckpt_True.py create mode 100644 configs/13B_train/8192_flash-attn_ckpt_False.py create mode 100644 configs/13B_train/8192_flash-attn_ckpt_True.py create mode 100644 configs/13B_train/8192_flash_ckpt_False.py create mode 100644 configs/13B_train/8192_flash_ckpt_True.py create mode 100644 configs/13B_train/8192_intern_ckpt_False.py create mode 100644 configs/13B_train/8192_intern_ckpt_True.py create mode 100644 configs/13B_train/8192_megatron_ckpt_False.py create mode 100644 configs/13B_train/8192_megatron_ckpt_True.py create mode 100644 configs/13B_train/8192_none_ckpt_False.py create mode 100644 configs/13B_train/8192_none_ckpt_True.py create mode 100644 configs/30B_template.py create mode 100644 configs/30B_train/131072_flash_ckpt_False.py create mode 100644 configs/30B_train/131072_flash_ckpt_True.py create mode 100644 configs/30B_train/131072_intern_ckpt_False.py create mode 100644 configs/30B_train/131072_intern_ckpt_True.py create mode 100644 configs/30B_train/131072_megatron_ckpt_False.py create mode 100644 configs/30B_train/131072_megatron_ckpt_True.py create mode 100644 configs/30B_train/131072_none_ckpt_False.py create mode 100644 configs/30B_train/131072_none_ckpt_True.py create mode 100644 configs/30B_train/16384_flash_ckpt_False.py create mode 100644 configs/30B_train/16384_flash_ckpt_True.py create mode 100644 configs/30B_train/16384_intern_ckpt_False.py create mode 100644 configs/30B_train/16384_intern_ckpt_True.py create mode 100644 configs/30B_train/16384_megatron_ckpt_False.py create mode 100644 configs/30B_train/16384_megatron_ckpt_True.py create mode 100644 configs/30B_train/16384_none_ckpt_False.py create mode 100644 configs/30B_train/16384_none_ckpt_True.py create mode 100644 configs/30B_train/262144_flash_ckpt_False.py create mode 100644 configs/30B_train/262144_flash_ckpt_True.py create mode 100644 configs/30B_train/262144_intern_ckpt_False.py create mode 100644 configs/30B_train/262144_intern_ckpt_True.py create mode 100644 configs/30B_train/262144_megatron_ckpt_False.py create mode 100644 configs/30B_train/262144_megatron_ckpt_True.py create mode 100644 configs/30B_train/262144_none_ckpt_False.py create mode 100644 configs/30B_train/262144_none_ckpt_True.py create mode 100644 configs/30B_train/32768_flash_ckpt_False.py create mode 100644 configs/30B_train/32768_flash_ckpt_True.py create mode 100644 configs/30B_train/32768_intern_ckpt_False.py create mode 100644 configs/30B_train/32768_intern_ckpt_True.py create mode 100644 configs/30B_train/32768_megatron_ckpt_False.py create mode 100644 configs/30B_train/32768_megatron_ckpt_True.py create mode 100644 configs/30B_train/32768_none_ckpt_False.py create mode 100644 configs/30B_train/32768_none_ckpt_True.py create mode 100644 configs/30B_train/4096_flash_ckpt_False.py create mode 100644 configs/30B_train/4096_flash_ckpt_True.py create mode 100644 configs/30B_train/4096_intern_ckpt_False.py create mode 100644 configs/30B_train/4096_intern_ckpt_True.py create mode 100644 configs/30B_train/4096_megatron_ckpt_False.py create mode 100644 configs/30B_train/4096_megatron_ckpt_True.py create mode 100644 configs/30B_train/4096_none_ckpt_False.py create mode 100644 configs/30B_train/4096_none_ckpt_True.py create mode 100644 configs/30B_train/65536_flash_ckpt_False.py create mode 100644 configs/30B_train/65536_flash_ckpt_True.py create mode 100644 
configs/30B_train/65536_intern_ckpt_False.py create mode 100644 configs/30B_train/65536_intern_ckpt_True.py create mode 100644 configs/30B_train/65536_megatron_ckpt_False.py create mode 100644 configs/30B_train/65536_megatron_ckpt_True.py create mode 100644 configs/30B_train/65536_none_ckpt_False.py create mode 100644 configs/30B_train/65536_none_ckpt_True.py create mode 100644 configs/30B_train/8192_flash_ckpt_False.py create mode 100644 configs/30B_train/8192_flash_ckpt_True.py create mode 100644 configs/30B_train/8192_intern_ckpt_False.py create mode 100644 configs/30B_train/8192_intern_ckpt_True.py create mode 100644 configs/30B_train/8192_megatron_ckpt_False.py create mode 100644 configs/30B_train/8192_megatron_ckpt_True.py create mode 100644 configs/30B_train/8192_none_ckpt_False.py create mode 100644 configs/30B_train/8192_none_ckpt_True.py create mode 100644 configs/7B_template.py create mode 100644 configs/7B_train/131072_flash-attn_ckpt_False.py create mode 100644 configs/7B_train/131072_flash-attn_ckpt_True.py create mode 100644 configs/7B_train/131072_flash_ckpt_False.py create mode 100644 configs/7B_train/131072_flash_ckpt_True.py create mode 100644 configs/7B_train/131072_intern_ckpt_False.py create mode 100644 configs/7B_train/131072_intern_ckpt_True.py create mode 100644 configs/7B_train/131072_megatron_ckpt_False.py create mode 100644 configs/7B_train/131072_megatron_ckpt_True.py create mode 100644 configs/7B_train/131072_none_ckpt_False.py create mode 100644 configs/7B_train/131072_none_ckpt_True.py create mode 100644 configs/7B_train/16384_flash-attn_ckpt_False.py create mode 100644 configs/7B_train/16384_flash-attn_ckpt_True.py create mode 100644 configs/7B_train/16384_flash_ckpt_False.py create mode 100644 configs/7B_train/16384_flash_ckpt_True.py create mode 100644 configs/7B_train/16384_intern_ckpt_False.py create mode 100644 configs/7B_train/16384_intern_ckpt_True.py create mode 100644 configs/7B_train/16384_megatron_ckpt_False.py create mode 100644 configs/7B_train/16384_megatron_ckpt_True.py create mode 100644 configs/7B_train/16384_none_ckpt_False.py create mode 100644 configs/7B_train/16384_none_ckpt_True.py create mode 100644 configs/7B_train/262144_flash-attn_ckpt_False.py create mode 100644 configs/7B_train/262144_flash-attn_ckpt_True.py create mode 100644 configs/7B_train/262144_flash_ckpt_False.py create mode 100644 configs/7B_train/262144_flash_ckpt_True.py create mode 100644 configs/7B_train/262144_intern_ckpt_False.py create mode 100644 configs/7B_train/262144_intern_ckpt_True.py create mode 100644 configs/7B_train/262144_megatron_ckpt_False.py create mode 100644 configs/7B_train/262144_megatron_ckpt_True.py create mode 100644 configs/7B_train/262144_none_ckpt_False.py create mode 100644 configs/7B_train/262144_none_ckpt_True.py create mode 100644 configs/7B_train/32768_flash-attn_ckpt_False.py create mode 100644 configs/7B_train/32768_flash-attn_ckpt_True.py create mode 100644 configs/7B_train/32768_flash_ckpt_False.py create mode 100644 configs/7B_train/32768_flash_ckpt_True.py create mode 100644 configs/7B_train/32768_intern_ckpt_False.py create mode 100644 configs/7B_train/32768_intern_ckpt_True.py create mode 100644 configs/7B_train/32768_megatron_ckpt_False.py create mode 100644 configs/7B_train/32768_megatron_ckpt_True.py create mode 100644 configs/7B_train/32768_none_ckpt_False.py create mode 100644 configs/7B_train/32768_none_ckpt_True.py create mode 100644 configs/7B_train/4096_flash-attn_ckpt_False.py create mode 100644 
configs/7B_train/4096_flash-attn_ckpt_True.py create mode 100644 configs/7B_train/4096_flash_ckpt_False.py create mode 100644 configs/7B_train/4096_flash_ckpt_True.py create mode 100644 configs/7B_train/4096_intern_ckpt_False.py create mode 100644 configs/7B_train/4096_intern_ckpt_True.py create mode 100644 configs/7B_train/4096_megatron_ckpt_False.py create mode 100644 configs/7B_train/4096_megatron_ckpt_True.py create mode 100644 configs/7B_train/4096_none_ckpt_False.py create mode 100644 configs/7B_train/4096_none_ckpt_True.py create mode 100644 configs/7B_train/65536_flash-attn_ckpt_False.py create mode 100644 configs/7B_train/65536_flash-attn_ckpt_True.py create mode 100644 configs/7B_train/65536_flash_ckpt_False.py create mode 100644 configs/7B_train/65536_flash_ckpt_True.py create mode 100644 configs/7B_train/65536_intern_ckpt_False.py create mode 100644 configs/7B_train/65536_intern_ckpt_True.py create mode 100644 configs/7B_train/65536_megatron_ckpt_False.py create mode 100644 configs/7B_train/65536_megatron_ckpt_True.py create mode 100644 configs/7B_train/65536_none_ckpt_False.py create mode 100644 configs/7B_train/65536_none_ckpt_True.py create mode 100644 configs/7B_train/8192_flash-attn_ckpt_False.py create mode 100644 configs/7B_train/8192_flash-attn_ckpt_True.py create mode 100644 configs/7B_train/8192_flash_ckpt_False.py create mode 100644 configs/7B_train/8192_flash_ckpt_True.py create mode 100644 configs/7B_train/8192_intern_ckpt_False.py create mode 100644 configs/7B_train/8192_intern_ckpt_True.py create mode 100644 configs/7B_train/8192_megatron_ckpt_False.py create mode 100644 configs/7B_train/8192_megatron_ckpt_True.py create mode 100644 configs/7B_train/8192_none_ckpt_False.py create mode 100644 configs/7B_train/8192_none_ckpt_True.py create mode 100644 configs/generate.py diff --git a/.gitignore b/.gitignore index 8992a0f5..04367e3d 100644 --- a/.gitignore +++ b/.gitignore @@ -145,3 +145,9 @@ core.* llm_ckpts events.* memory_trace +7b_train*/ +13b_train*/ +30b_train*/ +fstp_logs/ +atb +pip diff --git a/configs/13B_template.py b/configs/13B_template.py new file mode 100644 index 00000000..26be3f71 --- /dev/null +++ b/configs/13B_template.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = {seq_len} +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({checkpoint}) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
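In this template, `{seq_len}`, `{sp}`, `{checkpoint}` and `{intern_overlap}` are placeholders that the new configs/generate.py (44 added lines in the diffstat, not shown in this excerpt) substitutes once per combination of sequence length, sequence-parallel mode and checkpoint flag, and analogously for the 7B and 30B templates. A minimal sketch of such a generator follows; the value lists are inferred from the generated file names, the real script may differ, and `str.replace` is used instead of `str.format` so the literal `{BOTO3_IP}` in the template comments needs no escaping.

# Sketch of a generator for the config files added by this patch; value lists
# are inferred from the generated file names (the actual generate.py may differ).
import os

SEQ_LENS = [4096, 8192, 16384, 32768, 65536, 131072, 262144]
SP_MODES = ["none", "megatron", "flash", "flash-attn", "intern"]  # 13B; 30B omits "flash-attn"

with open("configs/13B_template.py") as f:
    template = f.read()

os.makedirs("configs/13B_train", exist_ok=True)
for seq_len in SEQ_LENS:
    for sp in SP_MODES:
        for ckpt in (True, False):
            content = (
                template.replace("{seq_len}", str(seq_len))
                .replace("{sp}", f'"{sp}"')
                .replace("{checkpoint}", str(ckpt))
                # overlap is only enabled for the "intern" sequence-parallel mode
                .replace("{intern_overlap}", str(sp == "intern"))
            )
            with open(f"configs/13B_train/{seq_len}_{sp}_ckpt_{ckpt}.py", "w") as f:
                f.write(content)

Each generated file is then used like any other config, e.g. `python train.py --config configs/13B_train/131072_intern_ckpt_True.py` under the launcher of your choice.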
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only works for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, which disables evaluation + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimizer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint={checkpoint}, # The proportion of layers for activation checkpointing, the optional values are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm",
+ layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp={sp}, intern_overlap={intern_overlap}), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/131072_flash-attn_ckpt_False.py b/configs/13B_train/131072_flash-attn_ckpt_False.py new file mode 100644 index 00000000..28d51af6 --- /dev/null +++ b/configs/13B_train/131072_flash-attn_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 131072 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. 
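The TFLOPS computation named in the commit title lands in internlm/train/training_internlm.py (eight added lines, outside this excerpt). The sketch below shows the usual Megatron-style estimate that such code typically implements; it is a reference approximation, not the patch's exact formula. Note how activation checkpointing (the `checkpoint` field of the model dict) bumps the multiplier from 3 to 4, since the forward pass is recomputed during backward.

def tflops_per_gpu(batch_size, seq_len, num_layers, hidden_size, vocab_size,
                   step_time_s, world_size, activation_checkpoint=False):
    """Rough Megatron-style throughput estimate; not the patch's exact formula."""
    # forward FLOPs of one iteration: 24*B*s*l*h^2 for the transformer stack,
    # where the s/(6h) term covers the attention score/context matmuls, plus
    # the 2*B*s*h*V logits projection onto the vocabulary.
    fwd = (24 * batch_size * seq_len * num_layers * hidden_size**2
           * (1 + seq_len / (6 * hidden_size))
           + 2 * batch_size * seq_len * hidden_size * vocab_size)
    # backward is ~2x forward; activation checkpointing re-runs forward once more.
    factor = 4 if activation_checkpoint else 3
    return factor * fwd / step_time_s / world_size / 1e12

For the 13B shape used in these configs (40 layers, hidden size 5120, vocab 103168), this gives a quick sanity check on the throughput the new code reports.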
+ # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. 
+ * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash-attn", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/131072_flash-attn_ckpt_True.py b/configs/13B_train/131072_flash-attn_ckpt_True.py new file mode 100644 index 00000000..6d1b7ef0 --- /dev/null +++ b/configs/13B_train/131072_flash-attn_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 131072 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. 
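For orientation, the data dict that follows fixes the token budget per optimizer step. With the values used throughout these generated configs (micro_num=4, micro_bsz=2) and this file's SEQ_LEN=131072, the arithmetic per data-parallel rank works out as:

# token budget per optimizer step, per data-parallel rank
SEQ_LEN, micro_bsz, micro_num = 131072, 2, 4
packed_length = micro_bsz * SEQ_LEN          # 262,144 tokens per micro batch
tokens_per_step = micro_num * packed_length  # 1,048,576 tokens per rank per step
assert tokens_per_step == 2**20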
+ # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. 
fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash-attn", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/131072_flash_ckpt_False.py b/configs/13B_train/131072_flash_ckpt_False.py new file mode 100644 index 00000000..dd0f0e89 --- /dev/null +++ b/configs/13B_train/131072_flash_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 131072 +JOB_NAME = "13b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
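The grad_scaler block repeated in each of these configs encodes standard dynamic loss scaling. Below is a sketch of the update rule implied by its comments, with the config's defaults as parameters; it is an illustration of the scheme, not InternLM's actual scaler class.

def update_loss_scale(scale, overflow, clean_steps, overflow_count,
                      growth_interval=1000, growth_factor=2, backoff_factor=0.5,
                      hysteresis=2, min_scale=1, max_scale=2**24):
    """Sketch of the dynamic loss-scaling rule described by the config comments."""
    if overflow:
        overflow_count += 1
        clean_steps = 0
        if overflow_count >= hysteresis:    # tolerate `hysteresis` overflows
            scale = max(scale * backoff_factor, min_scale)
            overflow_count = 0
    else:
        clean_steps += 1
        if clean_steps >= growth_interval:  # 1000 clean steps -> double the scale
            scale = min(scale * growth_factor, max_scale)
            clean_steps = 0
    return scale, clean_steps, overflow_count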
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/131072_flash_ckpt_True.py b/configs/13B_train/131072_flash_ckpt_True.py new file mode 100644 index 00000000..2b9276db --- /dev/null +++ b/configs/13B_train/131072_flash_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 131072 +JOB_NAME = "13b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
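Across the generated files, only the parallel dict at the bottom really varies: `sp` selects the sequence-parallel flavour ("none", "megatron", "flash", "flash-attn" or "intern") and `intern_overlap` is only ever enabled together with sp="intern". A sanity check one could run over a loaded config might look like the following; the helper name is hypothetical.

VALID_SP = {"none", "megatron", "flash", "flash-attn", "intern"}

def check_parallel(parallel: dict) -> None:
    # hypothetical validator, mirroring the docstring above each parallel dict
    tensor = parallel["tensor"]
    assert tensor["sp"] in VALID_SP, f"unknown sp mode: {tensor['sp']}"
    if tensor.get("intern_overlap", False):
        # in the generated configs, overlap is only enabled for the intern mode
        assert tensor["sp"] == "intern"
    assert parallel["pipeline"]["size"] >= 1

check_parallel(dict(
    zero1=dict(size=-1, fsdp=False),
    tensor=dict(size=8, sp="intern", intern_overlap=True),
    pipeline=dict(size=1, interleaved_overlap=True),
))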
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/131072_intern_ckpt_False.py b/configs/13B_train/131072_intern_ckpt_False.py new file mode 100644 index 00000000..182e4ddb --- /dev/null +++ b/configs/13B_train/131072_intern_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 131072 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
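zero1=dict(size=-1, fsdp=False) relies on the rule spelled out in the zero1 docstring repeated in each of these configs: a non-positive size makes the ZeRO-1 group span the whole data-parallel group. A sketch of that resolution logic (the helper name is hypothetical):

def zero1_world_size(size: int, dp_world_size: int) -> int:
    # size <= 0: shard optimizer states across the full dp group (the -1 used here)
    if size <= 0:
        return dp_world_size
    assert 1 <= size <= dp_world_size and dp_world_size % size == 0
    return size  # size == 1 keeps full optimizer states on every dp rank

# e.g. 32 GPUs with tensor=8 leave dp_world_size = 4, so zero1=-1 shards over 4 ranks
assert zero1_world_size(-1, 4) == 4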
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of 'none', 'megatron', 'flash', 'flash-attn' or 'intern'
+        (the set of values used across these generated configs).
+    3. intern_overlap: bool, enable/disable communication-computation overlap; these configs enable it only
+        together with sp="intern".
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/13B_train/131072_intern_ckpt_True.py b/configs/13B_train/131072_intern_ckpt_True.py
new file mode 100644
index 00000000..c23a3c10
--- /dev/null
+++ b/configs/13B_train/131072_intern_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 131072
+JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded; supported values: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; these configs use the 'internlm' type.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that `auto_resume` defaults to True; while it is enabled, the checkpoint path
+    # specified in `load_ckpt_info` will not be loaded.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
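+    # [Editor's note] A hedged reading of the flags above: with enable_save_ckpt=False this
+    # run writes no checkpoints itself, so auto_resume=True can only pick up checkpoints
+    # left in SAVE_CKPT_FOLDER by an earlier run that had saving enabled.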
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of 'none', 'megatron', 'flash', 'flash-attn' or 'intern'
+        (the set of values used across these generated configs).
+    3. intern_overlap: bool, enable/disable communication-computation overlap; these configs enable it only
+        together with sp="intern".
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/13B_train/131072_megatron_ckpt_False.py b/configs/13B_train/131072_megatron_ckpt_False.py
new file mode 100644
index 00000000..935ff98d
--- /dev/null
+++ b/configs/13B_train/131072_megatron_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 131072
+JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded; supported values: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; these configs use the 'internlm' type.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that `auto_resume` defaults to True; while it is enabled, the checkpoint path
+    # specified in `load_ckpt_info` will not be loaded.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
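+    # [Editor's note] Worked example of the packed_length formula documented in the data
+    # dict below: packed_length = micro_bsz * SEQ_LEN = 2 * 131072 = 262144 tokens per
+    # packed micro batch (editor's arithmetic, not an original comment).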
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of 'none', 'megatron', 'flash', 'flash-attn' or 'intern'
+        (the set of values used across these generated configs).
+    3. intern_overlap: bool, enable/disable communication-computation overlap; these configs enable it only
+        together with sp="intern".
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="megatron", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/13B_train/131072_megatron_ckpt_True.py b/configs/13B_train/131072_megatron_ckpt_True.py
new file mode 100644
index 00000000..441166c2
--- /dev/null
+++ b/configs/13B_train/131072_megatron_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 131072
+JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded; supported values: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; these configs use the 'internlm' type.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that `auto_resume` defaults to True; while it is enabled, the checkpoint path
+    # specified in `load_ckpt_info` will not be loaded.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
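+    # [Editor's note] Editor's arithmetic, assuming the data settings defined below: each
+    # rank consumes micro_num * micro_bsz * SEQ_LEN = 4 * 2 * 131072 = 1048576 tokens per
+    # optimizer step, before multiplying by the data parallel world size.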
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of 'none', 'megatron', 'flash', 'flash-attn' or 'intern'
+        (the set of values used across these generated configs).
+    3. intern_overlap: bool, enable/disable communication-computation overlap; these configs enable it only
+        together with sp="intern".
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="megatron", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/13B_train/131072_none_ckpt_False.py b/configs/13B_train/131072_none_ckpt_False.py
new file mode 100644
index 00000000..e43d6044
--- /dev/null
+++ b/configs/13B_train/131072_none_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 131072
+JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded; supported values: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; these configs use the 'internlm' type.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that `auto_resume` defaults to True; while it is enabled, the checkpoint path
+    # specified in `load_ckpt_info` will not be loaded.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
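+    # [Editor's note] A worked trace of the grad_scaler settings below: the loss scale
+    # starts at 2**16 = 65536, halves (backoff_factor=0.5) after hysteresis=2 overflows,
+    # doubles (growth_factor=2) after 1000 overflow-free steps, and is capped at 2**24.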
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of 'none', 'megatron', 'flash', 'flash-attn' or 'intern'
+        (the set of values used across these generated configs).
+    3. intern_overlap: bool, enable/disable communication-computation overlap; these configs enable it only
+        together with sp="intern".
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="none", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/13B_train/131072_none_ckpt_True.py b/configs/13B_train/131072_none_ckpt_True.py
new file mode 100644
index 00000000..0945dbdc
--- /dev/null
+++ b/configs/13B_train/131072_none_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 131072
+JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded; supported values: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; these configs use the 'internlm' type.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that `auto_resume` defaults to True; while it is enabled, the checkpoint path
+    # specified in `load_ckpt_info` will not be loaded.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
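+    # [Editor's note] Rough size check (editor's arithmetic, assuming a standard gated-MLP
+    # transformer): 12 * NUM_LAYER * HIDDEN_SIZE**2 = 12 * 40 * 5120**2 ≈ 12.6B parameters,
+    # plus ~0.5B for the embeddings, consistent with the "13b" in JOB_NAME.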
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of 'none', 'megatron', 'flash', 'flash-attn' or 'intern'
+        (the set of values used across these generated configs).
+    3. intern_overlap: bool, enable/disable communication-computation overlap; these configs enable it only
+        together with sp="intern".
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="none", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/13B_train/16384_flash-attn_ckpt_False.py b/configs/13B_train/16384_flash-attn_ckpt_False.py
new file mode 100644
index 00000000..393e54d3
--- /dev/null
+++ b/configs/13B_train/16384_flash-attn_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 16384
+JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded; supported values: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; these configs use the 'internlm' type.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that `auto_resume` defaults to True; while it is enabled, the checkpoint path
+    # specified in `load_ckpt_info` will not be loaded.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
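+    # [Editor's note] Across this generated sweep the tensor["sp"] field takes the values
+    # "none", "megatron", "flash", "flash-attn" and "intern"; intern_overlap is set to
+    # True only in the sp="intern" configs.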
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of 'none', 'megatron', 'flash', 'flash-attn' or 'intern'
+        (the set of values used across these generated configs).
+    3. intern_overlap: bool, enable/disable communication-computation overlap; these configs enable it only
+        together with sp="intern".
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash-attn", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/13B_train/16384_flash-attn_ckpt_True.py b/configs/13B_train/16384_flash-attn_ckpt_True.py
new file mode 100644
index 00000000..7f7e7ac6
--- /dev/null
+++ b/configs/13B_train/16384_flash-attn_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 16384
+JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded; supported values: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; these configs use the 'internlm' type.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that `auto_resume` defaults to True; while it is enabled, the checkpoint path
+    # specified in `load_ckpt_info` will not be loaded.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
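+    # [Editor's note] In this variant model["checkpoint"] is True below, i.e. activations
+    # are recomputed for all NUM_LAYER = 40 layers during the backward pass, trading extra
+    # compute for lower activation memory at SEQ_LEN = 16384.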
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of 'none', 'megatron', 'flash', 'flash-attn' or 'intern'
+        (the set of values used across these generated configs).
+    3. intern_overlap: bool, enable/disable communication-computation overlap; these configs enable it only
+        together with sp="intern".
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash-attn", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/13B_train/16384_flash_ckpt_False.py b/configs/13B_train/16384_flash_ckpt_False.py
new file mode 100644
index 00000000..cadd215f
--- /dev/null
+++ b/configs/13B_train/16384_flash_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 16384
+JOB_NAME = "13b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded; supported values: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; these configs use the 'internlm' type.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that `auto_resume` defaults to True; while it is enabled, the checkpoint path
+    # specified in `load_ckpt_info` will not be loaded.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
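+    # [Editor's note] Editor's arithmetic for the scheduler below, assuming warmup steps
+    # are floored: warmup_ratio * total_steps = 0.01 * 20 = 0.2, so this short benchmark
+    # run effectively performs no learning rate warmup.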
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of 'none', 'megatron', 'flash', 'flash-attn' or 'intern'
+        (the set of values used across these generated configs).
+    3. intern_overlap: bool, enable/disable communication-computation overlap; these configs enable it only
+        together with sp="intern".
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/13B_train/16384_flash_ckpt_True.py b/configs/13B_train/16384_flash_ckpt_True.py
new file mode 100644
index 00000000..c60ea730
--- /dev/null
+++ b/configs/13B_train/16384_flash_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 16384
+JOB_NAME = "13b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded; supported values: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; these configs use the 'internlm' type.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that `auto_resume` defaults to True; while it is enabled, the checkpoint path
+    # specified in `load_ckpt_info` will not be loaded.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
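+    # [Editor's note] Per the zero1 docstring below, size=-1 (i.e. <= 0) partitions the
+    # optimizer states across the entire data parallel group rather than a smaller subgroup.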
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallelism, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/13B_train/16384_intern_ckpt_False.py b/configs/13B_train/16384_intern_ckpt_False.py
new file mode 100644
index 00000000..e5d6fa6b
--- /dev/null
+++ b/configs/13B_train/16384_intern_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 16384
+JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
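+    # (note: since enable_save_ckpt=False above, these save/snapshot settings should have no effect for this run)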
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro_batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash-attn', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap, which only takes effect when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2.
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/16384_intern_ckpt_True.py b/configs/13B_train/16384_intern_ckpt_True.py new file mode 100644 index 00000000..6ac47ac2 --- /dev/null +++ b/configs/13B_train/16384_intern_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 16384 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
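+    # (note: async_upload only applies to "boto3:" paths, so it should be a no-op with the "local:" save folder used here)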
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/16384_megatron_ckpt_False.py b/configs/13B_train/16384_megatron_ckpt_False.py new file mode 100644 index 00000000..24429ead --- /dev/null +++ b/configs/13B_train/16384_megatron_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 16384 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
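+    # (note: CHECKPOINT_EVERY = 50 exceeds total_steps = 20 below, so no periodic ckpt would ever trigger in this short run)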
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/16384_megatron_ckpt_True.py b/configs/13B_train/16384_megatron_ckpt_True.py new file mode 100644 index 00000000..d79c8207 --- /dev/null +++ b/configs/13B_train/16384_megatron_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 16384 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/16384_none_ckpt_False.py b/configs/13B_train/16384_none_ckpt_False.py new file mode 100644 index 00000000..a30d713a --- /dev/null +++ b/configs/13B_train/16384_none_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 16384 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/16384_none_ckpt_True.py b/configs/13B_train/16384_none_ckpt_True.py new file mode 100644 index 00000000..76483257 --- /dev/null +++ b/configs/13B_train/16384_none_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 16384 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/262144_flash-attn_ckpt_False.py b/configs/13B_train/262144_flash-attn_ckpt_False.py new file mode 100644 index 00000000..fd0be6a7 --- /dev/null +++ b/configs/13B_train/262144_flash-attn_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 262144 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash-attn", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/262144_flash_ckpt_False.py b/configs/13B_train/262144_flash_ckpt_False.py new file mode 100644 index 00000000..5ca332ef --- /dev/null +++ b/configs/13B_train/262144_flash_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 262144 +JOB_NAME = "13b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/262144_flash_ckpt_True.py b/configs/13B_train/262144_flash_ckpt_True.py new file mode 100644 index 00000000..f990655a --- /dev/null +++ b/configs/13B_train/262144_flash_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 262144 +JOB_NAME = "13b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/13B_train/262144_intern_ckpt_False.py b/configs/13B_train/262144_intern_ckpt_False.py
new file mode 100644
index 00000000..7ebcf94f
--- /dev/null
+++ b/configs/13B_train/262144_intern_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 262144
+JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the format of the checkpoint to be loaded, e.g. the 'internlm' format used here.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
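+
+# Note: enable_save_ckpt is False and total_steps = 20 (set in `data` below) is
+# smaller than CHECKPOINT_EVERY = 50, so no checkpoint would be written in this
+# run either way; these short-run configs look benchmark-oriented.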
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num is the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable communication overlap in the low-level optimizer
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of the dp world size.
+            For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode; the configs in this directory use "none", "megatron",
+        "flash"/"flash-attn" and "intern".
+    3. intern_overlap: bool, enable/disable communication/computation overlap, effective for the "intern" mode.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/262144_intern_ckpt_True.py b/configs/13B_train/262144_intern_ckpt_True.py new file mode 100644 index 00000000..e958ac06 --- /dev/null +++ b/configs/13B_train/262144_intern_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 262144 +JOB_NAME = "13b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
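+    # Note: "/dev/shm" above is a RAM-backed tmpfs on Linux, so files staged
+    # there for asynchronous upload consume host memory and vanish on reboot.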
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/262144_megatron_ckpt_False.py b/configs/13B_train/262144_megatron_ckpt_False.py new file mode 100644 index 00000000..31e96f78 --- /dev/null +++ b/configs/13B_train/262144_megatron_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 262144 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
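+    # For reference, per the guide above: content=("all",) would restore model,
+    # sampler, optimizer and scheduler state together instead of weights only.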
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/262144_megatron_ckpt_True.py b/configs/13B_train/262144_megatron_ckpt_True.py new file mode 100644 index 00000000..2339244b --- /dev/null +++ b/configs/13B_train/262144_megatron_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 262144 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/262144_none_ckpt_False.py b/configs/13B_train/262144_none_ckpt_False.py new file mode 100644 index 00000000..41d55e91 --- /dev/null +++ b/configs/13B_train/262144_none_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 262144 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
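+    # Example (per the guide above): to initialize from MODEL_ONLY_FOLDER rather
+    # than auto-resuming, set auto_resume=False so load_ckpt_info takes effect.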
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/262144_none_ckpt_True.py b/configs/13B_train/262144_none_ckpt_True.py new file mode 100644 index 00000000..4f2da605 --- /dev/null +++ b/configs/13B_train/262144_none_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 262144 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/32768_flash-attn_ckpt_False.py b/configs/13B_train/32768_flash-attn_ckpt_False.py new file mode 100644 index 00000000..3eb0f493 --- /dev/null +++ b/configs/13B_train/32768_flash-attn_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 32768 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
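+    # With CHECKPOINT_EVERY = 50, oss_snapshot_freq evaluates to int(50 / 2) = 25,
+    # i.e. one snapshot every 25 steps, twice per full checkpoint interval.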
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash-attn", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/32768_flash-attn_ckpt_True.py b/configs/13B_train/32768_flash-attn_ckpt_True.py new file mode 100644 index 00000000..26b06ef3 --- /dev/null +++ b/configs/13B_train/32768_flash-attn_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 32768 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash-attn", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/32768_flash_ckpt_False.py b/configs/13B_train/32768_flash_ckpt_False.py new file mode 100644 index 00000000..da30a4dd --- /dev/null +++ b/configs/13B_train/32768_flash_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 32768 +JOB_NAME = "13b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
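+    # Since enable_save_ckpt=False, the save-related settings above appear inert;
+    # set it to True to activate periodic saving every CHECKPOINT_EVERY steps.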
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/32768_flash_ckpt_True.py b/configs/13B_train/32768_flash_ckpt_True.py new file mode 100644 index 00000000..20d415a5 --- /dev/null +++ b/configs/13B_train/32768_flash_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 32768 +JOB_NAME = "13b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
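+ # Note: with enable_save_ckpt=False above, the save/upload settings in this dict
+ # are effectively inert; no checkpoints are written during these short runs.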
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/32768_intern_ckpt_False.py b/configs/13B_train/32768_intern_ckpt_False.py new file mode 100644 index 00000000..05ab5285 --- /dev/null +++ b/configs/13B_train/32768_intern_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 32768 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
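+ # /dev/shm is a RAM-backed tmpfs, so staging async uploads there avoids disk
+ # I/O at the cost of host memory.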
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/32768_intern_ckpt_True.py b/configs/13B_train/32768_intern_ckpt_True.py new file mode 100644 index 00000000..273a812d --- /dev/null +++ b/configs/13B_train/32768_intern_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 32768 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
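+ # content=("model",) restores weights only; optimizer, sampler and scheduler
+ # state are not loaded (see the 'content' guide above).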
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/32768_megatron_ckpt_False.py b/configs/13B_train/32768_megatron_ckpt_False.py new file mode 100644 index 00000000..c8db542d --- /dev/null +++ b/configs/13B_train/32768_megatron_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 32768 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
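+ # CHECKPOINT_EVERY (50) exceeds total_steps (20) in the data dict below, so a
+ # full checkpoint save would never trigger here even if saving were enabled.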
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/32768_megatron_ckpt_True.py b/configs/13B_train/32768_megatron_ckpt_True.py new file mode 100644 index 00000000..9ff56012 --- /dev/null +++ b/configs/13B_train/32768_megatron_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 32768 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
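+ # MODEL_ONLY_FOLDER ("local:llm_ckpts/xxxx") is a placeholder; point it at a
+ # real checkpoint directory before relying on load_ckpt_info.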
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/32768_none_ckpt_False.py b/configs/13B_train/32768_none_ckpt_False.py new file mode 100644 index 00000000..a02e0711 --- /dev/null +++ b/configs/13B_train/32768_none_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 32768 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
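+ # Path prefixes select the storage backend: "local:" for a filesystem path,
+ # "boto3:" for object storage (see the folder-format comments above).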
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/32768_none_ckpt_True.py b/configs/13B_train/32768_none_ckpt_True.py new file mode 100644 index 00000000..b9b17e3c --- /dev/null +++ b/configs/13B_train/32768_none_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 32768 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
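+ # Both load_ckpt_folder and load_ckpt_info are set; load_ckpt_info is the form
+ # documented above, and the bare load_ckpt_folder string appears to be a legacy field.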
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/4096_flash-attn_ckpt_False.py b/configs/13B_train/4096_flash-attn_ckpt_False.py new file mode 100644 index 00000000..8e4459ea --- /dev/null +++ b/configs/13B_train/4096_flash-attn_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
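+ # async_upload only takes effect for boto3 paths; with the local
+ # SAVE_CKPT_FOLDER above it is effectively a no-op.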
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash-attn", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/4096_flash-attn_ckpt_True.py b/configs/13B_train/4096_flash-attn_ckpt_True.py new file mode 100644 index 00000000..a8f5e39b --- /dev/null +++ b/configs/13B_train/4096_flash-attn_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
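+ # Reminder: while auto_resume=True, the latest checkpoint under save_ckpt_folder
+ # takes precedence over the path given in load_ckpt_info.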
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash-attn", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/4096_flash_ckpt_False.py b/configs/13B_train/4096_flash_ckpt_False.py new file mode 100644 index 00000000..517b46e4 --- /dev/null +++ b/configs/13B_train/4096_flash_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "13b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
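+    # Hypothetical illustration (values not part of this run): to resume every state from step 50,
+    # load_ckpt_info could be set to dict(path=SAVE_CKPT_FOLDER + "/50", content=("all",), ckpt_type="internlm").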
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/4096_flash_ckpt_True.py b/configs/13B_train/4096_flash_ckpt_True.py new file mode 100644 index 00000000..eacfcdfd --- /dev/null +++ b/configs/13B_train/4096_flash_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "13b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/4096_intern_ckpt_False.py b/configs/13B_train/4096_intern_ckpt_False.py new file mode 100644 index 00000000..5ecf2d66 --- /dev/null +++ b/configs/13B_train/4096_intern_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/4096_intern_ckpt_True.py b/configs/13B_train/4096_intern_ckpt_True.py new file mode 100644 index 00000000..b70acb01 --- /dev/null +++ b/configs/13B_train/4096_intern_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/4096_megatron_ckpt_False.py b/configs/13B_train/4096_megatron_ckpt_False.py new file mode 100644 index 00000000..2e847a64 --- /dev/null +++ b/configs/13B_train/4096_megatron_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/4096_megatron_ckpt_True.py b/configs/13B_train/4096_megatron_ckpt_True.py new file mode 100644 index 00000000..d8ba2c57 --- /dev/null +++ b/configs/13B_train/4096_megatron_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/4096_none_ckpt_False.py b/configs/13B_train/4096_none_ckpt_False.py new file mode 100644 index 00000000..f8bbdfc5 --- /dev/null +++ b/configs/13B_train/4096_none_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/4096_none_ckpt_True.py b/configs/13B_train/4096_none_ckpt_True.py new file mode 100644 index 00000000..d8f8ec7e --- /dev/null +++ b/configs/13B_train/4096_none_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/65536_flash-attn_ckpt_False.py b/configs/13B_train/65536_flash-attn_ckpt_False.py new file mode 100644 index 00000000..09367f5a --- /dev/null +++ b/configs/13B_train/65536_flash-attn_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel: sequence parallelism is configured through the tensor parallel `sp` mode above rather
+        than a standalone `sequence_parallel` flag; sp="none" disables it.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash-attn", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/13B_train/65536_flash-attn_ckpt_True.py b/configs/13B_train/65536_flash-attn_ckpt_True.py
new file mode 100644
index 00000000..dc283a92
--- /dev/null
+++ b/configs/13B_train/65536_flash-attn_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 65536
+JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only the 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
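+    # Illustrative note: /dev/shm is typically a RAM-backed tmpfs, so staging temporary upload
+    # files there presumably avoids contending with training I/O on disk; adjust if unavailable.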
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro_batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of "none", "megatron", "flash", "flash-attn" or "intern";
+        "none" disables sequence parallelism.
+    3. intern_overlap: bool, enable/disable communication-computation overlap for the "intern" sp mode,
+        defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash-attn", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/65536_flash_ckpt_False.py b/configs/13B_train/65536_flash_ckpt_False.py new file mode 100644 index 00000000..482d5114 --- /dev/null +++ b/configs/13B_train/65536_flash_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "13b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/65536_flash_ckpt_True.py b/configs/13B_train/65536_flash_ckpt_True.py new file mode 100644 index 00000000..66051f83 --- /dev/null +++ b/configs/13B_train/65536_flash_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "13b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/65536_intern_ckpt_False.py b/configs/13B_train/65536_intern_ckpt_False.py new file mode 100644 index 00000000..f829652a --- /dev/null +++ b/configs/13B_train/65536_intern_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/65536_intern_ckpt_True.py b/configs/13B_train/65536_intern_ckpt_True.py new file mode 100644 index 00000000..4e94d0e3 --- /dev/null +++ b/configs/13B_train/65536_intern_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/65536_megatron_ckpt_False.py b/configs/13B_train/65536_megatron_ckpt_False.py new file mode 100644 index 00000000..a9293334 --- /dev/null +++ b/configs/13B_train/65536_megatron_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/65536_megatron_ckpt_True.py b/configs/13B_train/65536_megatron_ckpt_True.py new file mode 100644 index 00000000..845e32bc --- /dev/null +++ b/configs/13B_train/65536_megatron_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/65536_none_ckpt_False.py b/configs/13B_train/65536_none_ckpt_False.py new file mode 100644 index 00000000..52ce3c52 --- /dev/null +++ b/configs/13B_train/65536_none_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/65536_none_ckpt_True.py b/configs/13B_train/65536_none_ckpt_True.py new file mode 100644 index 00000000..de5532e1 --- /dev/null +++ b/configs/13B_train/65536_none_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
diff --git a/configs/13B_train/8192_flash-attn_ckpt_False.py b/configs/13B_train/8192_flash-attn_ckpt_False.py
new file mode 100644
index 00000000..3324c290
--- /dev/null
+++ b/configs/13B_train/8192_flash-attn_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 8192
+JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False, # The proportion of layers for activation checkpointing, the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash-attn', 'flash' or 'intern';
+        the 'intern' mode can additionally overlap communication with computation via intern_overlap=True.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel: controlled by the 'sp' field of the tensor parallel dict, not a standalone flag.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash-attn", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
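Editorial note: for reference, the batch arithmetic implied by the `data` dict above. With micro_bsz=2 and SEQ_LEN=8192, packed_length is 16384 tokens, and micro_num=4 micro-batches give 65536 tokens per rank per gradient update; the global figure multiplies by the data-parallel size (assumed here to be 4):

    SEQ_LEN, micro_bsz, micro_num = 8192, 2, 4
    packed_length = micro_bsz * SEQ_LEN               # 16384
    tokens_per_rank_step = micro_num * packed_length  # 65536
    dp_size = 4                                       # hypothetical world_size // (tensor * pipeline)
    print(packed_length, tokens_per_rank_step, tokens_per_rank_step * dp_size)  # 16384 65536 262144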
diff --git a/configs/13B_train/8192_flash-attn_ckpt_True.py b/configs/13B_train/8192_flash-attn_ckpt_True.py
new file mode 100644
index 00000000..317e0f32
--- /dev/null
+++ b/configs/13B_train/8192_flash-attn_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 8192
+JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True, # The proportion of layers for activation checkpointing, the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash-attn', 'flash' or 'intern';
+        the 'intern' mode can additionally overlap communication with computation via intern_overlap=True.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel: controlled by the 'sp' field of the tensor parallel dict, not a standalone flag.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash-attn", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
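Editorial note: the `grad_scaler` fields describe a standard dynamic loss-scaling policy. A self-contained sketch of that policy under the semantics stated in the comments (this class is illustrative, not InternLM's implementation):

    class DynamicScaler:
        def __init__(self, initial_scale=2**16, growth_factor=2, backoff_factor=0.5,
                     growth_interval=1000, hysteresis=2, min_scale=1, max_scale=2**24):
            self.scale = initial_scale
            self.growth_factor, self.backoff_factor = growth_factor, backoff_factor
            self.growth_interval, self.hysteresis = growth_interval, hysteresis
            self.min_scale, self.max_scale = min_scale, max_scale
            self._good_steps, self._overflows = 0, 0

        def update(self, found_overflow: bool):
            if found_overflow:
                self._good_steps = 0
                self._overflows += 1
                if self._overflows >= self.hysteresis:  # tolerate `hysteresis` overflows before backing off
                    self.scale = max(self.scale * self.backoff_factor, self.min_scale)
                    self._overflows = 0
            else:
                self._good_steps += 1
                if self._good_steps >= self.growth_interval:  # grow after a streak of clean steps
                    self.scale = min(self.scale * self.growth_factor, self.max_scale)
                    self._good_steps = 0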
diff --git a/configs/13B_train/8192_flash_ckpt_False.py b/configs/13B_train/8192_flash_ckpt_False.py
new file mode 100644
index 00000000..d645dc1b
--- /dev/null
+++ b/configs/13B_train/8192_flash_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 8192
+JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash") + "_" + str(False)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False, # The proportion of layers for activation checkpointing, the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash-attn', 'flash' or 'intern';
+        the 'intern' mode can additionally overlap communication with computation via intern_overlap=True.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel: controlled by the 'sp' field of the tensor parallel dict, not a standalone flag.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
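Editorial note: the `lr_scheduler` dict above implies warmup_ratio * total_steps warmup steps before decaying from `adam.lr` toward eta_min. A sketch assuming linear warmup and cosine decay (the exact curve used by the repo's scheduler is not shown in this patch):

    import math

    def lr_at(step, total_steps=20, warmup_ratio=0.01, base_lr=1e-4, eta_min=1e-5):
        warmup_steps = max(1, int(warmup_ratio * total_steps))  # at least one warmup step
        if step < warmup_steps:
            return base_lr * (step + 1) / warmup_steps          # linear warmup (assumed shape)
        # cosine decay to eta_min (assumed shape)
        progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
        return eta_min + 0.5 * (base_lr - eta_min) * (1 + math.cos(math.pi * progress))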
diff --git a/configs/13B_train/8192_flash_ckpt_True.py b/configs/13B_train/8192_flash_ckpt_True.py
new file mode 100644
index 00000000..425859c0
--- /dev/null
+++ b/configs/13B_train/8192_flash_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 8192
+JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash") + "_" + str(True)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True, # The proportion of layers for activation checkpointing, the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash-attn', 'flash' or 'intern';
+        the 'intern' mode can additionally overlap communication with computation via intern_overlap=True.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel: controlled by the 'sp' field of the tensor parallel dict, not a standalone flag.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
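Editorial note: `model.checkpoint` accepts True/False or a float in [0, 1], read per its comment as the proportion of layers whose activations are recomputed in backward. A hypothetical selector under that reading:

    import math

    def layers_to_checkpoint(checkpoint, num_layers=40):
        ratio = 1.0 if checkpoint is True else (0.0 if checkpoint is False else float(checkpoint))
        n = math.ceil(ratio * num_layers)
        return list(range(n))  # checkpoint the first n transformer layers

    print(len(layers_to_checkpoint(True)))  # 40
    print(len(layers_to_checkpoint(0.25)))  # 10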
diff --git a/configs/13B_train/8192_intern_ckpt_False.py b/configs/13B_train/8192_intern_ckpt_False.py
new file mode 100644
index 00000000..0b4fb8a2
--- /dev/null
+++ b/configs/13B_train/8192_intern_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 8192
+JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False, # The proportion of layers for activation checkpointing, the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash-attn', 'flash' or 'intern';
+        the 'intern' mode can additionally overlap communication with computation via intern_overlap=True.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel: controlled by the 'sp' field of the tensor parallel dict, not a standalone flag.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
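Editorial note: only the sp="intern" configs set intern_overlap=True. The general idea, sketched below with plain torch.distributed calls rather than this patch's kernels, is to prefetch the next layer's sharded weights with an asynchronous all-gather while the current layer computes:

    import torch
    import torch.distributed as dist
    import torch.nn.functional as F

    def forward_with_prefetch(shards, x, group=None):
        """shards[i] is this rank's slice of layer i's weight (hypothetical layout)."""
        world = dist.get_world_size(group)
        bufs = [torch.empty_like(shards[0]) for _ in range(world)]
        work = dist.all_gather(bufs, shards[0], group=group, async_op=True)
        for i in range(len(shards)):
            work.wait()                          # full weight for layer i is ready
            weight = torch.cat(bufs, dim=0)
            if i + 1 < len(shards):              # launch the gather for layer i+1 now,
                bufs = [torch.empty_like(shards[i + 1]) for _ in range(world)]
                work = dist.all_gather(bufs, shards[i + 1], group=group, async_op=True)
            x = F.linear(x, weight)              # so it overlaps with this compute
        return x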
diff --git a/configs/13B_train/8192_intern_ckpt_True.py b/configs/13B_train/8192_intern_ckpt_True.py
new file mode 100644
index 00000000..b42cb769
--- /dev/null
+++ b/configs/13B_train/8192_intern_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 8192
+JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True, # The proportion of layers for activation checkpointing, the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash-attn', 'flash' or 'intern';
+        the 'intern' mode can additionally overlap communication with computation via intern_overlap=True.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel: controlled by the 'sp' field of the tensor parallel dict, not a standalone flag.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
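Editorial note: the long comment block in `ckpt` describes the auto_resume rule. A compact restatement as a hypothetical helper (the step-numbered subdirectory layout, e.g. "49", is assumed from LOAD_CKPT_FOLDER above):

    import os

    def resolve_ckpt(save_folder, load_ckpt_info, auto_resume=True):
        if auto_resume:
            steps = []
            if os.path.isdir(save_folder):
                steps = sorted(int(d) for d in os.listdir(save_folder) if d.isdigit())
            # auto_resume only ever looks at save_ckpt_folder, never load_ckpt_info
            return os.path.join(save_folder, str(steps[-1])) if steps else None
        # auto_resume=False: honor load_ckpt_info, or train from scratch when it is None
        return None if load_ckpt_info is None else load_ckpt_info["path"]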
diff --git a/configs/13B_train/8192_megatron_ckpt_False.py b/configs/13B_train/8192_megatron_ckpt_False.py
new file mode 100644
index 00000000..e2191937
--- /dev/null
+++ b/configs/13B_train/8192_megatron_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 8192
+JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False, # The proportion of layers for activation checkpointing, the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash-attn', 'flash' or 'intern';
+        the 'intern' mode can additionally overlap communication with computation via intern_overlap=True.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel: controlled by the 'sp' field of the tensor parallel dict, not a standalone flag.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="megatron", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
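Editorial note: the configs/13B_train files differ only in sequence length, sp mode, and the activation-checkpointing flag, so the grid could be produced mechanically. A sketch of the naming scheme as inferred from the filenames in this patch:

    from itertools import product

    for seq_len, sp, use_ckpt in product((8192, 65536),
                                         ("none", "megatron", "flash-attn", "flash", "intern"),
                                         (False, True)):
        name = f"{seq_len}_{sp}_ckpt_{use_ckpt}.py"
        tensor = dict(size=8, sp=sp, intern_overlap=(sp == "intern"))  # overlap only for 'intern'
        print(name, tensor)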
diff --git a/configs/13B_train/8192_megatron_ckpt_True.py b/configs/13B_train/8192_megatron_ckpt_True.py
new file mode 100644
index 00000000..5123c412
--- /dev/null
+++ b/configs/13B_train/8192_megatron_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 8192
+JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True, # The proportion of layers for activation checkpointing, the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash-attn', 'flash' or 'intern';
+        the 'intern' mode can additionally overlap communication with computation via intern_overlap=True.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel: controlled by the 'sp' field of the tensor parallel dict, not a standalone flag.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="megatron", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
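Editorial note: `loss.label_smoothing=0` keeps plain cross-entropy; a nonzero epsilon would mix the one-hot target with a uniform distribution over the vocabulary. A reference formulation (the standard definition, not this repo's kernel):

    import torch.nn.functional as F

    def smoothed_cross_entropy(logits, target, eps=0.0):
        logp = F.log_softmax(logits, dim=-1)
        nll = -logp.gather(-1, target.unsqueeze(-1)).squeeze(-1)  # eps = 0 reduces to this term
        return ((1 - eps) * nll - eps * logp.mean(dim=-1)).mean()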
diff --git a/configs/13B_train/8192_none_ckpt_False.py b/configs/13B_train/8192_none_ckpt_False.py
new file mode 100644
index 00000000..c9d9c050
--- /dev/null
+++ b/configs/13B_train/8192_none_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 8192
+JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False)
+HIDDEN_SIZE = 5120
+NUM_ATTENTION_HEAD = 40
+MLP_RATIO = 8 / 3
+NUM_LAYER = 40
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False, # The proportion of layers for activation checkpointing, the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash-attn', 'flash' or 'intern';
+        the 'intern' mode can additionally overlap communication with computation via intern_overlap=True.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel: controlled by the 'sp' field of the tensor parallel dict, not a standalone flag.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="none", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/13B_train/8192_none_ckpt_True.py b/configs/13B_train/8192_none_ckpt_True.py new file mode 100644 index 00000000..182ec21f --- /dev/null +++ b/configs/13B_train/8192_none_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 8192 +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) +HIDDEN_SIZE = 5120 +NUM_ATTENTION_HEAD = 40 +MLP_RATIO = 8 / 3 +NUM_LAYER = 40 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="none", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_template.py b/configs/30B_template.py
new file mode 100644
index 00000000..7a32015e
--- /dev/null
+++ b/configs/30B_template.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = {seq_len}
+JOB_NAME = "30b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({checkpoint})
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' specifies which states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only the 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
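+    # NOTE (assumption): the {seq_len}, {sp}, {checkpoint} and {intern_overlap}
+    # fields in this file are str.format-style placeholders, presumably rendered
+    # into the concrete configs under configs/*_train/ by a sweep script. A
+    # minimal hypothetical sketch (script and paths assumed, not part of this
+    # patch):
+    #     cfg = open("configs/30B_template.py").read()
+    #     for key, val in {"{seq_len}": "131072", "{sp}": '"flash"',
+    #                      "{checkpoint}": "False", "{intern_overlap}": "False"}.items():
+    #         cfg = cfg.replace(key, val)  # plain replace; .format() would trip on
+    #                                      # unrelated braces such as {BOTO3_IP}
+    #     open("configs/30B_train/131072_flash_ckpt_False.py", "w").write(cfg)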
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro_batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable communication overlap in the low-level optimizer
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint={checkpoint},  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+            For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of 'none', 'megatron', 'flash' or 'intern', matching the
+        `sp` field set below; modes other than 'none' imply sequence parallelism.
+    3. intern_overlap: bool, enable/disable communication-computation overlap; in these configs it is only
+        enabled together with sp='intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2.
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp={sp}, intern_overlap={intern_overlap}), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/131072_flash_ckpt_False.py b/configs/30B_train/131072_flash_ckpt_False.py new file mode 100644 index 00000000..3af48f3e --- /dev/null +++ b/configs/30B_train/131072_flash_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 131072 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
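+    # NOTE: the file name encodes the sweep axes as <seq_len>_<sp>_ckpt_<bool>.py,
+    # mirroring SEQ_LEN, the tensor parallel `sp` mode and `model.checkpoint`
+    # inside each generated config.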
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/131072_flash_ckpt_True.py b/configs/30B_train/131072_flash_ckpt_True.py new file mode 100644 index 00000000..4bd249bc --- /dev/null +++ b/configs/30B_train/131072_flash_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 131072 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/131072_intern_ckpt_False.py b/configs/30B_train/131072_intern_ckpt_False.py new file mode 100644 index 00000000..77b176d2 --- /dev/null +++ b/configs/30B_train/131072_intern_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 131072 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
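+    # NOTE: across these sweep configs, sp="intern" is the only mode paired with
+    # intern_overlap=True (see the parallel dict at the bottom of this file); the
+    # flag appears to gate communication/computation overlap for that mode and
+    # stays False for the "none"/"megatron"/"flash" variants.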
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/131072_intern_ckpt_True.py b/configs/30B_train/131072_intern_ckpt_True.py new file mode 100644 index 00000000..38a1db3b --- /dev/null +++ b/configs/30B_train/131072_intern_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 131072 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/131072_megatron_ckpt_False.py b/configs/30B_train/131072_megatron_ckpt_False.py new file mode 100644 index 00000000..49879303 --- /dev/null +++ b/configs/30B_train/131072_megatron_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 131072 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
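+    # NOTE: this variant keeps model.checkpoint=False at SEQ_LEN = 131072; its
+    # companion 131072_megatron_ckpt_True.py sets it to True, trading recompute
+    # time for activation memory.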
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/131072_megatron_ckpt_True.py b/configs/30B_train/131072_megatron_ckpt_True.py new file mode 100644 index 00000000..d911d381 --- /dev/null +++ b/configs/30B_train/131072_megatron_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 131072 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/131072_none_ckpt_False.py b/configs/30B_train/131072_none_ckpt_False.py new file mode 100644 index 00000000..78b3c9a8 --- /dev/null +++ b/configs/30B_train/131072_none_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 131072 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
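+    # NOTE: sp="none" below appears to be the baseline setting: tensor parallel
+    # of size 8 without a sequence-parallel optimization, against which the
+    # "megatron", "flash" and "intern" variants are swept.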
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="none", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/131072_none_ckpt_True.py b/configs/30B_train/131072_none_ckpt_True.py
new file mode 100644
index 00000000..941279e7
--- /dev/null
+++ b/configs/30B_train/131072_none_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 131072
+JOB_NAME = "7b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
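+    # i.e. with CHECKPOINT_EVERY = 50, a snapshot is saved every 25 steps (twice per full checkpoint interval).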
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro_batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap, only effective when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2.
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="none", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/16384_flash_ckpt_False.py b/configs/30B_train/16384_flash_ckpt_False.py
new file mode 100644
index 00000000..779a10bc
--- /dev/null
+++ b/configs/30B_train/16384_flash_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 16384
+JOB_NAME = "7b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
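+    # i.e. with CHECKPOINT_EVERY = 50, a snapshot is saved every 25 steps (twice per full checkpoint interval).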
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro_batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap, only effective when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2.
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/16384_flash_ckpt_True.py b/configs/30B_train/16384_flash_ckpt_True.py
new file mode 100644
index 00000000..0498e2c4
--- /dev/null
+++ b/configs/30B_train/16384_flash_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 16384
+JOB_NAME = "7b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
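+    # i.e. with CHECKPOINT_EVERY = 50, a snapshot is saved every 25 steps (twice per full checkpoint interval).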
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro_batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap, only effective when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2.
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/16384_intern_ckpt_False.py b/configs/30B_train/16384_intern_ckpt_False.py
new file mode 100644
index 00000000..309a33f0
--- /dev/null
+++ b/configs/30B_train/16384_intern_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 16384
+JOB_NAME = "7b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
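+    # i.e. with CHECKPOINT_EVERY = 50, a snapshot is saved every 25 steps (twice per full checkpoint interval).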
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro_batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap, only effective when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2.
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/16384_intern_ckpt_True.py b/configs/30B_train/16384_intern_ckpt_True.py
new file mode 100644
index 00000000..23c977a5
--- /dev/null
+++ b/configs/30B_train/16384_intern_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 16384
+JOB_NAME = "7b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
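+    # i.e. with CHECKPOINT_EVERY = 50, a snapshot is saved every 25 steps (twice per full checkpoint interval).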
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro_batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap, only effective when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2.
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/16384_megatron_ckpt_False.py b/configs/30B_train/16384_megatron_ckpt_False.py
new file mode 100644
index 00000000..8576aa76
--- /dev/null
+++ b/configs/30B_train/16384_megatron_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 16384
+JOB_NAME = "7b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
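+    # i.e. with CHECKPOINT_EVERY = 50, a snapshot is saved every 25 steps (twice per full checkpoint interval).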
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro_batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap, only effective when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2.
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="megatron", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/16384_megatron_ckpt_True.py b/configs/30B_train/16384_megatron_ckpt_True.py
new file mode 100644
index 00000000..460aba3b
--- /dev/null
+++ b/configs/30B_train/16384_megatron_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 16384
+JOB_NAME = "7b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
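+    # i.e. with CHECKPOINT_EVERY = 50, a snapshot is saved every 25 steps (twice per full checkpoint interval).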
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro_batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap, only effective when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2.
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="megatron", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/16384_none_ckpt_False.py b/configs/30B_train/16384_none_ckpt_False.py
new file mode 100644
index 00000000..4ca50666
--- /dev/null
+++ b/configs/30B_train/16384_none_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 16384
+JOB_NAME = "7b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
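+    # i.e. with CHECKPOINT_EVERY = 50, a snapshot is saved every 25 steps (twice per full checkpoint interval).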
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro_batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap, only effective when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2.
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="none", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/16384_none_ckpt_True.py b/configs/30B_train/16384_none_ckpt_True.py
new file mode 100644
index 00000000..c7987e0d
--- /dev/null
+++ b/configs/30B_train/16384_none_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 16384
+JOB_NAME = "7b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
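+    # i.e. with CHECKPOINT_EVERY = 50, a snapshot is saved every 25 steps (twice per full checkpoint interval).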
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro_batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap, only effective when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2.
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/262144_flash_ckpt_False.py b/configs/30B_train/262144_flash_ckpt_False.py new file mode 100644 index 00000000..10d71d9c --- /dev/null +++ b/configs/30B_train/262144_flash_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 262144 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
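+    # A sketch of the common loading setups implied by the guide comments above
+    # (illustrative combinations, not additional config keys):
+    #   resume interrupted training: auto_resume=True (latest ckpt under save_ckpt_folder)
+    #   fine-tune from a model:      auto_resume=False, load_ckpt_info=dict(
+    #                                    path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm")
+    #   train from scratch:          auto_resume=False, load_ckpt_info=None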
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of the dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap; only takes effect when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/262144_flash_ckpt_True.py b/configs/30B_train/262144_flash_ckpt_True.py
new file mode 100644
index 00000000..a1990dbb
--- /dev/null
+++ b/configs/30B_train/262144_flash_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 262144
+JOB_NAME = "7b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
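+    # A rough throughput figure from the data section below, assuming full sample packing:
+    # packed_length = micro_bsz * SEQ_LEN = 2 * 262144 = 524288 tokens per micro batch, so
+    # one optimizer step consumes micro_num * packed_length = 4 * 524288 = 2097152 tokens
+    # per data-parallel rank.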
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of the dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap; only takes effect when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/262144_intern_ckpt_False.py b/configs/30B_train/262144_intern_ckpt_False.py
new file mode 100644
index 00000000..f8ec6a2f
--- /dev/null
+++ b/configs/30B_train/262144_intern_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 262144
+JOB_NAME = "7b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
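+    # Note on this variant: the parallel section at the end of this file pairs sp="intern"
+    # with intern_overlap=True, i.e. the optimized sequence-parallel path this patch adds,
+    # which presumably overlaps the weight all-gather/reduce-scatter with computation; the
+    # "none"/"megatron"/"flash" variants in this folder serve as baselines.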
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of the dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap; only takes effect when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/262144_intern_ckpt_True.py b/configs/30B_train/262144_intern_ckpt_True.py
new file mode 100644
index 00000000..c5afa46b
--- /dev/null
+++ b/configs/30B_train/262144_intern_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 262144
+JOB_NAME = "7b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
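+    # Note on this variant: model["checkpoint"]=True below recomputes activations in the
+    # backward pass instead of storing them, which is presumably what lets the 262144-token
+    # sequence length fit in memory; the _ckpt_False twin of this file is identical except
+    # for that flag.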
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of the dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap; only takes effect when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/262144_megatron_ckpt_False.py b/configs/30B_train/262144_megatron_ckpt_False.py
new file mode 100644
index 00000000..412da179
--- /dev/null
+++ b/configs/30B_train/262144_megatron_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 262144
+JOB_NAME = "7b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of the dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap; only takes effect when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="megatron", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/262144_megatron_ckpt_True.py b/configs/30B_train/262144_megatron_ckpt_True.py
new file mode 100644
index 00000000..79affb19
--- /dev/null
+++ b/configs/30B_train/262144_megatron_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 262144
+JOB_NAME = "7b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
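+    # A rough parameter count for the model defined below, assuming a LLaMA-style gated MLP:
+    # per layer ~ 4*H^2 (attention) + 3*H*(8/3*H) (MLP) = 12*H^2 = 12 * 6144^2 ~ 0.45B,
+    # times NUM_LAYER = 60 gives ~ 27.2B, plus ~ 0.63B of embedding weights (103168 * 6144),
+    # consistent with the 30B_train folder name.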
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of the dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap; only takes effect when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="megatron", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/262144_none_ckpt_False.py b/configs/30B_train/262144_none_ckpt_False.py
new file mode 100644
index 00000000..e6fbe1eb
--- /dev/null
+++ b/configs/30B_train/262144_none_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 262144
+JOB_NAME = "7b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of the dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap; only takes effect when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="none", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/262144_none_ckpt_True.py b/configs/30B_train/262144_none_ckpt_True.py
new file mode 100644
index 00000000..d507c30b
--- /dev/null
+++ b/configs/30B_train/262144_none_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 262144
+JOB_NAME = "7b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
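+    # Worked numbers for the lr_scheduler below, assuming warmup_ratio is interpreted as a
+    # fraction of total_steps: warmup lasts int(0.01 * 20) = 0 steps at these benchmark
+    # settings, after which the lr presumably decays from adam["lr"]=1e-4 toward eta_min=1e-5.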
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of the dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap; only takes effect when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="none", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/32768_flash_ckpt_False.py b/configs/30B_train/32768_flash_ckpt_False.py
new file mode 100644
index 00000000..6bac5b31
--- /dev/null
+++ b/configs/30B_train/32768_flash_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 32768
+JOB_NAME = "7b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/32768_flash_ckpt_True.py b/configs/30B_train/32768_flash_ckpt_True.py
new file mode 100644
index 00000000..f21c9983
--- /dev/null
+++ b/configs/30B_train/32768_flash_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 32768
+JOB_NAME = "30b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
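The checkpoint cadence above couples oss_snapshot_freq to CHECKPOINT_EVERY; a small sketch of the implied schedule (assuming both fire on exact step multiples, which the config itself does not pin down):

CHECKPOINT_EVERY = 50
oss_snapshot_freq = int(CHECKPOINT_EVERY / 2)  # snapshots twice as often as full ckpts
full_ckpts = [s for s in range(1, 201) if s % CHECKPOINT_EVERY == 0]
snapshots = [s for s in range(1, 201) if s % oss_snapshot_freq == 0]
assert full_ckpts == [50, 100, 150, 200] and len(snapshots) == 8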
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of the dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap, which only takes effect when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
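zero1=-1 above defers to the data-parallel group, whose size falls out of the tensor and pipeline splits; a minimal sketch of that arithmetic (world_size is an assumed example value):

world_size = 32                  # assumed: 4 nodes x 8 GPUs
tensor_size, pipeline_size = 8, 1
dp_size = world_size // (tensor_size * pipeline_size)  # -> 4
zero1_size = -1                  # from zero1=dict(size=-1, ...) above
if zero1_size <= 0:
    zero1_size = dp_size         # shard optimizer states across the whole dp group
assert 1 <= zero1_size <= dp_size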
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/32768_intern_ckpt_False.py b/configs/30B_train/32768_intern_ckpt_False.py
new file mode 100644
index 00000000..79728d64
--- /dev/null
+++ b/configs/30B_train/32768_intern_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 32768
+JOB_NAME = "30b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of the dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap, which only takes effect when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
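sp="intern" with intern_overlap=True prefetches sharded weights with asynchronous all-gathers so the communication hides behind computation. A simplified sketch of the pattern (torch.distributed; assumes an initialized process group and row-sharded weights; not the patch's actual implementation):

import torch
import torch.distributed as dist
import torch.nn.functional as F

def overlapped_linear(x, weight_shard):
    # Each rank holds out_features // world_size rows of the full weight.
    world = dist.get_world_size()
    full_weight = torch.empty(weight_shard.shape[0] * world, weight_shard.shape[1],
                              dtype=weight_shard.dtype, device=weight_shard.device)
    handle = dist.all_gather_into_tensor(full_weight, weight_shard, async_op=True)
    # ...independent work can run here, overlapping the all-gather...
    handle.wait()
    return F.linear(x, full_weight)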
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/32768_intern_ckpt_True.py b/configs/30B_train/32768_intern_ckpt_True.py
new file mode 100644
index 00000000..6dc24c30
--- /dev/null
+++ b/configs/30B_train/32768_intern_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 32768
+JOB_NAME = "30b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of the dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap, which only takes effect when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
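Because these generated configs vary only in the sp mode and flags, a small validation helper catches inconsistent combinations early; a sketch (the rules are inferred from the configs in this patch, not from library code):

VALID_SP_MODES = {"none", "megatron", "flash", "intern"}

def check_parallel(parallel):
    tensor = parallel["tensor"]
    assert tensor["sp"] in VALID_SP_MODES, f"unknown sp mode: {tensor['sp']}"
    if tensor.get("intern_overlap", False):
        # overlap only applies to the 'intern' sequence-parallel path
        assert tensor["sp"] == "intern", "intern_overlap=True requires sp='intern'"

check_parallel(dict(tensor=dict(size=8, sp="intern", intern_overlap=True)))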
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/32768_megatron_ckpt_False.py b/configs/30B_train/32768_megatron_ckpt_False.py
new file mode 100644
index 00000000..37fd0986
--- /dev/null
+++ b/configs/30B_train/32768_megatron_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 32768
+JOB_NAME = "30b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of the dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap, which only takes effect when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="megatron", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
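The grad_scaler dict above encodes a conventional dynamic loss-scaling policy; a compact simulation of the grow/backoff rules (values mirror the config, but the exact hysteresis bookkeeping is an assumption about the usual scheme, not InternLM's code):

scale = float(2**16)                     # initial_scale
good_steps = overflow_count = 0

def update_scale(overflow):
    global scale, good_steps, overflow_count
    if overflow:
        good_steps = 0
        overflow_count += 1
        if overflow_count >= 2:          # hysteresis: tolerate one isolated overflow
            scale = max(scale * 0.5, 1)  # backoff_factor, floored at min_scale
            overflow_count = 0
    else:
        good_steps += 1
        if good_steps >= 1000:           # growth_interval of stable steps
            scale = min(scale * 2, 2**24)  # growth_factor, capped at max_scale
            good_steps = 0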
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/32768_megatron_ckpt_True.py b/configs/30B_train/32768_megatron_ckpt_True.py
new file mode 100644
index 00000000..986b27dd
--- /dev/null
+++ b/configs/30B_train/32768_megatron_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 32768
+JOB_NAME = "30b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of the dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap, which only takes effect when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="megatron", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
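model.checkpoint above takes True/False or a float in [0, 1]; reading a float as the fraction of transformer layers to recompute gives, for example (sketch; the rounding rule is an assumption):

NUM_LAYER = 60

def checkpointed_layers(checkpoint):
    # True/False, or the fraction of layers run with activation checkpointing
    if isinstance(checkpoint, bool):
        return NUM_LAYER if checkpoint else 0
    return int(NUM_LAYER * checkpoint)

assert checkpointed_layers(True) == 60
assert checkpointed_layers(0.5) == 30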
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/32768_none_ckpt_False.py b/configs/30B_train/32768_none_ckpt_False.py
new file mode 100644
index 00000000..9c6ca879
--- /dev/null
+++ b/configs/30B_train/32768_none_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 32768
+JOB_NAME = "30b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of the dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap, which only takes effect when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="none", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
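The lr_scheduler above expresses warmup as a ratio of total_steps, so for this short 20-step smoke-test run the warmup phase rounds away entirely:

total_steps = 20
warmup_ratio = 0.01
warmup_steps = int(total_steps * warmup_ratio)  # int(0.2) == 0: no warmup at 20 steps
assert warmup_steps == 0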
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/32768_none_ckpt_True.py b/configs/30B_train/32768_none_ckpt_True.py
new file mode 100644
index 00000000..d4ab7f2d
--- /dev/null
+++ b/configs/30B_train/32768_none_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 32768
+JOB_NAME = "30b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of the dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap, which only takes effect when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="none", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
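MLP_RATIO = 8 / 3 sizes the gated-MLP hidden dimension relative to HIDDEN_SIZE; a common convention (assumed here, not stated in the patch) is to round the product up to a hardware-friendly multiple of 256:

HIDDEN_SIZE = 6144
MLP_RATIO = 8 / 3
mlp_hidden = int(HIDDEN_SIZE * MLP_RATIO)       # 16384 for this config
mlp_hidden = ((mlp_hidden + 255) // 256) * 256  # already a multiple of 256 here
assert mlp_hidden == 16384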
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/4096_flash_ckpt_False.py b/configs/30B_train/4096_flash_ckpt_False.py
new file mode 100644
index 00000000..3dd8be56
--- /dev/null
+++ b/configs/30B_train/4096_flash_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 4096
+JOB_NAME = "30b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of the dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap, which only takes effect when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/4096_flash_ckpt_True.py b/configs/30B_train/4096_flash_ckpt_True.py
new file mode 100644
index 00000000..73150acf
--- /dev/null
+++ b/configs/30B_train/4096_flash_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 4096
+JOB_NAME = "30b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of the dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of 'none', 'megatron', 'flash' or 'intern'.
+    3. intern_overlap: bool, enable/disable communication overlap, which only takes effect when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
 interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallelism, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/4096_intern_ckpt_False.py b/configs/30B_train/4096_intern_ckpt_False.py
new file mode 100644
index 00000000..cff6c5b6
--- /dev/null
+++ b/configs/30B_train/4096_intern_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 4096
+JOB_NAME = "30b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder=dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only the 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro_batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, one of 'none', 'megatron', 'flash' or 'intern',
+       as used by the configs in this directory.
+    3. intern_overlap: bool, enable/disable communication overlap, only effective when sp is 'intern'.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2.
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/4096_intern_ckpt_True.py b/configs/30B_train/4096_intern_ckpt_True.py new file mode 100644 index 00000000..1fb64257 --- /dev/null +++ b/configs/30B_train/4096_intern_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/4096_megatron_ckpt_False.py b/configs/30B_train/4096_megatron_ckpt_False.py new file mode 100644 index 00000000..79f718d0 --- /dev/null +++ b/configs/30B_train/4096_megatron_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
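+    # Note: async_upload only works for boto3 ckpt (see the comment above), and
+    # SAVE_CKPT_FOLDER in this file is a "local:" path, so the async upload settings are
+    # presumably inert here unless the save folder is switched to a boto3: URI.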
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/4096_megatron_ckpt_True.py b/configs/30B_train/4096_megatron_ckpt_True.py new file mode 100644 index 00000000..502ae7f7 --- /dev/null +++ b/configs/30B_train/4096_megatron_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/4096_none_ckpt_False.py b/configs/30B_train/4096_none_ckpt_False.py new file mode 100644 index 00000000..981a0f23 --- /dev/null +++ b/configs/30B_train/4096_none_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/4096_none_ckpt_True.py b/configs/30B_train/4096_none_ckpt_True.py new file mode 100644 index 00000000..dddea663 --- /dev/null +++ b/configs/30B_train/4096_none_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/65536_flash_ckpt_False.py b/configs/30B_train/65536_flash_ckpt_False.py new file mode 100644 index 00000000..babebd95 --- /dev/null +++ b/configs/30B_train/65536_flash_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/65536_flash_ckpt_True.py b/configs/30B_train/65536_flash_ckpt_True.py new file mode 100644 index 00000000..064250e7 --- /dev/null +++ b/configs/30B_train/65536_flash_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/65536_intern_ckpt_False.py b/configs/30B_train/65536_intern_ckpt_False.py new file mode 100644 index 00000000..64165f44 --- /dev/null +++ b/configs/30B_train/65536_intern_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/65536_intern_ckpt_True.py b/configs/30B_train/65536_intern_ckpt_True.py
new file mode 100644
index 00000000..78b66213
--- /dev/null
+++ b/configs/30B_train/65536_intern_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 65536
+JOB_NAME = "7b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded, supporting: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, e.g. the "internlm" type used here.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
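+    # A sketch of resuming the full training state, built from the 'content' values listed in the
+    # guide above (assumes LOAD_CKPT_FOLDER holds a checkpoint previously written by this framework):
+    # load_ckpt_info=dict(path=LOAD_CKPT_FOLDER, content=("all",), ckpt_type="internlm"),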
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/65536_megatron_ckpt_False.py b/configs/30B_train/65536_megatron_ckpt_False.py
new file mode 100644
index 00000000..e8c09548
--- /dev/null
+++ b/configs/30B_train/65536_megatron_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 65536
+JOB_NAME = "7b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded, supporting: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, e.g. the "internlm" type used here.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
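+    # A from-scratch run, per the note above, would disable resuming entirely:
+    # auto_resume=False,
+    # load_ckpt_info=None,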
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+sequence parallel (bool): enable/disable sequence parallel, defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="megatron", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/65536_megatron_ckpt_True.py b/configs/30B_train/65536_megatron_ckpt_True.py
new file mode 100644
index 00000000..d3b64c41
--- /dev/null
+++ b/configs/30B_train/65536_megatron_ckpt_True.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 65536
+JOB_NAME = "7b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded, supporting: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, e.g. the "internlm" type used here.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
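+    # Note: per the comment above, async_upload only takes effect for boto3-style checkpoint
+    # folders (see the commented boto3 block earlier); with the "local:" folder used here,
+    # checkpoints are written without asynchronous upload.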
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/65536_none_ckpt_False.py b/configs/30B_train/65536_none_ckpt_False.py new file mode 100644 index 00000000..ee4c7fb5 --- /dev/null +++ b/configs/30B_train/65536_none_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
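+    # Since enable_save_ckpt=False, this benchmark config never writes checkpoints; the
+    # checkpoint_every and oss_snapshot_freq settings in this dict only take effect once
+    # enable_save_ckpt is set to True.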
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/65536_none_ckpt_True.py b/configs/30B_train/65536_none_ckpt_True.py new file mode 100644 index 00000000..2e84144c --- /dev/null +++ b/configs/30B_train/65536_none_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/8192_flash_ckpt_False.py b/configs/30B_train/8192_flash_ckpt_False.py new file mode 100644 index 00000000..b9eb6e65 --- /dev/null +++ b/configs/30B_train/8192_flash_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 8192 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
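+    # Note: MODEL_ONLY_FOLDER above is a placeholder ("local:llm_ckpts/xxxx"); it must point
+    # at a real model checkpoint before load_ckpt_info can be used.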
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/8192_flash_ckpt_True.py b/configs/30B_train/8192_flash_ckpt_True.py new file mode 100644 index 00000000..c0dd5175 --- /dev/null +++ b/configs/30B_train/8192_flash_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 8192 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/8192_intern_ckpt_False.py b/configs/30B_train/8192_intern_ckpt_False.py new file mode 100644 index 00000000..d915b6b8 --- /dev/null +++ b/configs/30B_train/8192_intern_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 8192 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/8192_intern_ckpt_True.py b/configs/30B_train/8192_intern_ckpt_True.py new file mode 100644 index 00000000..a71693a1 --- /dev/null +++ b/configs/30B_train/8192_intern_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 8192 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/30B_train/8192_megatron_ckpt_False.py b/configs/30B_train/8192_megatron_ckpt_False.py
new file mode 100644
index 00000000..dcacb9e5
--- /dev/null
+++ b/configs/30B_train/8192_megatron_ckpt_False.py
@@ -0,0 +1,180 @@
+
+DO_ALERT = False
+
+SEQ_LEN = 8192
+JOB_NAME = "30b_train_" + str(SEQ_LEN)
+HIDDEN_SIZE = 6144
+NUM_ATTENTION_HEAD = 48
+MLP_RATIO = 8 / 3
+NUM_LAYER = 60
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/8192_megatron_ckpt_True.py b/configs/30B_train/8192_megatron_ckpt_True.py new file mode 100644 index 00000000..b6e4ba24 --- /dev/null +++ b/configs/30B_train/8192_megatron_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 8192 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/8192_none_ckpt_False.py b/configs/30B_train/8192_none_ckpt_False.py new file mode 100644 index 00000000..ce790dfa --- /dev/null +++ b/configs/30B_train/8192_none_ckpt_False.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 8192 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/30B_train/8192_none_ckpt_True.py b/configs/30B_train/8192_none_ckpt_True.py new file mode 100644 index 00000000..e6afcd4e --- /dev/null +++ b/configs/30B_train/8192_none_ckpt_True.py @@ -0,0 +1,180 @@ + +DO_ALERT = False + +SEQ_LEN = 8192 +JOB_NAME = "7b_train_" + str(SEQ_LEN) +HIDDEN_SIZE = 6144 +NUM_ATTENTION_HEAD = 48 +MLP_RATIO = 8 / 3 +NUM_LAYER = 60 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', + the sequence_parallel should be True. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +sequence parallel (bool): enable/disable sequence parallel, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index c51c8129..4f482656 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -49,14 +49,14 @@ data = dict( seq_len=SEQ_LEN, # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, + micro_num=1, # packed_length = micro_bsz * SEQ_LEN - micro_bsz=4, + micro_bsz=1, # defaults to the value of micro_num valid_micro_num=4, # defaults to 0, means disable evaluate valid_every=50, - pack_sample_into_one=False, + pack_sample_into_one=True, total_steps=20, skip_batches="", rampup_batch_size="", @@ -64,7 +64,7 @@ min_length=50, # train_folder=TRAIN_FOLDER, # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, + empty_cache_and_diag_interval=100, diag_outlier_ratio=1.1, ) @@ -90,7 +90,7 @@ hybrid_zero_optimizer = dict( # Enable low_level_optimzer overlap_communication overlap_sync_grad=True, - overlap_sync_param=True, + overlap_sync_param=False, # bucket size for nccl communication params reduce_bucket_size=512 * 1024 * 1024, # grad clipping @@ -163,7 +163,7 @@ """ parallel = dict( zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), + tensor=dict(size=8, sp="none", intern_overlap=False), pipeline=dict(size=1, interleaved_overlap=True), ) diff --git a/configs/7B_template.py b/configs/7B_template.py new file mode 100644 index 00000000..b9f76a51 --- /dev/null +++ b/configs/7B_template.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = {seq_len} +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({checkpoint}) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
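+    # e.g. to resume only the model weights from the step-49 folder defined above, one could set
+    # (illustrative values, mirroring the actual line below):
+    #     load_ckpt_info=dict(path=LOAD_CKPT_FOLDER, content=("model",), ckpt_type="internlm")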
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint={checkpoint}, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", 
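+    # norm_type="rmsnorm" selects RMSNorm (no mean-centering, scale-only normalization);
+    # layer_norm_epsilon below is the eps added to the variance for numerical stability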
+ layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp={sp}, intern_overlap={intern_overlap}), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/131072_flash-attn_ckpt_False.py b/configs/7B_train/131072_flash-attn_ckpt_False.py new file mode 100644 index 00000000..047fb372 --- /dev/null +++ b/configs/7B_train/131072_flash-attn_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 131072 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
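+    # NOTE: this file appears to be generated from configs/7B_template.py by filling in the
+    # {seq_len}/{sp}/{checkpoint}/{intern_overlap} placeholders; see that template for the full option guide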
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
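+    # use_flash_attn below is what makes SEQ_LEN = 131072 practical here: flash-attn kernels avoid
+    # materializing the full attention matrix, keeping attention memory linear in sequence length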
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash-attn", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/131072_flash-attn_ckpt_True.py b/configs/7B_train/131072_flash-attn_ckpt_True.py new file mode 100644 index 00000000..763627d6 --- /dev/null +++ b/configs/7B_train/131072_flash-attn_ckpt_True.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 131072 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
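+    # checkpoint=True above recomputes all layers' activations during backward, trading extra
+    # compute for a large activation-memory saving at this sequence length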
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash-attn", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/131072_flash_ckpt_False.py b/configs/7B_train/131072_flash_ckpt_False.py new file mode 100644 index 00000000..4307e9d1 --- /dev/null +++ b/configs/7B_train/131072_flash_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 131072 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
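+    # with bfloat16 (dtype above), dynamic loss scaling is generally unnecessary since bf16 has the
+    # same exponent range as fp32; the grad_scaler settings mainly matter for float16 runs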
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/131072_flash_ckpt_True.py b/configs/7B_train/131072_flash_ckpt_True.py new file mode 100644 index 00000000..c110b256 --- /dev/null +++ b/configs/7B_train/131072_flash_ckpt_True.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 131072 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(True) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+       defaults to 'none', which disables sequence parallelism.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+       defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+       defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/131072_intern_ckpt_False.py b/configs/7B_train/131072_intern_ckpt_False.py
new file mode 100644
index 00000000..1d728be7
--- /dev/null
+++ b/configs/7B_train/131072_intern_ckpt_False.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 131072
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder=dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded ('internlm' here).
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+       defaults to 'none', which disables sequence parallelism.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+       defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+       defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/131072_intern_ckpt_True.py b/configs/7B_train/131072_intern_ckpt_True.py
new file mode 100644
index 00000000..45d4aa01
--- /dev/null
+++ b/configs/7B_train/131072_intern_ckpt_True.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 131072
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder=dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded ('internlm' here).
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+       defaults to 'none', which disables sequence parallelism.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+       defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+       defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/131072_megatron_ckpt_False.py b/configs/7B_train/131072_megatron_ckpt_False.py
new file mode 100644
index 00000000..0bd98459
--- /dev/null
+++ b/configs/7B_train/131072_megatron_ckpt_False.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 131072
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder=dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded ('internlm' here).
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+       defaults to 'none', which disables sequence parallelism.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+       defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+       defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="megatron", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/131072_megatron_ckpt_True.py b/configs/7B_train/131072_megatron_ckpt_True.py
new file mode 100644
index 00000000..9200afbe
--- /dev/null
+++ b/configs/7B_train/131072_megatron_ckpt_True.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 131072
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder=dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded ('internlm' here).
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+       defaults to 'none', which disables sequence parallelism.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+       defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+       defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="megatron", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/131072_none_ckpt_False.py b/configs/7B_train/131072_none_ckpt_False.py
new file mode 100644
index 00000000..16059fb1
--- /dev/null
+++ b/configs/7B_train/131072_none_ckpt_False.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 131072
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder=dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded ('internlm' here).
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+       defaults to 'none', which disables sequence parallelism.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+       defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+       defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="none", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/131072_none_ckpt_True.py b/configs/7B_train/131072_none_ckpt_True.py
new file mode 100644
index 00000000..35b3f08e
--- /dev/null
+++ b/configs/7B_train/131072_none_ckpt_True.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 131072
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder=dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded ('internlm' here).
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+       defaults to 'none', which disables sequence parallelism.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+       defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+       defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="none", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/16384_flash-attn_ckpt_False.py b/configs/7B_train/16384_flash-attn_ckpt_False.py
new file mode 100644
index 00000000..53a64b99
--- /dev/null
+++ b/configs/7B_train/16384_flash-attn_ckpt_False.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 16384
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder=dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded ('internlm' here).
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+       defaults to 'none', which disables sequence parallelism.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+       defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+       defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash-attn", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None, # feishu webhook to send alert message
+        light_monitor_address=None, # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/16384_flash-attn_ckpt_True.py b/configs/7B_train/16384_flash-attn_ckpt_True.py
new file mode 100644
index 00000000..cdb051e5
--- /dev/null
+++ b/configs/7B_train/16384_flash-attn_ckpt_True.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 16384
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False, # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+    # load_ckpt_folder=dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded ('internlm' here).
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0, # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+            For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+        defaults to 'none', which disables sequence parallelism.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+        defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash-attn", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/16384_flash_ckpt_False.py b/configs/7B_train/16384_flash_ckpt_False.py
new file mode 100644
index 00000000..41b39515
--- /dev/null
+++ b/configs/7B_train/16384_flash_ckpt_False.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 16384
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash") + "_" + str(False)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro_batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Supported: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
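+    # Editorial sketch of how the checkpoint value above is commonly interpreted,
+    # assuming a float in [0, 1] selects a fraction of layers and True/False act
+    # like 1.0/0.0; the exact rule lives in the model code:
+    # ckpt_ratio = float(model["checkpoint"])        # False -> 0.0, True -> 1.0
+    # num_ckpt_layers = int(NUM_LAYER * ckpt_ratio)  # layers recomputed in backward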
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+            For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+        defaults to 'none', which disables sequence parallelism.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+        defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/16384_flash_ckpt_True.py b/configs/7B_train/16384_flash_ckpt_True.py
new file mode 100644
index 00000000..ca2c7f06
--- /dev/null
+++ b/configs/7B_train/16384_flash_ckpt_True.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 16384
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash") + "_" + str(True)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
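+    # Editorial sketch of the dynamic loss-scaling rule that the grad_scaler block
+    # earlier in this file parameterizes (illustrative only, not the actual
+    # implementation; variable names are hypothetical):
+    # if overflow:
+    #     bad_steps += 1
+    #     if bad_steps >= hysteresis:  # tolerate a few overflows before backing off
+    #         scale = max(scale * backoff_factor, min_scale)
+    #         bad_steps, good_steps = 0, 0
+    # else:
+    #     good_steps += 1
+    #     if good_steps % growth_interval == 0:
+    #         scale = min(scale * growth_factor, max_scale)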
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/16384_intern_ckpt_False.py b/configs/7B_train/16384_intern_ckpt_False.py new file mode 100644 index 00000000..93abb682 --- /dev/null +++ b/configs/7B_train/16384_intern_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 16384 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
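+    # Editorial sketch of the schedule implied by the lr_scheduler block above:
+    # linear warmup over warmup_ratio * total_steps, then cosine decay to eta_min
+    # (assumed shape; the exact curve is defined by InternLM's scheduler class):
+    # import math
+    # warmup_steps = int(lr_scheduler["total_steps"] * lr_scheduler["warmup_ratio"])
+    # if step < warmup_steps:
+    #     lr = adam["lr"] * (step + 1) / warmup_steps
+    # else:
+    #     t = (step - warmup_steps) / max(1, lr_scheduler["total_steps"] - warmup_steps)
+    #     lr = lr_scheduler["eta_min"] + (adam["lr"] - lr_scheduler["eta_min"]) * 0.5 * (1 + math.cos(math.pi * t))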
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/16384_intern_ckpt_True.py b/configs/7B_train/16384_intern_ckpt_True.py new file mode 100644 index 00000000..af9d9945 --- /dev/null +++ b/configs/7B_train/16384_intern_ckpt_True.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 16384 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
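+    # Arithmetic implied by the data block in this file: tokens per rank per
+    # optimizer step = micro_num * micro_bsz * SEQ_LEN = 1 * 1 * 16384, so the
+    # global batch is 16384 * dp_world_size tokens per step.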
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/16384_megatron_ckpt_False.py b/configs/7B_train/16384_megatron_ckpt_False.py new file mode 100644 index 00000000..d2c58d3a --- /dev/null +++ b/configs/7B_train/16384_megatron_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 16384 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
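+    # Editorial sketch of the feed-forward width implied by mlp_ratio above and
+    # MLP_RATIO = 8 / 3 at the top of this file, assuming LLaMA-style rounding up
+    # to a multiple of 256 (the actual rounding rule is defined by the model code):
+    # multiple_of = 256
+    # ffn = int(HIDDEN_SIZE * MLP_RATIO)  # 4096 * 8 / 3 -> 10922
+    # ffn = multiple_of * ((ffn + multiple_of - 1) // multiple_of)  # -> 11008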
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/16384_megatron_ckpt_True.py b/configs/7B_train/16384_megatron_ckpt_True.py new file mode 100644 index 00000000..6e372b8c --- /dev/null +++ b/configs/7B_train/16384_megatron_ckpt_True.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 16384 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
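+    # The auto_resume comments near the top of this file reduce to the following
+    # decision order (editorial sketch; latest_ckpt()/load() are hypothetical names):
+    # if ckpt["auto_resume"] and latest_ckpt(SAVE_CKPT_FOLDER) is not None:
+    #     load(latest_ckpt(SAVE_CKPT_FOLDER))   # resume after an interruption
+    # elif not ckpt["auto_resume"] and ckpt["load_ckpt_info"] is not None:
+    #     load(ckpt["load_ckpt_info"]["path"])  # warm start from another model
+    # else:
+    #     pass                                  # train from scratch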
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/16384_none_ckpt_False.py b/configs/7B_train/16384_none_ckpt_False.py new file mode 100644 index 00000000..0fd65900 --- /dev/null +++ b/configs/7B_train/16384_none_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 16384 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
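+    # Editorial summary of the four sp modes documented in the parallel docstring
+    # below (a hypothetical dispatch; only the mode names and the intern_overlap
+    # flag come from the config docstring itself):
+    # sp == "none":       tensor parallelism only, activations kept whole
+    # sp == "megatron":   Megatron-LM style sequence parallelism around the linears
+    # sp == "flash-attn": flash-attn's sequence-parallel fused dense layers
+    # sp == "intern":     weights all-gathered per layer, with the all_gather and
+    #                     reduce_scatter overlapped with compute when intern_overlap=True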
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/16384_none_ckpt_True.py b/configs/7B_train/16384_none_ckpt_True.py new file mode 100644 index 00000000..6ea5e1a9 --- /dev/null +++ b/configs/7B_train/16384_none_ckpt_True.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 16384 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/262144_flash-attn_ckpt_False.py b/configs/7B_train/262144_flash-attn_ckpt_False.py new file mode 100644 index 00000000..6dad9730 --- /dev/null +++ b/configs/7B_train/262144_flash-attn_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 262144 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; valid values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
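+    # Worked example from the constants above (the rounding behaviour is an
+    # assumption, not spelled out in this config):
+    #     per-head dim = HIDDEN_SIZE / NUM_ATTENTION_HEAD = 4096 / 32 = 128
+    #     ffn width   ~= HIDDEN_SIZE * MLP_RATIO = 4096 * 8 / 3 ~= 10922.7,
+    # which the model implementation rounds up to a hardware-friendly multiple.
+    # With tensor size 8 (see `parallel` below), each rank would compute
+    # 32 / 8 = 4 attention heads under the usual Megatron-style head split
+    # (again an assumption about the split rule).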
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+       defaults to 'none', meaning sequence parallel is disabled.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+       defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+       defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash-attn", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/262144_flash-attn_ckpt_True.py b/configs/7B_train/262144_flash-attn_ckpt_True.py
new file mode 100644
index 00000000..cacd9737
--- /dev/null
+++ b/configs/7B_train/262144_flash-attn_ckpt_True.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 262144
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash-attn", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/262144_flash_ckpt_False.py b/configs/7B_train/262144_flash_ckpt_False.py new file mode 100644 index 00000000..0e9b0173 --- /dev/null +++ b/configs/7B_train/262144_flash_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 262144 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
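+    # Common recipes implied by the comments below (illustrative only, not new settings):
+    #     resume interrupted training:  auto_resume=True  (latest ckpt in save_ckpt_folder wins)
+    #     fine-tune from other weights: auto_resume=False plus load_ckpt_info pointing at them
+    #     train from scratch:           auto_resume=False plus load_ckpt_info=None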
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/262144_flash_ckpt_True.py b/configs/7B_train/262144_flash_ckpt_True.py new file mode 100644 index 00000000..ddacc8df --- /dev/null +++ b/configs/7B_train/262144_flash_ckpt_True.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 262144 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(True) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/262144_intern_ckpt_False.py b/configs/7B_train/262144_intern_ckpt_False.py new file mode 100644 index 00000000..e5cf7694 --- /dev/null +++ b/configs/7B_train/262144_intern_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 262144 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/262144_intern_ckpt_True.py b/configs/7B_train/262144_intern_ckpt_True.py new file mode 100644 index 00000000..76f9386a --- /dev/null +++ b/configs/7B_train/262144_intern_ckpt_True.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 262144 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/262144_megatron_ckpt_False.py b/configs/7B_train/262144_megatron_ckpt_False.py new file mode 100644 index 00000000..b929f9a6 --- /dev/null +++ b/configs/7B_train/262144_megatron_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 262144 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/262144_megatron_ckpt_True.py b/configs/7B_train/262144_megatron_ckpt_True.py new file mode 100644 index 00000000..1655631c --- /dev/null +++ b/configs/7B_train/262144_megatron_ckpt_True.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 262144 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/262144_none_ckpt_False.py b/configs/7B_train/262144_none_ckpt_False.py new file mode 100644 index 00000000..85512f07 --- /dev/null +++ b/configs/7B_train/262144_none_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 262144 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+       defaults to 'none', which means sequence parallelism is disabled.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using the 'intern' sp mode,
+       defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+       defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="none", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/262144_none_ckpt_True.py b/configs/7B_train/262144_none_ckpt_True.py
new file mode 100644
index 00000000..fef559bd
--- /dev/null
+++ b/configs/7B_train/262144_none_ckpt_True.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 262144
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder=dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+       defaults to 'none', which means sequence parallelism is disabled.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using the 'intern' sp mode,
+       defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+       defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="none", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/32768_flash-attn_ckpt_False.py b/configs/7B_train/32768_flash-attn_ckpt_False.py
new file mode 100644
index 00000000..f2664be8
--- /dev/null
+++ b/configs/7B_train/32768_flash-attn_ckpt_False.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 32768
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder=dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+       defaults to 'none', which means sequence parallelism is disabled.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using the 'intern' sp mode,
+       defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+       defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash-attn", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/32768_flash-attn_ckpt_True.py b/configs/7B_train/32768_flash-attn_ckpt_True.py
new file mode 100644
index 00000000..232b5904
--- /dev/null
+++ b/configs/7B_train/32768_flash-attn_ckpt_True.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 32768
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder=dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
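+    # A worked example for the `checkpoint` field above, assuming the [0-1]
+    # proportion semantics documented in its comment: checkpoint=0.5 would
+    # recompute activations for roughly half of the 32 layers instead of all of them.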
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+       defaults to 'none', which means sequence parallelism is disabled.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using the 'intern' sp mode,
+       defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+       defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash-attn", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/32768_flash_ckpt_False.py b/configs/7B_train/32768_flash_ckpt_False.py
new file mode 100644
index 00000000..878b9ac1
--- /dev/null
+++ b/configs/7B_train/32768_flash_ckpt_False.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 32768
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash") + "_" + str(False)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder=dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+       defaults to 'none', which means sequence parallelism is disabled.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using the 'intern' sp mode,
+       defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+       defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/32768_flash_ckpt_True.py b/configs/7B_train/32768_flash_ckpt_True.py
new file mode 100644
index 00000000..27cffd02
--- /dev/null
+++ b/configs/7B_train/32768_flash_ckpt_True.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 32768
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash") + "_" + str(True)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder=dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+       defaults to 'none', which means sequence parallelism is disabled.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using the 'intern' sp mode,
+       defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+       defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/32768_intern_ckpt_False.py b/configs/7B_train/32768_intern_ckpt_False.py
new file mode 100644
index 00000000..fcf84197
--- /dev/null
+++ b/configs/7B_train/32768_intern_ckpt_False.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 32768
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder=dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
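+    # Worked values for this file's constants: with CHECKPOINT_EVERY = 50,
+    # `checkpoint_every` below saves a full ckpt every 50 steps (once
+    # enable_save_ckpt is turned on), and oss_snapshot_freq = int(50 / 2) = 25
+    # takes an OSS snapshot every 25 steps.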
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+       defaults to 'none', which means sequence parallelism is disabled.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using the 'intern' sp mode,
+       defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+       defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/32768_intern_ckpt_True.py b/configs/7B_train/32768_intern_ckpt_True.py
new file mode 100644
index 00000000..aec2b68b
--- /dev/null
+++ b/configs/7B_train/32768_intern_ckpt_True.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 32768
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder=dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+       defaults to 'none', which means sequence parallelism is disabled.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using the 'intern' sp mode,
+       defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+       defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/32768_megatron_ckpt_False.py b/configs/7B_train/32768_megatron_ckpt_False.py
new file mode 100644
index 00000000..64caeeb5
--- /dev/null
+++ b/configs/7B_train/32768_megatron_ckpt_False.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 32768
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder=dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro-batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which means evaluation is disabled
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+          so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallel.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+       defaults to 'none', which means sequence parallelism is disabled.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using the 'intern' sp mode,
+       defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
+       defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="megatron", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/32768_megatron_ckpt_True.py b/configs/7B_train/32768_megatron_ckpt_True.py
new file mode 100644
index 00000000..a736e7d0
--- /dev/null
+++ b/configs/7B_train/32768_megatron_ckpt_True.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 32768
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder=dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/32768_none_ckpt_False.py b/configs/7B_train/32768_none_ckpt_False.py new file mode 100644 index 00000000..3a31776e --- /dev/null +++ b/configs/7B_train/32768_none_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 32768 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
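+ # Token-count sanity check for the data dict further down (illustrative): with
+ # micro_bsz = 1 and SEQ_LEN = 32768, packed_length = micro_bsz * SEQ_LEN = 32768,
+ # and micro_num = 1 means each optimizer step consumes 32768 tokens per
+ # data-parallel rank; the global batch then scales with the dp world size.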
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/32768_none_ckpt_True.py b/configs/7B_train/32768_none_ckpt_True.py new file mode 100644 index 00000000..4ac09249 --- /dev/null +++ b/configs/7B_train/32768_none_ckpt_True.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 32768 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
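+ # Note on model.checkpoint further down (a sketch; assuming a fractional value
+ # is applied as int(checkpoint * num_layers)): checkpoint=True recomputes
+ # activations for all 32 layers in backward, checkpoint=0.5 would cover
+ # int(0.5 * 32) == 16 layers, and checkpoint=False keeps everything resident.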
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/4096_flash-attn_ckpt_False.py b/configs/7B_train/4096_flash-attn_ckpt_False.py new file mode 100644 index 00000000..b3de8990 --- /dev/null +++ b/configs/7B_train/4096_flash-attn_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
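+ # Summary of the grad_scaler dict further down, restating its own comments: the
+ # fp16 loss scale starts at 2**16, doubles (growth_factor=2) after every 1000
+ # overflow-free steps up to max_scale=2**24, and is halved (backoff_factor=0.5)
+ # once hysteresis=2 overflows accumulate, never dropping below min_scale=1.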
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash-attn", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/4096_flash-attn_ckpt_True.py b/configs/7B_train/4096_flash-attn_ckpt_True.py new file mode 100644 index 00000000..b44b103f --- /dev/null +++ b/configs/7B_train/4096_flash-attn_ckpt_True.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
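+ # Sharding example for the parallel dict further down (hypothetical 64-GPU job):
+ # tensor.size=8 leaves a dp group of 64 / 8 = 8 ranks, and zero1.size=-1 makes
+ # the ZeRO-1 group span all 8, so each rank keeps 1/8 of the optimizer states
+ # while the model parameters themselves stay fully replicated across dp.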
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash-attn", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/4096_flash_ckpt_False.py b/configs/7B_train/4096_flash_ckpt_False.py new file mode 100644 index 00000000..8ac542d6 --- /dev/null +++ b/configs/7B_train/4096_flash_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
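+ # Warmup arithmetic for the lr_scheduler further down (illustrative, assuming
+ # warmup steps are derived as int(warmup_ratio * total_steps)): 0.01 * 20 rounds
+ # down to 0, so this short 20-step profile effectively starts directly on the
+ # cosine decay from lr=1e-4 toward eta_min=1e-5.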
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/4096_flash_ckpt_True.py b/configs/7B_train/4096_flash_ckpt_True.py new file mode 100644 index 00000000..ec477f68 --- /dev/null +++ b/configs/7B_train/4096_flash_ckpt_True.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(True) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
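+ # Note on the beta2_scheduler further down: it is seeded with init_beta2=0.95
+ # and c=adam_beta2_c=0; with c left at 0 the schedule is presumably inert, so
+ # Adam's beta2 stays constant at 0.95 for the whole run.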
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/4096_intern_ckpt_False.py b/configs/7B_train/4096_intern_ckpt_False.py new file mode 100644 index 00000000..f16f95ad --- /dev/null +++ b/configs/7B_train/4096_intern_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
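+ # Sequence-split arithmetic for the parallel dict further down (applies when an
+ # sp mode other than 'none' actually splits along the sequence dim): with
+ # tensor.size=8 and SEQ_LEN=4096, each rank holds 4096 / 8 = 512 tokens in the
+ # sequence-parallel regions, roughly an 8x cut in those activations.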
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/4096_intern_ckpt_True.py b/configs/7B_train/4096_intern_ckpt_True.py new file mode 100644 index 00000000..90fed7c8 --- /dev/null +++ b/configs/7B_train/4096_intern_ckpt_True.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
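+ # Note on intern_overlap=True in the parallel dict further down: per the
+ # docstring, the 'intern' sp mode can overlap its all_gather/reduce_scatter
+ # traffic with computation, presumably hiding the communication behind the
+ # linear-layer matmuls instead of serializing it.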
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/4096_megatron_ckpt_False.py b/configs/7B_train/4096_megatron_ckpt_False.py new file mode 100644 index 00000000..ca41fa28 --- /dev/null +++ b/configs/7B_train/4096_megatron_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/4096_megatron_ckpt_True.py b/configs/7B_train/4096_megatron_ckpt_True.py new file mode 100644 index 00000000..45183156 --- /dev/null +++ b/configs/7B_train/4096_megatron_ckpt_True.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/4096_none_ckpt_False.py b/configs/7B_train/4096_none_ckpt_False.py new file mode 100644 index 00000000..c81bb5b9 --- /dev/null +++ b/configs/7B_train/4096_none_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/4096_none_ckpt_True.py b/configs/7B_train/4096_none_ckpt_True.py new file mode 100644 index 00000000..a25d222f --- /dev/null +++ b/configs/7B_train/4096_none_ckpt_True.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 4096 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/65536_flash-attn_ckpt_False.py b/configs/7B_train/65536_flash-attn_ckpt_False.py new file mode 100644 index 00000000..3d5a81eb --- /dev/null +++ b/configs/7B_train/65536_flash-attn_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash-attn", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/65536_flash-attn_ckpt_True.py b/configs/7B_train/65536_flash-attn_ckpt_True.py new file mode 100644 index 00000000..c6982c98 --- /dev/null +++ b/configs/7B_train/65536_flash-attn_ckpt_True.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash-attn", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/65536_flash_ckpt_False.py b/configs/7B_train/65536_flash_ckpt_False.py new file mode 100644 index 00000000..0cfea813 --- /dev/null +++ b/configs/7B_train/65536_flash_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/65536_flash_ckpt_True.py b/configs/7B_train/65536_flash_ckpt_True.py new file mode 100644 index 00000000..abdeb49d --- /dev/null +++ b/configs/7B_train/65536_flash_ckpt_True.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(True) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/65536_intern_ckpt_False.py b/configs/7B_train/65536_intern_ckpt_False.py new file mode 100644 index 00000000..2e0b27e1 --- /dev/null +++ b/configs/7B_train/65536_intern_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+ # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+ # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+ # with an automatic restart mechanism upon training reboot.
+ # Please note that `auto_resume` defaults to True; while it is True, the checkpoint path
+ # specified in `load_ckpt_info` will not be loaded.
+ # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+ # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+ auto_resume=True,
+ checkpoint_every=CHECKPOINT_EVERY,
+ async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+ async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+ oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+ seq_len=SEQ_LEN,
+ # micro_num means the number of micro_batches contained in one gradient update
+ micro_num=1,
+ # packed_length = micro_bsz * SEQ_LEN
+ micro_bsz=1,
+ # defaults to the value of micro_num
+ valid_micro_num=4,
+ # defaults to 0, which disables evaluation
+ valid_every=50,
+ pack_sample_into_one=True,
+ total_steps=20,
+ skip_batches="",
+ rampup_batch_size="",
+ # Datasets with fewer than 50 rows will be discarded
+ min_length=50,
+ # train_folder=TRAIN_FOLDER,
+ # valid_folder=VALID_FOLDER,
+ empty_cache_and_diag_interval=100,
+ diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+ fp16=dict(
+ # the initial loss scale, defaults to 2**16
+ initial_scale=2**16,
+ # the minimum loss scale, defaults to None
+ min_scale=1,
+ # the number of steps to increase loss scale when no overflow occurs
+ growth_interval=1000,
+ ),
+ # the multiplication factor for increasing loss scale, defaults to 2
+ growth_factor=2,
+ # the multiplication factor for decreasing loss scale, defaults to 0.5
+ backoff_factor=0.5,
+ # the maximum loss scale, defaults to None
+ max_scale=2**24,
+ # the number of overflows before decreasing loss scale, defaults to 2
+ hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+ # Enable low_level_optimizer overlap_communication
+ overlap_sync_grad=True,
+ overlap_sync_param=False,
+ # bucket size for nccl communication params
+ reduce_bucket_size=512 * 1024 * 1024,
+ # grad clipping
+ clip_grad_norm=1.0,
+)
+
+loss = dict(
+ label_smoothing=0,
+)
+
+adam = dict(
+ lr=1e-4,
+ adam_beta1=0.9,
+ adam_beta2=0.95,
+ adam_beta2_c=0,
+ adam_eps=1e-8,
+ weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+ total_steps=data["total_steps"],
+ init_steps=0, # optimizer_warmup_step
+ warmup_ratio=0.01,
+ eta_min=1e-5,
+ last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+ init_beta2=adam["adam_beta2"],
+ c=adam["adam_beta2_c"],
+ cur_iter=-1,
+)
+
+model = dict(
+ checkpoint=False, # The proportion of layers for activation checkpointing; valid values are True/False/[0-1]
+ num_attention_heads=NUM_ATTENTION_HEAD,
+ embed_split_hidden=True,
+ vocab_size=VOCAB_SIZE,
+ embed_grad_scale=1,
+ parallel_output=True,
+ hidden_size=HIDDEN_SIZE,
+ num_layers=NUM_LAYER,
+ mlp_ratio=MLP_RATIO,
+ apply_post_layer_norm=False,
+ dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+ norm_type="rmsnorm",
+ layer_norm_epsilon=1e-5,
+ use_flash_attn=True,
+ num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+ 1. size: int
+ * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+ so parameters will be divided within the range of dp.
+ * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+ * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+ For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+ 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+ 1. size: int, the tensor parallel size.
+ 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+ defaults to 'none', which disables sequence parallelism.
+ 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+ defaults to False.
+pipeline parallel (dict):
+ 1. size: int, the pipeline parallel size.
+ 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+ defaults to False.
+"""
+parallel = dict(
+ zero1=dict(size=-1, fsdp=False),
+ tensor=dict(size=8, sp="intern", intern_overlap=True),
+ pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+ # feishu alert configs
+ alert=dict(
+ enable_feishu_alert=DO_ALERT,
+ feishu_alert_address=None, # feishu webhook to send alert message
+ light_monitor_address=None, # light_monitor address to send heartbeat
+ ),
+)
diff --git a/configs/7B_train/65536_intern_ckpt_True.py b/configs/7B_train/65536_intern_ckpt_True.py
new file mode 100644
index 00000000..d1a8de7c
--- /dev/null
+++ b/configs/7B_train/65536_intern_ckpt_True.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 65536
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+ enable_save_ckpt=False, # enable ckpt save.
+ save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+ # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+ load_ckpt_folder="local:llm_ckpts/",
+ # 'load_ckpt_info' setting guide:
+ # 1. the 'path' indicates the ckpt path,
+ # 2. the 'content' means what states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
+ # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
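+ # Per the `auto_resume` notes below: to train from scratch, set auto_resume=False and
+ # load_ckpt_info=None, so neither a saved checkpoint nor external weights are loaded.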
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+ # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+ # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+ # with an automatic restart mechanism upon training reboot.
+ # Please note that `auto_resume` defaults to True; while it is True, the checkpoint path
+ # specified in `load_ckpt_info` will not be loaded.
+ # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+ # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+ auto_resume=True,
+ checkpoint_every=CHECKPOINT_EVERY,
+ async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
+ async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
+ oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+ seq_len=SEQ_LEN,
+ # micro_num means the number of micro_batches contained in one gradient update
+ micro_num=1,
+ # packed_length = micro_bsz * SEQ_LEN
+ micro_bsz=1,
+ # defaults to the value of micro_num
+ valid_micro_num=4,
+ # defaults to 0, which disables evaluation
+ valid_every=50,
+ pack_sample_into_one=True,
+ total_steps=20,
+ skip_batches="",
+ rampup_batch_size="",
+ # Datasets with fewer than 50 rows will be discarded
+ min_length=50,
+ # train_folder=TRAIN_FOLDER,
+ # valid_folder=VALID_FOLDER,
+ empty_cache_and_diag_interval=100,
+ diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+ fp16=dict(
+ # the initial loss scale, defaults to 2**16
+ initial_scale=2**16,
+ # the minimum loss scale, defaults to None
+ min_scale=1,
+ # the number of steps to increase loss scale when no overflow occurs
+ growth_interval=1000,
+ ),
+ # the multiplication factor for increasing loss scale, defaults to 2
+ growth_factor=2,
+ # the multiplication factor for decreasing loss scale, defaults to 0.5
+ backoff_factor=0.5,
+ # the maximum loss scale, defaults to None
+ max_scale=2**24,
+ # the number of overflows before decreasing loss scale, defaults to 2
+ hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+ # Enable low_level_optimizer overlap_communication
+ overlap_sync_grad=True,
+ overlap_sync_param=False,
+ # bucket size for nccl communication params
+ reduce_bucket_size=512 * 1024 * 1024,
+ # grad clipping
+ clip_grad_norm=1.0,
+)
+
+loss = dict(
+ label_smoothing=0,
+)
+
+adam = dict(
+ lr=1e-4,
+ adam_beta1=0.9,
+ adam_beta2=0.95,
+ adam_beta2_c=0,
+ adam_eps=1e-8,
+ weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+ total_steps=data["total_steps"],
+ init_steps=0, # optimizer_warmup_step
+ warmup_ratio=0.01,
+ eta_min=1e-5,
+ last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+ init_beta2=adam["adam_beta2"],
+ c=adam["adam_beta2_c"],
+ cur_iter=-1,
+)
+
+model = dict(
+ checkpoint=True, # The proportion of layers for activation checkpointing; valid values are True/False/[0-1]
+ num_attention_heads=NUM_ATTENTION_HEAD,
+ embed_split_hidden=True,
+ vocab_size=VOCAB_SIZE,
+ embed_grad_scale=1,
+ parallel_output=True,
+ hidden_size=HIDDEN_SIZE,
+ num_layers=NUM_LAYER,
+ mlp_ratio=MLP_RATIO,
+ apply_post_layer_norm=False,
+ dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+ norm_type="rmsnorm",
+ layer_norm_epsilon=1e-5,
+ use_flash_attn=True,
+ num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+ 1. size: int
+ * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+ so parameters will be divided within the range of dp.
+ * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+ * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+ For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+ 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+ 1. size: int, the tensor parallel size.
+ 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+ defaults to 'none', which disables sequence parallelism.
+ 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+ defaults to False.
+pipeline parallel (dict):
+ 1. size: int, the pipeline parallel size.
+ 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+ defaults to False.
+"""
+parallel = dict(
+ zero1=dict(size=-1, fsdp=False),
+ tensor=dict(size=8, sp="intern", intern_overlap=True),
+ pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+ # feishu alert configs
+ alert=dict(
+ enable_feishu_alert=DO_ALERT,
+ feishu_alert_address=None, # feishu webhook to send alert message
+ light_monitor_address=None, # light_monitor address to send heartbeat
+ ),
+)
diff --git a/configs/7B_train/65536_megatron_ckpt_False.py b/configs/7B_train/65536_megatron_ckpt_False.py
new file mode 100644
index 00000000..7de7b92d
--- /dev/null
+++ b/configs/7B_train/65536_megatron_ckpt_False.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 65536
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+ enable_save_ckpt=False, # enable ckpt save.
+ save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
+ # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+ load_ckpt_folder="local:llm_ckpts/",
+ # 'load_ckpt_info' setting guide:
+ # 1. the 'path' indicates the ckpt path,
+ # 2. the 'content' means what states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
+ # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
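+ # (Reference for the grad_scaler below: the loss scale starts at 2**16, is multiplied by
+ # growth_factor=2 after every 1000 overflow-free steps up to max_scale=2**24, and is
+ # multiplied by backoff_factor=0.5 once hysteresis=2 overflows have occurred.)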
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/65536_megatron_ckpt_True.py b/configs/7B_train/65536_megatron_ckpt_True.py new file mode 100644 index 00000000..b339c833 --- /dev/null +++ b/configs/7B_train/65536_megatron_ckpt_True.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
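+ # (Reference for the data dict below: packed_length = micro_bsz * SEQ_LEN, so with
+ # micro_bsz=1 and SEQ_LEN=65536 each micro batch packs 1 * 65536 = 65536 tokens.)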
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="megatron", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/65536_none_ckpt_False.py b/configs/7B_train/65536_none_ckpt_False.py new file mode 100644 index 00000000..b8c44769 --- /dev/null +++ b/configs/7B_train/65536_none_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/65536_none_ckpt_True.py b/configs/7B_train/65536_none_ckpt_True.py new file mode 100644 index 00000000..b907e437 --- /dev/null +++ b/configs/7B_train/65536_none_ckpt_True.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 65536 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="none", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/8192_flash-attn_ckpt_False.py b/configs/7B_train/8192_flash-attn_ckpt_False.py new file mode 100644 index 00000000..d0ddd438 --- /dev/null +++ b/configs/7B_train/8192_flash-attn_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 8192 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
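+ # (Reference: with CHECKPOINT_EVERY = 50 above, oss_snapshot_freq below evaluates to
+ # int(50 / 2) = 25, i.e. a snapshot checkpoint every 25 steps.)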
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash-attn", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/8192_flash-attn_ckpt_True.py b/configs/7B_train/8192_flash-attn_ckpt_True.py new file mode 100644 index 00000000..d9e5b2f9 --- /dev/null +++ b/configs/7B_train/8192_flash-attn_ckpt_True.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 8192 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
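+ # (Sizing note, assuming attention heads are split evenly across tensor-parallel ranks:
+ # with NUM_ATTENTION_HEAD = 32 above and tensor size 8 in the `parallel` dict further
+ # below, each rank computes 32 / 8 = 4 heads.)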
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], + defaults to 'none', means the sequence parallel will be disabled. + 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, + defaults to False. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +""" +parallel = dict( + zero1=dict(size=-1, fsdp=False), + tensor=dict(size=8, sp="flash-attn", intern_overlap=False), + pipeline=dict(size=1, interleaved_overlap=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) diff --git a/configs/7B_train/8192_flash_ckpt_False.py b/configs/7B_train/8192_flash_ckpt_False.py new file mode 100644 index 00000000..69546d11 --- /dev/null +++ b/configs/7B_train/8192_flash_ckpt_False.py @@ -0,0 +1,181 @@ +# JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 8192 +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(False) +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
+ load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=1, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=True, + total_steps=20, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=100, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + 
layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallelism.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+        defaults to 'none', which means sequence parallelism is disabled.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+        defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallelism.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/8192_flash_ckpt_True.py b/configs/7B_train/8192_flash_ckpt_True.py
new file mode 100644
index 00000000..4c7f9864
--- /dev/null
+++ b/configs/7B_train/8192_flash_ckpt_True.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 8192
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(True)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+
layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallelism.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+        defaults to 'none', which means sequence parallelism is disabled.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+        defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallelism.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="flash", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/8192_intern_ckpt_False.py b/configs/7B_train/8192_intern_ckpt_False.py
new file mode 100644
index 00000000..9694ad81
--- /dev/null
+++ b/configs/7B_train/8192_intern_ckpt_False.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 8192
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+
layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallelism.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+        defaults to 'none', which means sequence parallelism is disabled.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+        defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallelism.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/8192_intern_ckpt_True.py b/configs/7B_train/8192_intern_ckpt_True.py
new file mode 100644
index 00000000..99a0fc18
--- /dev/null
+++ b/configs/7B_train/8192_intern_ckpt_True.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 8192
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
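+    # Illustrative example (hypothetical values, not part of the original patch): 'content' accepts any subset of
+    # the states listed above, e.g. content=("model", "optimizer", "scheduler") restores optimizer and scheduler
+    # state alongside the model weights.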
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+
layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallelism.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+        defaults to 'none', which means sequence parallelism is disabled.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+        defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallelism.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/8192_megatron_ckpt_False.py b/configs/7B_train/8192_megatron_ckpt_False.py
new file mode 100644
index 00000000..f18ee730
--- /dev/null
+++ b/configs/7B_train/8192_megatron_ckpt_False.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 8192
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
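+    # Note (hedged reading of this config): the commented-out dict form of 'load_ckpt_folder' above appears to be
+    # the older interface; 'load_ckpt_info' below is the field that the setting guide describes.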
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+
layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallelism.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+        defaults to 'none', which means sequence parallelism is disabled.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+        defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallelism.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="megatron", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/8192_megatron_ckpt_True.py b/configs/7B_train/8192_megatron_ckpt_True.py
new file mode 100644
index 00000000..1db58412
--- /dev/null
+++ b/configs/7B_train/8192_megatron_ckpt_True.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 8192
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+
layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallelism.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+        defaults to 'none', which means sequence parallelism is disabled.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+        defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallelism.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="megatron", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/8192_none_ckpt_False.py b/configs/7B_train/8192_none_ckpt_False.py
new file mode 100644
index 00000000..95d686bb
--- /dev/null
+++ b/configs/7B_train/8192_none_ckpt_False.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 8192
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+
layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallelism.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+        defaults to 'none', which means sequence parallelism is disabled.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+        defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallelism.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="none", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/7B_train/8192_none_ckpt_True.py b/configs/7B_train/8192_none_ckpt_True.py
new file mode 100644
index 00000000..a63b6f20
--- /dev/null
+++ b/configs/7B_train/8192_none_ckpt_True.py
@@ -0,0 +1,181 @@
+# JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 8192
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True)
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
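+    # Worked example (illustrative, not part of the original patch): with CHECKPOINT_EVERY = 50, the snapshot
+    # frequency below, int(CHECKPOINT_EVERY / 2), evaluates to 25, i.e. two snapshots per checkpoint interval.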
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro batches contained in one gradient update
+    micro_num=1,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, which disables evaluation
+    valid_every=50,
+    pack_sample_into_one=True,
+    total_steps=20,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with fewer than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=100,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=False,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+
layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel (dict):
+    1. size: int
+        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
+tensor parallel (dict):
+    1. size: int, the size of tensor parallelism.
+    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
+        defaults to 'none', which means sequence parallelism is disabled.
+    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
+        defaults to False.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallelism.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
+        defaults to False.
+"""
+parallel = dict(
+    zero1=dict(size=-1, fsdp=False),
+    tensor=dict(size=8, sp="none", intern_overlap=False),
+    pipeline=dict(size=1, interleaved_overlap=True),
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
diff --git a/configs/generate.py b/configs/generate.py
new file mode 100644
index 00000000..6a58f098
--- /dev/null
+++ b/configs/generate.py
@@ -0,0 +1,47 @@
+import os
+import copy
+import subprocess
+
+name = "./configs/"
+root_names = ["7B_train_", "13B_train_", "30B_train_"]
+model_size = ["7B", "13B", "30B"]
+seq_length = [4096, 8192, 16384, 32768, 65536, 131072, 262144]
+sp = ["none", "megatron", "flash-attn", "intern"]
+intern_overlap = [False, False, False, True]
+checkpoint = [False, True]
+
+for idx, root_name in enumerate(root_names):
+
+    # Path of the per-model config folder to create
+    folder_path = name + root_name[:-1]
+
+    # Create the folder with os.mkdir() if it does not already exist
+    if not os.path.exists(folder_path):
+        os.mkdir(folder_path)
+
+    file_name = name + f"{model_size[idx]}_template.py"
+
+    with open(file_name, "r") as f:
+        lines = f.readlines()
+        origin_line = "".join(lines)
+        # Instantiate the template for every (seq_len, sp mode, checkpoint) combination
+        for seq in seq_length:
+            for i, sp_mode in enumerate(sp):
+                for ckpt in checkpoint:
+                    line = copy.copy(origin_line)
+                    line = line.replace("{seq_len}", str(seq))
+                    line = line.replace("{sp}", f"\"{sp_mode}\"")
+                    line = line.replace("{intern_overlap}", str(intern_overlap[i]))
+                    line = line.replace("{checkpoint}", str(ckpt))
+                    output_file_name = str(seq) + "_" + str(sp_mode) + "_ckpt_" + str(ckpt) + ".py"
+                    write_file = folder_path + "/" + output_file_name
+                    with open(write_file, "w") as file:
+                        file.write(line)
+
+                    # root_name already ends with "_", so no extra separator is needed
+                    log_name = root_name + output_file_name[:-3]
+
+                    # Launch one training run per generated config and tee the log
+                    command = f"srun -p llm_s -N 8 -n 64 --ntasks-per-node=8 --gpus-per-task=1 --time=10 python train.py --config {write_file} --profiling 2>&1 | tee ./fstp_logs/{log_name}.log"
+                    process = subprocess.Popen(command, shell=True, executable='/bin/bash')
+                    process.wait()
\ No newline at end of file
diff --git a/internlm/train/training_internlm.py
b/internlm/train/training_internlm.py index 5e874d39..0b605e53 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -396,6 +396,7 @@ def initialize_llm_profile(profiling: bool = False, start_time: str = None): tgs_list = [] +tflops_list = [] @llm_timeout(func_name="record_current_batch_training_metrics") @@ -573,6 +574,7 @@ def record_current_batch_training_metrics( if batch_count >= 5: tgs_list.append(tgs_origin) + tflops_list.append(tflops) if batch_count == gpc.config.data.total_steps - 1: print(tgs_list, flush=True) avg_tgs = sum(tgs_list) / len(tgs_list) @@ -580,3 +582,9 @@ def record_current_batch_training_metrics( if abs(tgs - avg_tgs) > 400: tgs_list.remove(tgs) print(f"avg_tgs: {sum(tgs_list)/len(tgs_list)}", flush=True) + print(tflops_list, flush=True) + avg_tflops = sum(tflops_list) / len(tflops_list) + for tf in tflops_list.copy(): + if abs(tf - avg_tflops) > 10: + tflops_list.remove(tf) + print(f"avg_tflops: {sum(tflops_list)/len(tflops_list)}", flush=True) From 41cfa1a10a673e74c64653afda8395309c0f7d75 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Tue, 24 Oct 2023 18:47:27 +0800 Subject: [PATCH 053/153] feat(model/overlap_handler.py): fix overlap handler None bug --- internlm/model/overlap_handler.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index 5cef92f9..35d8a594 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -312,7 +312,8 @@ def __init__(self, overlap_handler: FSTPOverlapHandler) -> None: self._overlap_handler = overlap_handler def before_forward(self, scheduler, inputs) -> None: - self._overlap_handler.set_forward_mode(True) + if self._overlap_handler is not None: + self._overlap_handler.set_forward_mode(True) def after_forward(self, scheduler, outputs) -> None: pass @@ -324,7 +325,8 @@ def after_criterion(self, scheduler, loss) -> None: pass def before_backward(self, scheduler, outputs, outputs_grad) -> None: - self._overlap_handler.set_forward_mode(False) + if self._overlap_handler is not None: + self._overlap_handler.set_forward_mode(False) def after_backward(self, scheduler, inputs_grad) -> None: pass From 0bac166b7a82a556e8f2ba301a4e4f6d353c8b1f Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Wed, 25 Oct 2023 13:44:15 +0800 Subject: [PATCH 054/153] add test --- .gitignore | 4 + configs/13B_template.py | 8 +- .../13B_train/131072_flash-attn_ckpt_False.py | 180 ----------------- .../13B_train/131072_flash-attn_ckpt_True.py | 180 ----------------- configs/13B_train/131072_flash_ckpt_False.py | 180 ----------------- configs/13B_train/131072_flash_ckpt_True.py | 180 ----------------- configs/13B_train/131072_intern_ckpt_False.py | 180 ----------------- configs/13B_train/131072_intern_ckpt_True.py | 180 ----------------- .../13B_train/131072_megatron_ckpt_False.py | 180 ----------------- .../13B_train/131072_megatron_ckpt_True.py | 180 ----------------- configs/13B_train/131072_none_ckpt_False.py | 180 ----------------- configs/13B_train/131072_none_ckpt_True.py | 180 ----------------- .../13B_train/16384_flash-attn_ckpt_False.py | 180 ----------------- .../13B_train/16384_flash-attn_ckpt_True.py | 180 ----------------- configs/13B_train/16384_flash_ckpt_False.py | 180 ----------------- configs/13B_train/16384_flash_ckpt_True.py | 180 ----------------- configs/13B_train/16384_intern_ckpt_False.py | 180 ----------------- 
configs/13B_train/16384_intern_ckpt_True.py | 180 ----------------- .../13B_train/16384_megatron_ckpt_False.py | 180 ----------------- configs/13B_train/16384_megatron_ckpt_True.py | 180 ----------------- configs/13B_train/16384_none_ckpt_False.py | 180 ----------------- configs/13B_train/16384_none_ckpt_True.py | 180 ----------------- .../13B_train/262144_flash-attn_ckpt_False.py | 180 ----------------- configs/13B_train/262144_flash_ckpt_False.py | 180 ----------------- configs/13B_train/262144_flash_ckpt_True.py | 180 ----------------- configs/13B_train/262144_intern_ckpt_False.py | 180 ----------------- configs/13B_train/262144_intern_ckpt_True.py | 180 ----------------- .../13B_train/262144_megatron_ckpt_False.py | 180 ----------------- .../13B_train/262144_megatron_ckpt_True.py | 180 ----------------- configs/13B_train/262144_none_ckpt_False.py | 180 ----------------- configs/13B_train/262144_none_ckpt_True.py | 180 ----------------- .../13B_train/32768_flash-attn_ckpt_False.py | 180 ----------------- .../13B_train/32768_flash-attn_ckpt_True.py | 180 ----------------- configs/13B_train/32768_flash_ckpt_False.py | 180 ----------------- configs/13B_train/32768_flash_ckpt_True.py | 180 ----------------- configs/13B_train/32768_intern_ckpt_False.py | 180 ----------------- configs/13B_train/32768_intern_ckpt_True.py | 180 ----------------- .../13B_train/32768_megatron_ckpt_False.py | 180 ----------------- configs/13B_train/32768_megatron_ckpt_True.py | 180 ----------------- configs/13B_train/32768_none_ckpt_False.py | 180 ----------------- configs/13B_train/32768_none_ckpt_True.py | 180 ----------------- .../13B_train/4096_flash-attn_ckpt_False.py | 180 ----------------- .../13B_train/4096_flash-attn_ckpt_True.py | 180 ----------------- configs/13B_train/4096_flash_ckpt_False.py | 180 ----------------- configs/13B_train/4096_flash_ckpt_True.py | 180 ----------------- configs/13B_train/4096_intern_ckpt_False.py | 180 ----------------- configs/13B_train/4096_intern_ckpt_True.py | 180 ----------------- configs/13B_train/4096_megatron_ckpt_False.py | 180 ----------------- configs/13B_train/4096_megatron_ckpt_True.py | 180 ----------------- configs/13B_train/4096_none_ckpt_False.py | 180 ----------------- configs/13B_train/4096_none_ckpt_True.py | 180 ----------------- .../13B_train/65536_flash-attn_ckpt_False.py | 180 ----------------- .../13B_train/65536_flash-attn_ckpt_True.py | 180 ----------------- configs/13B_train/65536_flash_ckpt_False.py | 180 ----------------- configs/13B_train/65536_flash_ckpt_True.py | 180 ----------------- configs/13B_train/65536_intern_ckpt_False.py | 180 ----------------- configs/13B_train/65536_intern_ckpt_True.py | 180 ----------------- .../13B_train/65536_megatron_ckpt_False.py | 180 ----------------- configs/13B_train/65536_megatron_ckpt_True.py | 180 ----------------- configs/13B_train/65536_none_ckpt_False.py | 180 ----------------- configs/13B_train/65536_none_ckpt_True.py | 180 ----------------- .../13B_train/8192_flash-attn_ckpt_False.py | 180 ----------------- .../13B_train/8192_flash-attn_ckpt_True.py | 180 ----------------- configs/13B_train/8192_flash_ckpt_False.py | 180 ----------------- configs/13B_train/8192_flash_ckpt_True.py | 180 ----------------- configs/13B_train/8192_intern_ckpt_False.py | 180 ----------------- configs/13B_train/8192_intern_ckpt_True.py | 180 ----------------- configs/13B_train/8192_megatron_ckpt_False.py | 180 ----------------- configs/13B_train/8192_megatron_ckpt_True.py | 180 ----------------- 
configs/13B_train/8192_none_ckpt_False.py | 180 ----------------- configs/13B_train/8192_none_ckpt_True.py | 180 ----------------- configs/30B_template.py | 8 +- configs/30B_train/131072_flash_ckpt_False.py | 180 ----------------- configs/30B_train/131072_flash_ckpt_True.py | 180 ----------------- configs/30B_train/131072_intern_ckpt_False.py | 180 ----------------- configs/30B_train/131072_intern_ckpt_True.py | 180 ----------------- .../30B_train/131072_megatron_ckpt_False.py | 180 ----------------- .../30B_train/131072_megatron_ckpt_True.py | 180 ----------------- configs/30B_train/131072_none_ckpt_False.py | 180 ----------------- configs/30B_train/131072_none_ckpt_True.py | 180 ----------------- configs/30B_train/16384_flash_ckpt_False.py | 180 ----------------- configs/30B_train/16384_flash_ckpt_True.py | 180 ----------------- configs/30B_train/16384_intern_ckpt_False.py | 180 ----------------- configs/30B_train/16384_intern_ckpt_True.py | 180 ----------------- .../30B_train/16384_megatron_ckpt_False.py | 180 ----------------- configs/30B_train/16384_megatron_ckpt_True.py | 180 ----------------- configs/30B_train/16384_none_ckpt_False.py | 180 ----------------- configs/30B_train/16384_none_ckpt_True.py | 180 ----------------- configs/30B_train/262144_flash_ckpt_False.py | 180 ----------------- configs/30B_train/262144_flash_ckpt_True.py | 180 ----------------- configs/30B_train/262144_intern_ckpt_False.py | 180 ----------------- configs/30B_train/262144_intern_ckpt_True.py | 180 ----------------- .../30B_train/262144_megatron_ckpt_False.py | 180 ----------------- .../30B_train/262144_megatron_ckpt_True.py | 180 ----------------- configs/30B_train/262144_none_ckpt_False.py | 180 ----------------- configs/30B_train/262144_none_ckpt_True.py | 180 ----------------- configs/30B_train/32768_flash_ckpt_False.py | 180 ----------------- configs/30B_train/32768_flash_ckpt_True.py | 180 ----------------- configs/30B_train/32768_intern_ckpt_False.py | 180 ----------------- configs/30B_train/32768_intern_ckpt_True.py | 180 ----------------- .../30B_train/32768_megatron_ckpt_False.py | 180 ----------------- configs/30B_train/32768_megatron_ckpt_True.py | 180 ----------------- configs/30B_train/32768_none_ckpt_False.py | 180 ----------------- configs/30B_train/32768_none_ckpt_True.py | 180 ----------------- configs/30B_train/4096_flash_ckpt_False.py | 180 ----------------- configs/30B_train/4096_flash_ckpt_True.py | 180 ----------------- configs/30B_train/4096_intern_ckpt_False.py | 180 ----------------- configs/30B_train/4096_intern_ckpt_True.py | 180 ----------------- configs/30B_train/4096_megatron_ckpt_False.py | 180 ----------------- configs/30B_train/4096_megatron_ckpt_True.py | 180 ----------------- configs/30B_train/4096_none_ckpt_False.py | 180 ----------------- configs/30B_train/4096_none_ckpt_True.py | 180 ----------------- configs/30B_train/65536_flash_ckpt_False.py | 180 ----------------- configs/30B_train/65536_flash_ckpt_True.py | 180 ----------------- configs/30B_train/65536_intern_ckpt_False.py | 180 ----------------- configs/30B_train/65536_intern_ckpt_True.py | 180 ----------------- .../30B_train/65536_megatron_ckpt_False.py | 180 ----------------- configs/30B_train/65536_megatron_ckpt_True.py | 180 ----------------- configs/30B_train/65536_none_ckpt_False.py | 180 ----------------- configs/30B_train/65536_none_ckpt_True.py | 180 ----------------- configs/30B_train/8192_flash_ckpt_False.py | 180 ----------------- configs/30B_train/8192_flash_ckpt_True.py | 180 ----------------- 
configs/30B_train/8192_intern_ckpt_False.py | 180 ----------------- configs/30B_train/8192_intern_ckpt_True.py | 180 ----------------- configs/30B_train/8192_megatron_ckpt_False.py | 180 ----------------- configs/30B_train/8192_megatron_ckpt_True.py | 180 ----------------- configs/30B_train/8192_none_ckpt_False.py | 180 ----------------- configs/30B_train/8192_none_ckpt_True.py | 180 ----------------- configs/7B_template.py | 2 +- .../7B_train/131072_flash-attn_ckpt_False.py | 181 ------------------ .../7B_train/131072_flash-attn_ckpt_True.py | 181 ------------------ configs/7B_train/131072_flash_ckpt_False.py | 181 ------------------ configs/7B_train/131072_flash_ckpt_True.py | 181 ------------------ configs/7B_train/131072_intern_ckpt_False.py | 181 ------------------ configs/7B_train/131072_intern_ckpt_True.py | 181 ------------------ .../7B_train/131072_megatron_ckpt_False.py | 181 ------------------ configs/7B_train/131072_megatron_ckpt_True.py | 181 ------------------ configs/7B_train/131072_none_ckpt_False.py | 181 ------------------ configs/7B_train/131072_none_ckpt_True.py | 181 ------------------ .../7B_train/16384_flash-attn_ckpt_False.py | 181 ------------------ .../7B_train/16384_flash-attn_ckpt_True.py | 181 ------------------ configs/7B_train/16384_flash_ckpt_False.py | 181 ------------------ configs/7B_train/16384_flash_ckpt_True.py | 181 ------------------ configs/7B_train/16384_intern_ckpt_False.py | 181 ------------------ configs/7B_train/16384_intern_ckpt_True.py | 181 ------------------ configs/7B_train/16384_megatron_ckpt_False.py | 181 ------------------ configs/7B_train/16384_megatron_ckpt_True.py | 181 ------------------ configs/7B_train/16384_none_ckpt_False.py | 181 ------------------ configs/7B_train/16384_none_ckpt_True.py | 181 ------------------ .../7B_train/262144_flash-attn_ckpt_False.py | 181 ------------------ .../7B_train/262144_flash-attn_ckpt_True.py | 181 ------------------ configs/7B_train/262144_flash_ckpt_False.py | 181 ------------------ configs/7B_train/262144_flash_ckpt_True.py | 181 ------------------ configs/7B_train/262144_intern_ckpt_False.py | 181 ------------------ configs/7B_train/262144_intern_ckpt_True.py | 181 ------------------ .../7B_train/262144_megatron_ckpt_False.py | 181 ------------------ configs/7B_train/262144_megatron_ckpt_True.py | 181 ------------------ configs/7B_train/262144_none_ckpt_False.py | 181 ------------------ configs/7B_train/262144_none_ckpt_True.py | 181 ------------------ .../7B_train/32768_flash-attn_ckpt_False.py | 181 ------------------ .../7B_train/32768_flash-attn_ckpt_True.py | 181 ------------------ configs/7B_train/32768_flash_ckpt_False.py | 181 ------------------ configs/7B_train/32768_flash_ckpt_True.py | 181 ------------------ configs/7B_train/32768_intern_ckpt_False.py | 181 ------------------ configs/7B_train/32768_intern_ckpt_True.py | 181 ------------------ configs/7B_train/32768_megatron_ckpt_False.py | 181 ------------------ configs/7B_train/32768_megatron_ckpt_True.py | 181 ------------------ configs/7B_train/32768_none_ckpt_False.py | 181 ------------------ configs/7B_train/32768_none_ckpt_True.py | 181 ------------------ .../7B_train/4096_flash-attn_ckpt_False.py | 181 ------------------ configs/7B_train/4096_flash-attn_ckpt_True.py | 181 ------------------ configs/7B_train/4096_flash_ckpt_False.py | 181 ------------------ configs/7B_train/4096_flash_ckpt_True.py | 181 ------------------ configs/7B_train/4096_intern_ckpt_False.py | 181 ------------------ 
configs/7B_train/4096_intern_ckpt_True.py | 181 ------------------ configs/7B_train/4096_megatron_ckpt_False.py | 181 ------------------ configs/7B_train/4096_megatron_ckpt_True.py | 181 ------------------ configs/7B_train/4096_none_ckpt_False.py | 181 ------------------ configs/7B_train/4096_none_ckpt_True.py | 181 ------------------ .../7B_train/65536_flash-attn_ckpt_False.py | 181 ------------------ .../7B_train/65536_flash-attn_ckpt_True.py | 181 ------------------ configs/7B_train/65536_flash_ckpt_False.py | 181 ------------------ configs/7B_train/65536_flash_ckpt_True.py | 181 ------------------ configs/7B_train/65536_intern_ckpt_False.py | 181 ------------------ configs/7B_train/65536_intern_ckpt_True.py | 181 ------------------ configs/7B_train/65536_megatron_ckpt_False.py | 181 ------------------ configs/7B_train/65536_megatron_ckpt_True.py | 181 ------------------ configs/7B_train/65536_none_ckpt_False.py | 181 ------------------ configs/7B_train/65536_none_ckpt_True.py | 181 ------------------ .../7B_train/8192_flash-attn_ckpt_False.py | 181 ------------------ configs/7B_train/8192_flash-attn_ckpt_True.py | 181 ------------------ configs/7B_train/8192_flash_ckpt_False.py | 181 ------------------ configs/7B_train/8192_flash_ckpt_True.py | 181 ------------------ configs/7B_train/8192_intern_ckpt_False.py | 181 ------------------ configs/7B_train/8192_intern_ckpt_True.py | 181 ------------------ configs/7B_train/8192_megatron_ckpt_False.py | 181 ------------------ configs/7B_train/8192_megatron_ckpt_True.py | 181 ------------------ configs/7B_train/8192_none_ckpt_False.py | 181 ------------------ configs/7B_train/8192_none_ckpt_True.py | 181 ------------------ configs/generate.py | 24 ++- 200 files changed, 33 insertions(+), 35183 deletions(-) delete mode 100644 configs/13B_train/131072_flash-attn_ckpt_False.py delete mode 100644 configs/13B_train/131072_flash-attn_ckpt_True.py delete mode 100644 configs/13B_train/131072_flash_ckpt_False.py delete mode 100644 configs/13B_train/131072_flash_ckpt_True.py delete mode 100644 configs/13B_train/131072_intern_ckpt_False.py delete mode 100644 configs/13B_train/131072_intern_ckpt_True.py delete mode 100644 configs/13B_train/131072_megatron_ckpt_False.py delete mode 100644 configs/13B_train/131072_megatron_ckpt_True.py delete mode 100644 configs/13B_train/131072_none_ckpt_False.py delete mode 100644 configs/13B_train/131072_none_ckpt_True.py delete mode 100644 configs/13B_train/16384_flash-attn_ckpt_False.py delete mode 100644 configs/13B_train/16384_flash-attn_ckpt_True.py delete mode 100644 configs/13B_train/16384_flash_ckpt_False.py delete mode 100644 configs/13B_train/16384_flash_ckpt_True.py delete mode 100644 configs/13B_train/16384_intern_ckpt_False.py delete mode 100644 configs/13B_train/16384_intern_ckpt_True.py delete mode 100644 configs/13B_train/16384_megatron_ckpt_False.py delete mode 100644 configs/13B_train/16384_megatron_ckpt_True.py delete mode 100644 configs/13B_train/16384_none_ckpt_False.py delete mode 100644 configs/13B_train/16384_none_ckpt_True.py delete mode 100644 configs/13B_train/262144_flash-attn_ckpt_False.py delete mode 100644 configs/13B_train/262144_flash_ckpt_False.py delete mode 100644 configs/13B_train/262144_flash_ckpt_True.py delete mode 100644 configs/13B_train/262144_intern_ckpt_False.py delete mode 100644 configs/13B_train/262144_intern_ckpt_True.py delete mode 100644 configs/13B_train/262144_megatron_ckpt_False.py delete mode 100644 configs/13B_train/262144_megatron_ckpt_True.py delete mode 100644 
configs/13B_train/262144_none_ckpt_False.py delete mode 100644 configs/13B_train/262144_none_ckpt_True.py delete mode 100644 configs/13B_train/32768_flash-attn_ckpt_False.py delete mode 100644 configs/13B_train/32768_flash-attn_ckpt_True.py delete mode 100644 configs/13B_train/32768_flash_ckpt_False.py delete mode 100644 configs/13B_train/32768_flash_ckpt_True.py delete mode 100644 configs/13B_train/32768_intern_ckpt_False.py delete mode 100644 configs/13B_train/32768_intern_ckpt_True.py delete mode 100644 configs/13B_train/32768_megatron_ckpt_False.py delete mode 100644 configs/13B_train/32768_megatron_ckpt_True.py delete mode 100644 configs/13B_train/32768_none_ckpt_False.py delete mode 100644 configs/13B_train/32768_none_ckpt_True.py delete mode 100644 configs/13B_train/4096_flash-attn_ckpt_False.py delete mode 100644 configs/13B_train/4096_flash-attn_ckpt_True.py delete mode 100644 configs/13B_train/4096_flash_ckpt_False.py delete mode 100644 configs/13B_train/4096_flash_ckpt_True.py delete mode 100644 configs/13B_train/4096_intern_ckpt_False.py delete mode 100644 configs/13B_train/4096_intern_ckpt_True.py delete mode 100644 configs/13B_train/4096_megatron_ckpt_False.py delete mode 100644 configs/13B_train/4096_megatron_ckpt_True.py delete mode 100644 configs/13B_train/4096_none_ckpt_False.py delete mode 100644 configs/13B_train/4096_none_ckpt_True.py delete mode 100644 configs/13B_train/65536_flash-attn_ckpt_False.py delete mode 100644 configs/13B_train/65536_flash-attn_ckpt_True.py delete mode 100644 configs/13B_train/65536_flash_ckpt_False.py delete mode 100644 configs/13B_train/65536_flash_ckpt_True.py delete mode 100644 configs/13B_train/65536_intern_ckpt_False.py delete mode 100644 configs/13B_train/65536_intern_ckpt_True.py delete mode 100644 configs/13B_train/65536_megatron_ckpt_False.py delete mode 100644 configs/13B_train/65536_megatron_ckpt_True.py delete mode 100644 configs/13B_train/65536_none_ckpt_False.py delete mode 100644 configs/13B_train/65536_none_ckpt_True.py delete mode 100644 configs/13B_train/8192_flash-attn_ckpt_False.py delete mode 100644 configs/13B_train/8192_flash-attn_ckpt_True.py delete mode 100644 configs/13B_train/8192_flash_ckpt_False.py delete mode 100644 configs/13B_train/8192_flash_ckpt_True.py delete mode 100644 configs/13B_train/8192_intern_ckpt_False.py delete mode 100644 configs/13B_train/8192_intern_ckpt_True.py delete mode 100644 configs/13B_train/8192_megatron_ckpt_False.py delete mode 100644 configs/13B_train/8192_megatron_ckpt_True.py delete mode 100644 configs/13B_train/8192_none_ckpt_False.py delete mode 100644 configs/13B_train/8192_none_ckpt_True.py delete mode 100644 configs/30B_train/131072_flash_ckpt_False.py delete mode 100644 configs/30B_train/131072_flash_ckpt_True.py delete mode 100644 configs/30B_train/131072_intern_ckpt_False.py delete mode 100644 configs/30B_train/131072_intern_ckpt_True.py delete mode 100644 configs/30B_train/131072_megatron_ckpt_False.py delete mode 100644 configs/30B_train/131072_megatron_ckpt_True.py delete mode 100644 configs/30B_train/131072_none_ckpt_False.py delete mode 100644 configs/30B_train/131072_none_ckpt_True.py delete mode 100644 configs/30B_train/16384_flash_ckpt_False.py delete mode 100644 configs/30B_train/16384_flash_ckpt_True.py delete mode 100644 configs/30B_train/16384_intern_ckpt_False.py delete mode 100644 configs/30B_train/16384_intern_ckpt_True.py delete mode 100644 configs/30B_train/16384_megatron_ckpt_False.py delete mode 100644 configs/30B_train/16384_megatron_ckpt_True.py delete 
mode 100644 configs/30B_train/16384_none_ckpt_False.py delete mode 100644 configs/30B_train/16384_none_ckpt_True.py delete mode 100644 configs/30B_train/262144_flash_ckpt_False.py delete mode 100644 configs/30B_train/262144_flash_ckpt_True.py delete mode 100644 configs/30B_train/262144_intern_ckpt_False.py delete mode 100644 configs/30B_train/262144_intern_ckpt_True.py delete mode 100644 configs/30B_train/262144_megatron_ckpt_False.py delete mode 100644 configs/30B_train/262144_megatron_ckpt_True.py delete mode 100644 configs/30B_train/262144_none_ckpt_False.py delete mode 100644 configs/30B_train/262144_none_ckpt_True.py delete mode 100644 configs/30B_train/32768_flash_ckpt_False.py delete mode 100644 configs/30B_train/32768_flash_ckpt_True.py delete mode 100644 configs/30B_train/32768_intern_ckpt_False.py delete mode 100644 configs/30B_train/32768_intern_ckpt_True.py delete mode 100644 configs/30B_train/32768_megatron_ckpt_False.py delete mode 100644 configs/30B_train/32768_megatron_ckpt_True.py delete mode 100644 configs/30B_train/32768_none_ckpt_False.py delete mode 100644 configs/30B_train/32768_none_ckpt_True.py delete mode 100644 configs/30B_train/4096_flash_ckpt_False.py delete mode 100644 configs/30B_train/4096_flash_ckpt_True.py delete mode 100644 configs/30B_train/4096_intern_ckpt_False.py delete mode 100644 configs/30B_train/4096_intern_ckpt_True.py delete mode 100644 configs/30B_train/4096_megatron_ckpt_False.py delete mode 100644 configs/30B_train/4096_megatron_ckpt_True.py delete mode 100644 configs/30B_train/4096_none_ckpt_False.py delete mode 100644 configs/30B_train/4096_none_ckpt_True.py delete mode 100644 configs/30B_train/65536_flash_ckpt_False.py delete mode 100644 configs/30B_train/65536_flash_ckpt_True.py delete mode 100644 configs/30B_train/65536_intern_ckpt_False.py delete mode 100644 configs/30B_train/65536_intern_ckpt_True.py delete mode 100644 configs/30B_train/65536_megatron_ckpt_False.py delete mode 100644 configs/30B_train/65536_megatron_ckpt_True.py delete mode 100644 configs/30B_train/65536_none_ckpt_False.py delete mode 100644 configs/30B_train/65536_none_ckpt_True.py delete mode 100644 configs/30B_train/8192_flash_ckpt_False.py delete mode 100644 configs/30B_train/8192_flash_ckpt_True.py delete mode 100644 configs/30B_train/8192_intern_ckpt_False.py delete mode 100644 configs/30B_train/8192_intern_ckpt_True.py delete mode 100644 configs/30B_train/8192_megatron_ckpt_False.py delete mode 100644 configs/30B_train/8192_megatron_ckpt_True.py delete mode 100644 configs/30B_train/8192_none_ckpt_False.py delete mode 100644 configs/30B_train/8192_none_ckpt_True.py delete mode 100644 configs/7B_train/131072_flash-attn_ckpt_False.py delete mode 100644 configs/7B_train/131072_flash-attn_ckpt_True.py delete mode 100644 configs/7B_train/131072_flash_ckpt_False.py delete mode 100644 configs/7B_train/131072_flash_ckpt_True.py delete mode 100644 configs/7B_train/131072_intern_ckpt_False.py delete mode 100644 configs/7B_train/131072_intern_ckpt_True.py delete mode 100644 configs/7B_train/131072_megatron_ckpt_False.py delete mode 100644 configs/7B_train/131072_megatron_ckpt_True.py delete mode 100644 configs/7B_train/131072_none_ckpt_False.py delete mode 100644 configs/7B_train/131072_none_ckpt_True.py delete mode 100644 configs/7B_train/16384_flash-attn_ckpt_False.py delete mode 100644 configs/7B_train/16384_flash-attn_ckpt_True.py delete mode 100644 configs/7B_train/16384_flash_ckpt_False.py delete mode 100644 configs/7B_train/16384_flash_ckpt_True.py delete mode 100644 
configs/7B_train/16384_intern_ckpt_False.py delete mode 100644 configs/7B_train/16384_intern_ckpt_True.py delete mode 100644 configs/7B_train/16384_megatron_ckpt_False.py delete mode 100644 configs/7B_train/16384_megatron_ckpt_True.py delete mode 100644 configs/7B_train/16384_none_ckpt_False.py delete mode 100644 configs/7B_train/16384_none_ckpt_True.py delete mode 100644 configs/7B_train/262144_flash-attn_ckpt_False.py delete mode 100644 configs/7B_train/262144_flash-attn_ckpt_True.py delete mode 100644 configs/7B_train/262144_flash_ckpt_False.py delete mode 100644 configs/7B_train/262144_flash_ckpt_True.py delete mode 100644 configs/7B_train/262144_intern_ckpt_False.py delete mode 100644 configs/7B_train/262144_intern_ckpt_True.py delete mode 100644 configs/7B_train/262144_megatron_ckpt_False.py delete mode 100644 configs/7B_train/262144_megatron_ckpt_True.py delete mode 100644 configs/7B_train/262144_none_ckpt_False.py delete mode 100644 configs/7B_train/262144_none_ckpt_True.py delete mode 100644 configs/7B_train/32768_flash-attn_ckpt_False.py delete mode 100644 configs/7B_train/32768_flash-attn_ckpt_True.py delete mode 100644 configs/7B_train/32768_flash_ckpt_False.py delete mode 100644 configs/7B_train/32768_flash_ckpt_True.py delete mode 100644 configs/7B_train/32768_intern_ckpt_False.py delete mode 100644 configs/7B_train/32768_intern_ckpt_True.py delete mode 100644 configs/7B_train/32768_megatron_ckpt_False.py delete mode 100644 configs/7B_train/32768_megatron_ckpt_True.py delete mode 100644 configs/7B_train/32768_none_ckpt_False.py delete mode 100644 configs/7B_train/32768_none_ckpt_True.py delete mode 100644 configs/7B_train/4096_flash-attn_ckpt_False.py delete mode 100644 configs/7B_train/4096_flash-attn_ckpt_True.py delete mode 100644 configs/7B_train/4096_flash_ckpt_False.py delete mode 100644 configs/7B_train/4096_flash_ckpt_True.py delete mode 100644 configs/7B_train/4096_intern_ckpt_False.py delete mode 100644 configs/7B_train/4096_intern_ckpt_True.py delete mode 100644 configs/7B_train/4096_megatron_ckpt_False.py delete mode 100644 configs/7B_train/4096_megatron_ckpt_True.py delete mode 100644 configs/7B_train/4096_none_ckpt_False.py delete mode 100644 configs/7B_train/4096_none_ckpt_True.py delete mode 100644 configs/7B_train/65536_flash-attn_ckpt_False.py delete mode 100644 configs/7B_train/65536_flash-attn_ckpt_True.py delete mode 100644 configs/7B_train/65536_flash_ckpt_False.py delete mode 100644 configs/7B_train/65536_flash_ckpt_True.py delete mode 100644 configs/7B_train/65536_intern_ckpt_False.py delete mode 100644 configs/7B_train/65536_intern_ckpt_True.py delete mode 100644 configs/7B_train/65536_megatron_ckpt_False.py delete mode 100644 configs/7B_train/65536_megatron_ckpt_True.py delete mode 100644 configs/7B_train/65536_none_ckpt_False.py delete mode 100644 configs/7B_train/65536_none_ckpt_True.py delete mode 100644 configs/7B_train/8192_flash-attn_ckpt_False.py delete mode 100644 configs/7B_train/8192_flash-attn_ckpt_True.py delete mode 100644 configs/7B_train/8192_flash_ckpt_False.py delete mode 100644 configs/7B_train/8192_flash_ckpt_True.py delete mode 100644 configs/7B_train/8192_intern_ckpt_False.py delete mode 100644 configs/7B_train/8192_intern_ckpt_True.py delete mode 100644 configs/7B_train/8192_megatron_ckpt_False.py delete mode 100644 configs/7B_train/8192_megatron_ckpt_True.py delete mode 100644 configs/7B_train/8192_none_ckpt_False.py delete mode 100644 configs/7B_train/8192_none_ckpt_True.py diff --git a/.gitignore b/.gitignore index 
04367e3d..9bdc7ec7 100644 --- a/.gitignore +++ b/.gitignore @@ -149,5 +149,9 @@ memory_trace 13b_train*/ 30b_train*/ fstp_logs/ +configs/7B_train/* +configs/13B_train/* +configs/30B_train/* + atb pip diff --git a/configs/13B_template.py b/configs/13B_template.py index 26be3f71..e0e016cc 100644 --- a/configs/13B_template.py +++ b/configs/13B_template.py @@ -2,7 +2,7 @@ DO_ALERT = False SEQ_LEN = {seq_len} -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({checkpoint}) +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({intern_overlap}) + "_" + str({checkpoint}) HIDDEN_SIZE = 5120 NUM_ATTENTION_HEAD = 40 MLP_RATIO = 8 / 3 @@ -50,9 +50,9 @@ data = dict( seq_len=SEQ_LEN, # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, + micro_num=1, # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, + micro_bsz=1, # defaults to the value of micro_num valid_micro_num=4, # defaults to 0, means disable evaluate @@ -91,7 +91,7 @@ hybrid_zero_optimizer = dict( # Enable low_level_optimzer overlap_communication overlap_sync_grad=True, - overlap_sync_param=True, + overlap_sync_param=False, # bucket size for nccl communication params reduce_bucket_size=512 * 1024 * 1024, # grad clipping diff --git a/configs/13B_train/131072_flash-attn_ckpt_False.py b/configs/13B_train/131072_flash-attn_ckpt_False.py deleted file mode 100644 index 28d51af6..00000000 --- a/configs/13B_train/131072_flash-attn_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. 
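[Editor's note on where the mass deletion in this patch comes from: the per-run configs are now ignored via .gitignore and produced on demand from the templates. configs/generate.py appears only in the diffstat, so the following is a hypothetical reconstruction of the rendering loop it implies; the value grids, the format() call, and the file layout are inferred from the {seq_len}/{sp}/{intern_overlap}/{checkpoint} placeholders in the 13B_template.py diff above and from the deleted file names, not taken from the patch.]

import itertools
from pathlib import Path

# Grids inferred from the deleted file names, e.g.
# configs/13B_train/131072_flash-attn_ckpt_False.py
SEQ_LENS = [4096, 8192, 16384, 32768, 65536, 131072, 262144]
SP_MODES = ["none", "megatron", "flash", "flash-attn", "intern"]

template = Path("configs/13B_template.py").read_text()
out_dir = Path("configs/13B_train")
out_dir.mkdir(parents=True, exist_ok=True)
for seq_len, sp, ckpt in itertools.product(SEQ_LENS, SP_MODES, [False, True]):
    rendered = template.format(
        seq_len=seq_len,
        sp=f'"{sp}"',                     # lands inside str({sp}) in the template
        intern_overlap=(sp == "intern"),  # only the intern mode overlaps
        checkpoint=ckpt,
    )
    (out_dir / f"{seq_len}_{sp}_ckpt_{ckpt}.py").write_text(rendered)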
- auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. 
size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/131072_flash-attn_ckpt_True.py b/configs/13B_train/131072_flash-attn_ckpt_True.py deleted file mode 100644 index 6d1b7ef0..00000000 --- a/configs/13B_train/131072_flash-attn_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
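[Editor's note: the grad_scaler block these deleted configs all share spells out a standard dynamic loss-scaling policy. As a reading aid, here is a toy implementation of the algorithm those knobs configure; it is a sketch of the common scheme, not InternLM's scaler, and resetting the overflow budget on every clean step is one assumed variant.]

class ToyGradScaler:
    """Dynamic loss scaling as parameterized by the grad_scaler dict:
    grow the scale after `growth_interval` clean steps, back off after
    `hysteresis` overflows, and clamp to [min_scale, max_scale]."""

    def __init__(self, initial_scale=2**16, min_scale=1, max_scale=2**24,
                 growth_interval=1000, growth_factor=2, backoff_factor=0.5,
                 hysteresis=2):
        self.scale = float(initial_scale)
        self.min_scale, self.max_scale = min_scale, max_scale
        self.growth_interval, self.growth_factor = growth_interval, growth_factor
        self.backoff_factor, self.hysteresis = backoff_factor, hysteresis
        self._good_steps = 0
        self._overflows_left = hysteresis

    def update(self, found_overflow: bool) -> None:
        if found_overflow:
            self._good_steps = 0
            self._overflows_left -= 1
            if self._overflows_left <= 0:
                self.scale = max(self.scale * self.backoff_factor, self.min_scale)
                self._overflows_left = self.hysteresis
        else:
            self._good_steps += 1
            self._overflows_left = self.hysteresis
            if self._good_steps >= self.growth_interval:
                self.scale = min(self.scale * self.growth_factor, self.max_scale)
                self._good_steps = 0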
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/131072_flash_ckpt_False.py b/configs/13B_train/131072_flash_ckpt_False.py deleted file mode 100644 index dd0f0e89..00000000 --- a/configs/13B_train/131072_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
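[Editor's note: the zero1 sizing rules quoted in the docstring above reduce to a small amount of arithmetic. A sketch, assuming the subgroup must divide the data-parallel world size evenly; the docstring only says it must be a subset, so that check is an assumption.]

def resolve_zero1_size(size: int, dp_world_size: int) -> int:
    """Illustrative resolution of the zero1 rules (not InternLM's code):
    size <= 0  -> shard across the whole data-parallel group;
    size == 1  -> ZeRO disabled, every rank keeps full parameters;
    otherwise  -> a subgroup of the data-parallel group."""
    if size <= 0:
        return dp_world_size
    if size == 1:
        return 1
    if size > dp_world_size or dp_world_size % size != 0:
        raise ValueError(f"invalid zero1 size {size} for dp={dp_world_size}")
    return size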
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/131072_flash_ckpt_True.py b/configs/13B_train/131072_flash_ckpt_True.py deleted file mode 100644 index 2b9276db..00000000 --- a/configs/13B_train/131072_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
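[Editor's note: the lr_scheduler dict in these configs carries only hyperparameters; the scheduler class itself is not shown in this series. Assuming the usual warmup-then-cosine shape that settings like total_steps, warmup_ratio, and eta_min typically drive, the resulting curve would be:]

import math

def lr_at_step(step, total_steps, base_lr=1e-4, init_steps=0,
               warmup_ratio=0.01, eta_min=1e-5):
    """Assumed warmup+cosine curve for the lr_scheduler hyperparameters
    above (adam.lr=1e-4, warmup_ratio=0.01, eta_min=1e-5); the concrete
    scheduler InternLM uses may differ in details."""
    warmup_steps = init_steps + max(1, int(total_steps * warmup_ratio))
    if step < warmup_steps:
        return base_lr * (step + 1) / warmup_steps  # linear warmup
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return eta_min + 0.5 * (base_lr - eta_min) * (1 + math.cos(math.pi * progress))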
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/131072_intern_ckpt_False.py b/configs/13B_train/131072_intern_ckpt_False.py deleted file mode 100644 index 182e4ddb..00000000 --- a/configs/13B_train/131072_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
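[Editor's note: a worked example of the data-plan comments in these deleted configs, taken at the 131072-token sequence length with their micro_num=4 and micro_bsz=2. Figures are per data-parallel rank; the configs do not spell out the global batch arithmetic.]

SEQ_LEN = 131072
micro_bsz = 2   # samples packed into one micro-batch
micro_num = 4   # micro-batches per gradient update

packed_length = micro_bsz * SEQ_LEN          # 262,144 tokens per micro-batch
tokens_per_step = packed_length * micro_num  # 1,048,576 tokens per rank per step
assert tokens_per_step == 2**20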
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/131072_intern_ckpt_True.py b/configs/13B_train/131072_intern_ckpt_True.py deleted file mode 100644 index c23a3c10..00000000 --- a/configs/13B_train/131072_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
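[Editor's note: model.checkpoint accepts True, False, or a proportion in [0, 1], per the comment repeated in these configs. A hypothetical helper showing how a proportion could map onto a set of recomputed layers; the actual selection logic is not part of this patch, and picking the first layers is an assumption for illustration.]

def checkpointed_layers(num_layers: int, checkpoint) -> list:
    """Interpret the model.checkpoint field: True/False, or a float
    proportion of layers whose activations are recomputed in backward."""
    if checkpoint is True:
        ratio = 1.0
    elif checkpoint is False:
        ratio = 0.0
    else:
        ratio = float(checkpoint)
    return list(range(round(num_layers * ratio)))

# e.g. NUM_LAYER = 40, checkpoint = 0.25 -> layers 0..9 are recomputed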
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/131072_megatron_ckpt_False.py b/configs/13B_train/131072_megatron_ckpt_False.py deleted file mode 100644 index 935ff98d..00000000 --- a/configs/13B_train/131072_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
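Every config carries the same `grad_scaler` block. As a rough sketch of the dynamic loss-scaling policy those fields describe (our simplified model, not the trainer's actual bookkeeping): the scale grows by `growth_factor` after `growth_interval` overflow-free steps, backs off by `backoff_factor` once `hysteresis` overflows accumulate, and stays clamped to [min_scale, max_scale].

# Simplified sketch of the policy implied by the grad_scaler fields; the real
# optimizer may differ in detail.
class LossScaler:
    def __init__(self, initial_scale=2**16, min_scale=1, max_scale=2**24,
                 growth_interval=1000, growth_factor=2,
                 backoff_factor=0.5, hysteresis=2):
        self.scale = initial_scale
        self.min_scale, self.max_scale = min_scale, max_scale
        self.growth_interval, self.growth_factor = growth_interval, growth_factor
        self.backoff_factor, self.hysteresis = backoff_factor, hysteresis
        self.good_steps = 0   # consecutive steps without overflow
        self.overflows = 0    # overflows seen since the last backoff

    def update(self, found_overflow):
        if found_overflow:
            self.good_steps = 0
            self.overflows += 1
            if self.overflows >= self.hysteresis:  # back off only after repeated overflows
                self.scale = max(self.scale * self.backoff_factor, self.min_scale)
                self.overflows = 0
        else:
            self.good_steps += 1
            if self.good_steps >= self.growth_interval:  # grow after a clean run
                self.scale = min(self.scale * self.growth_factor, self.max_scale)
                self.good_steps = 0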
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/131072_megatron_ckpt_True.py b/configs/13B_train/131072_megatron_ckpt_True.py deleted file mode 100644 index 441166c2..00000000 --- a/configs/13B_train/131072_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
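The `data` block is likewise shared across the sweep. Following its comment `packed_length = micro_bsz * SEQ_LEN`, the token arithmetic per optimizer step works out as below; the world size is hypothetical and the dp-size formula is our reading of the parallel layout, not something the config states.

# Token arithmetic for the shared data block (illustrative assumptions noted).
SEQ_LEN = 131072    # this group of configs; the 16384 group scales down accordingly
micro_bsz = 2
micro_num = 4

packed_length = micro_bsz * SEQ_LEN               # 262144 tokens per micro-batch
tokens_per_rank_step = micro_num * packed_length  # 1048576 tokens per rank per step

world_size, tensor, pipeline = 32, 8, 1       # hypothetical cluster
dp_size = world_size // (tensor * pipeline)   # 4, assuming dp = world / (tp * pp)
tokens_per_global_step = tokens_per_rank_step * dp_size  # 4194304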
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/131072_none_ckpt_False.py b/configs/13B_train/131072_none_ckpt_False.py deleted file mode 100644 index e43d6044..00000000 --- a/configs/13B_train/131072_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
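The long comment in each `ckpt` block describes a precedence rule worth restating: with `auto_resume=True`, the latest checkpoint under `save_ckpt_folder` wins and `load_ckpt_info` is ignored; initializing from another model or training from scratch both require `auto_resume=False`. A condensed sketch of that decision (the function and helper names are ours, not repo functions):

def latest_checkpoint_in(folder):
    # Stub for illustration; the real implementation scans the folder.
    return None

def resolve_ckpt(auto_resume, save_ckpt_folder, load_ckpt_info):
    if auto_resume:
        latest = latest_checkpoint_in(save_ckpt_folder)
        if latest is not None:
            return latest              # scheduler restarts resume here
    if load_ckpt_info is not None:
        return load_ckpt_info["path"]  # explicit init from another model
    return None                        # train from scratch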
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/131072_none_ckpt_True.py b/configs/13B_train/131072_none_ckpt_True.py deleted file mode 100644 index 0945dbdc..00000000 --- a/configs/13B_train/131072_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
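`model["checkpoint"]` accepts True/False or a float in [0, 1], read as the proportion of transformer layers to activation-checkpoint. A one-line sketch of how that proportion maps onto NUM_LAYER=40; the config only documents the range, so the rounding choice here is our assumption.

# model["checkpoint"] as a proportion of layers to recompute (rounding assumed).
NUM_LAYER = 40

def num_ckpt_layers(checkpoint):
    frac = float(checkpoint)  # True -> 1.0, False -> 0.0, floats pass through
    return int(NUM_LAYER * frac)

assert num_ckpt_layers(True) == 40 and num_ckpt_layers(False) == 0
assert num_ckpt_layers(0.5) == 20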
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/16384_flash-attn_ckpt_False.py b/configs/13B_train/16384_flash-attn_ckpt_False.py deleted file mode 100644 index 393e54d3..00000000 --- a/configs/13B_train/16384_flash-attn_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/16384_flash-attn_ckpt_True.py b/configs/13B_train/16384_flash-attn_ckpt_True.py deleted file mode 100644 index 7f7e7ac6..00000000 --- a/configs/13B_train/16384_flash-attn_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/16384_flash_ckpt_False.py b/configs/13B_train/16384_flash_ckpt_False.py deleted file mode 100644 index cadd215f..00000000 --- a/configs/13B_train/16384_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/16384_flash_ckpt_True.py b/configs/13B_train/16384_flash_ckpt_True.py deleted file mode 100644 index c60ea730..00000000 --- a/configs/13B_train/16384_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
diff --git a/configs/13B_train/16384_flash_ckpt_True.py b/configs/13B_train/16384_flash_ckpt_True.py
deleted file mode 100644
index c60ea730..00000000
--- a/configs/13B_train/16384_flash_ckpt_True.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 16384
-JOB_NAME = "13b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 5120
-NUM_ATTENTION_HEAD = 40
-MLP_RATIO = 8 / 3
-NUM_LAYER = 40
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-            so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-        the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="flash", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
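With CHECKPOINT_EVERY = 50 and oss_snapshot_freq = int(CHECKPOINT_EVERY / 2), snapshot saves land halfway between full checkpoint saves. A small sketch of that cadence (the loop is illustrative only; it is not the trainer's actual save logic):

    CHECKPOINT_EVERY = 50
    oss_snapshot_freq = int(CHECKPOINT_EVERY / 2)  # == 25

    for step in range(1, 101):
        if step % CHECKPOINT_EVERY == 0:
            print(f"step {step}: full checkpoint")  # steps 50, 100
        elif step % oss_snapshot_freq == 0:
            print(f"step {step}: oss snapshot")     # steps 25, 75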
diff --git a/configs/13B_train/16384_intern_ckpt_False.py b/configs/13B_train/16384_intern_ckpt_False.py
deleted file mode 100644
index e5d6fa6b..00000000
--- a/configs/13B_train/16384_intern_ckpt_False.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 16384
-JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False)
-HIDDEN_SIZE = 5120
-NUM_ATTENTION_HEAD = 40
-MLP_RATIO = 8 / 3
-NUM_LAYER = 40
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-            so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-        the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="intern", intern_overlap=True),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
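The data comments fix packed_length = micro_bsz * SEQ_LEN, so each of these 16K configs packs 2 * 16384 = 32768 tokens into a micro-batch, and micro_num=4 micro-batches make up one gradient update. Treating that as a per-data-parallel-rank figure is an assumption:

    SEQ_LEN = 16384
    micro_bsz = 2
    micro_num = 4

    packed_length = micro_bsz * SEQ_LEN          # 32768 tokens per packed micro-batch
    tokens_per_step = micro_num * packed_length  # 131072 tokens per optimizer step (per dp rank, assumed)
    print(packed_length, tokens_per_step)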
diff --git a/configs/13B_train/16384_intern_ckpt_True.py b/configs/13B_train/16384_intern_ckpt_True.py
deleted file mode 100644
index 6ac47ac2..00000000
--- a/configs/13B_train/16384_intern_ckpt_True.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 16384
-JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True)
-HIDDEN_SIZE = 5120
-NUM_ATTENTION_HEAD = 40
-MLP_RATIO = 8 / 3
-NUM_LAYER = 40
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-            so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-        the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="intern", intern_overlap=True),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/13B_train/16384_megatron_ckpt_False.py b/configs/13B_train/16384_megatron_ckpt_False.py
deleted file mode 100644
index 24429ead..00000000
--- a/configs/13B_train/16384_megatron_ckpt_False.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 16384
-JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False)
-HIDDEN_SIZE = 5120
-NUM_ATTENTION_HEAD = 40
-MLP_RATIO = 8 / 3
-NUM_LAYER = 40
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-            so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-        the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="megatron", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
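The grad_scaler block describes a standard dynamic loss scaler: grow by growth_factor after growth_interval overflow-free steps, back off by backoff_factor once hysteresis overflows accumulate, and clamp to [min_scale, max_scale]. A compact sketch of that policy, built only from the documented knobs (not the trainer's actual scaler class):

    class LossScaleSketch:
        """Dynamic loss scaling as described by the grad_scaler comments above."""

        def __init__(self, initial_scale=2**16, min_scale=1, max_scale=2**24,
                     growth_interval=1000, growth_factor=2, backoff_factor=0.5, hysteresis=2):
            self.scale, self.min_scale, self.max_scale = initial_scale, min_scale, max_scale
            self.growth_interval, self.growth_factor = growth_interval, growth_factor
            self.backoff_factor, self.hysteresis = backoff_factor, hysteresis
            self.good_steps = 0
            self.bad_steps = 0

        def update(self, overflow: bool) -> None:
            if overflow:
                self.good_steps, self.bad_steps = 0, self.bad_steps + 1
                if self.bad_steps >= self.hysteresis:  # tolerate a few overflows first
                    self.scale = max(self.scale * self.backoff_factor, self.min_scale)
                    self.bad_steps = 0
            else:
                self.good_steps += 1
                if self.good_steps >= self.growth_interval:
                    self.scale = min(self.scale * self.growth_factor, self.max_scale)
                    self.good_steps = 0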
diff --git a/configs/13B_train/16384_megatron_ckpt_True.py b/configs/13B_train/16384_megatron_ckpt_True.py
deleted file mode 100644
index d79c8207..00000000
--- a/configs/13B_train/16384_megatron_ckpt_True.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 16384
-JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True)
-HIDDEN_SIZE = 5120
-NUM_ATTENTION_HEAD = 40
-MLP_RATIO = 8 / 3
-NUM_LAYER = 40
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-            so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-        the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="megatron", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/13B_train/16384_none_ckpt_False.py b/configs/13B_train/16384_none_ckpt_False.py
deleted file mode 100644
index a30d713a..00000000
--- a/configs/13B_train/16384_none_ckpt_False.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 16384
-JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False)
-HIDDEN_SIZE = 5120
-NUM_ATTENTION_HEAD = 40
-MLP_RATIO = 8 / 3
-NUM_LAYER = 40
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-            so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-        the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="none", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
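model.checkpoint takes True/False or a proportion in [0, 1]; this sweep only toggles the boolean form, but under the proportional form roughly int(num_layers * p) of the 40 layers would be recomputed in backward. A sketch of that reading (the exact rounding rule is an assumption):

    def num_checkpointed_layers(checkpoint, num_layers=40):
        proportion = float(checkpoint)  # True/False behave like 1.0/0.0
        assert 0.0 <= proportion <= 1.0
        return int(num_layers * proportion)

    print(num_checkpointed_layers(True))   # 40
    print(num_checkpointed_layers(False))  # 0
    print(num_checkpointed_layers(0.25))   # 10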
diff --git a/configs/13B_train/16384_none_ckpt_True.py b/configs/13B_train/16384_none_ckpt_True.py
deleted file mode 100644
index 76483257..00000000
--- a/configs/13B_train/16384_none_ckpt_True.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 16384
-JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True)
-HIDDEN_SIZE = 5120
-NUM_ATTENTION_HEAD = 40
-MLP_RATIO = 8 / 3
-NUM_LAYER = 40
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-            so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-        the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="none", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/13B_train/262144_flash-attn_ckpt_False.py b/configs/13B_train/262144_flash-attn_ckpt_False.py
deleted file mode 100644
index fd0be6a7..00000000
--- a/configs/13B_train/262144_flash-attn_ckpt_False.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 262144
-JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False)
-HIDDEN_SIZE = 5120
-NUM_ATTENTION_HEAD = 40
-MLP_RATIO = 8 / 3
-NUM_LAYER = 40
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-            so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-        the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="flash-attn", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
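lr_scheduler resolves warmup_ratio=0.01 against total_steps; with the 20-step benchmark budget used throughout this sweep, that truncates to zero warmup steps, which is worth noticing before reusing these configs for real runs. A quick check, assuming warmup_steps = int(warmup_ratio * total_steps):

    total_steps = 20
    warmup_ratio = 0.01

    warmup_steps = int(warmup_ratio * total_steps)  # assumed resolution rule
    print(warmup_steps)  # 0 -> the 20-step budget leaves no warmup at all;
                         # total_steps=20000 with the same ratio would give 200.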
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/262144_flash_ckpt_True.py b/configs/13B_train/262144_flash_ckpt_True.py deleted file mode 100644 index f990655a..00000000 --- a/configs/13B_train/262144_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/262144_intern_ckpt_False.py b/configs/13B_train/262144_intern_ckpt_False.py deleted file mode 100644 index 7ebcf94f..00000000 --- a/configs/13B_train/262144_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicates the ckpt path, - # 2. the 'content' means which states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the 'ckpt_type' means the type of checkpoint to be loaded; now only the 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only works for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batches contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, which means evaluation is disabled - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with fewer than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimizer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2.
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/262144_intern_ckpt_True.py b/configs/13B_train/262144_intern_ckpt_True.py deleted file mode 100644 index e958ac06..00000000 --- a/configs/13B_train/262144_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/262144_megatron_ckpt_False.py b/configs/13B_train/262144_megatron_ckpt_False.py deleted file mode 100644 index 31e96f78..00000000 --- a/configs/13B_train/262144_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/262144_megatron_ckpt_True.py b/configs/13B_train/262144_megatron_ckpt_True.py deleted file mode 100644 index 2339244b..00000000 --- a/configs/13B_train/262144_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/262144_none_ckpt_False.py b/configs/13B_train/262144_none_ckpt_False.py deleted file mode 100644 index 41d55e91..00000000 --- a/configs/13B_train/262144_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/262144_none_ckpt_True.py b/configs/13B_train/262144_none_ckpt_True.py deleted file mode 100644 index 4f2da605..00000000 --- a/configs/13B_train/262144_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/32768_flash-attn_ckpt_False.py b/configs/13B_train/32768_flash-attn_ckpt_False.py deleted file mode 100644 index 3eb0f493..00000000 --- a/configs/13B_train/32768_flash-attn_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/32768_flash-attn_ckpt_True.py b/configs/13B_train/32768_flash-attn_ckpt_True.py deleted file mode 100644 index 26b06ef3..00000000 --- a/configs/13B_train/32768_flash-attn_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/32768_flash_ckpt_False.py b/configs/13B_train/32768_flash_ckpt_False.py deleted file mode 100644 index da30a4dd..00000000 --- a/configs/13B_train/32768_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/32768_flash_ckpt_True.py b/configs/13B_train/32768_flash_ckpt_True.py deleted file mode 100644 index 20d415a5..00000000 --- a/configs/13B_train/32768_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/32768_intern_ckpt_False.py b/configs/13B_train/32768_intern_ckpt_False.py deleted file mode 100644 index 05ab5285..00000000 --- a/configs/13B_train/32768_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/32768_intern_ckpt_True.py b/configs/13B_train/32768_intern_ckpt_True.py deleted file mode 100644 index 273a812d..00000000 --- a/configs/13B_train/32768_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
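These benchmark configs sweep the tensor-parallel sp mode over "none", "megatron", "flash"/"flash-attn", and "intern", and only the intern configs set intern_overlap=True. A small validation sketch of that convention follows; check_tensor_cfg is hypothetical, and the constraint is inferred from the configs themselves, not from the trainer:

    # Sketch: validate the (sp, intern_overlap) pairs used by these configs.
    VALID_SP_MODES = {"none", "megatron", "flash", "flash-attn", "intern"}

    def check_tensor_cfg(tensor_cfg: dict) -> None:
        sp = tensor_cfg.get("sp", "none")
        if sp not in VALID_SP_MODES:
            raise ValueError(f"unknown sp mode: {sp}")
        # only the 'intern' mode overlaps communication with computation
        if tensor_cfg.get("intern_overlap", False) and sp != "intern":
            raise ValueError("intern_overlap=True requires sp='intern'")

    check_tensor_cfg(dict(size=8, sp="intern", intern_overlap=True))
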
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/32768_megatron_ckpt_False.py b/configs/13B_train/32768_megatron_ckpt_False.py deleted file mode 100644 index c8db542d..00000000 --- a/configs/13B_train/32768_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/32768_megatron_ckpt_True.py b/configs/13B_train/32768_megatron_ckpt_True.py deleted file mode 100644 index 9ff56012..00000000 --- a/configs/13B_train/32768_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
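Each _ckpt_True/_ckpt_False pair of configs differs only in the model dict's checkpoint field, whose comment describes it as the proportion of layers for activation checkpointing (True/False or a value in [0, 1]). A sketch of how that proportion could map to a layer count, using a hypothetical helper:

    # Sketch of the checkpoint proportion -> layer count mapping.
    def num_checkpointed_layers(checkpoint, num_layers: int) -> int:
        if isinstance(checkpoint, bool):
            return num_layers if checkpoint else 0
        assert 0 <= checkpoint <= 1, "checkpoint proportion must be in [0, 1]"
        return int(num_layers * checkpoint)

    print(num_checkpointed_layers(True, 40))   # 40: checkpoint every layer
    print(num_checkpointed_layers(0.25, 40))   # 10: checkpoint a quarter of them
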
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/32768_none_ckpt_False.py b/configs/13B_train/32768_none_ckpt_False.py deleted file mode 100644 index a02e0711..00000000 --- a/configs/13B_train/32768_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/32768_none_ckpt_True.py b/configs/13B_train/32768_none_ckpt_True.py deleted file mode 100644 index b9b17e3c..00000000 --- a/configs/13B_train/32768_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
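The data dicts in these configs note that packed_length = micro_bsz * SEQ_LEN. For the 32K configs, the per-rank token budget per optimizer step works out as:

    # Worked arithmetic from the data dict of the 32768 configs.
    SEQ_LEN, micro_bsz, micro_num = 32768, 2, 4
    packed_length = micro_bsz * SEQ_LEN          # 65536 tokens per micro batch
    tokens_per_step = packed_length * micro_num  # 262144 tokens per rank per step
    print(packed_length, tokens_per_step)
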
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/4096_flash-attn_ckpt_False.py b/configs/13B_train/4096_flash-attn_ckpt_False.py deleted file mode 100644 index 8e4459ea..00000000 --- a/configs/13B_train/4096_flash-attn_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/4096_flash-attn_ckpt_True.py b/configs/13B_train/4096_flash-attn_ckpt_True.py deleted file mode 100644 index a8f5e39b..00000000 --- a/configs/13B_train/4096_flash-attn_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
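All of these configs share the same dynamic loss-scaling settings (initial_scale, growth_interval, growth_factor, backoff_factor, max_scale, min_scale, hysteresis). A minimal sketch of the schedule those fields describe, assuming a simplified overflow counter rather than the trainer's actual implementation:

    # Sketch of the dynamic loss-scale schedule named by grad_scaler.
    class LossScaler:
        def __init__(self, scale=2**16, growth_interval=1000, growth_factor=2,
                     backoff_factor=0.5, max_scale=2**24, min_scale=1, hysteresis=2):
            self.scale, self.steps, self.overflows = scale, 0, 0
            self.gi, self.gf, self.bf = growth_interval, growth_factor, backoff_factor
            self.max, self.min, self.hyst = max_scale, min_scale, hysteresis

        def update(self, overflow: bool) -> None:
            if overflow:
                self.steps = 0
                self.overflows += 1
                if self.overflows >= self.hyst:  # back off after repeated overflows
                    self.scale = max(self.scale * self.bf, self.min)
                    self.overflows = 0
            else:
                self.steps += 1
                if self.steps >= self.gi:        # grow after a clean run of steps
                    self.scale = min(self.scale * self.gf, self.max)
                    self.steps = 0

    scaler = LossScaler()
    for _ in range(1000):
        scaler.update(overflow=False)
    print(scaler.scale)  # 131072 after one clean growth interval
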
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/4096_flash_ckpt_False.py b/configs/13B_train/4096_flash_ckpt_False.py deleted file mode 100644 index 517b46e4..00000000 --- a/configs/13B_train/4096_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/4096_flash_ckpt_True.py b/configs/13B_train/4096_flash_ckpt_True.py deleted file mode 100644 index eacfcdfd..00000000 --- a/configs/13B_train/4096_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
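-    # A note on the data dict below, assuming the packed_length relation quoted there:
-    # packed_length = micro_bsz * SEQ_LEN = 2 * 4096 = 8192 tokens per micro-batch, so one
-    # gradient update consumes micro_num * packed_length = 4 * 8192 = 32768 tokens per data-parallel rank.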
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/4096_intern_ckpt_False.py b/configs/13B_train/4096_intern_ckpt_False.py deleted file mode 100644 index 5ecf2d66..00000000 --- a/configs/13B_train/4096_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="intern", intern_overlap=True),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/13B_train/4096_intern_ckpt_True.py b/configs/13B_train/4096_intern_ckpt_True.py
deleted file mode 100644
index b70acb01..00000000
--- a/configs/13B_train/4096_intern_ckpt_True.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 4096
-JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True)
-HIDDEN_SIZE = 5120
-NUM_ATTENTION_HEAD = 40
-MLP_RATIO = 8 / 3
-NUM_LAYER = 40
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that when `auto_resume` is left at its default value of True, the checkpoint path
-    # specified in `load_ckpt_info` will not be loaded.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
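-    # A note on the grad_scaler dict below, per the comments in that dict: the loss scale starts
-    # at 2**16, doubles (growth_factor=2) after every 1000 overflow-free steps (growth_interval),
-    # halves (backoff_factor=0.5) once 2 overflows accumulate (hysteresis), and stays within
-    # [min_scale=1, max_scale=2**24].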
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro_batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-       sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2.
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/4096_megatron_ckpt_False.py b/configs/13B_train/4096_megatron_ckpt_False.py deleted file mode 100644 index 2e847a64..00000000 --- a/configs/13B_train/4096_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/4096_megatron_ckpt_True.py b/configs/13B_train/4096_megatron_ckpt_True.py deleted file mode 100644 index d8ba2c57..00000000 --- a/configs/13B_train/4096_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
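-    # A note on the lr_scheduler dict below: warmup length works out to
-    # warmup_ratio * total_steps = 0.01 * 20 = 0.2 steps for these short benchmark runs, so
-    # warmup is effectively disabled (assuming the scheduler truncates the fractional step count).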
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/4096_none_ckpt_False.py b/configs/13B_train/4096_none_ckpt_False.py deleted file mode 100644 index f8bbdfc5..00000000 --- a/configs/13B_train/4096_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
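-    # A note on the beta2_scheduler dict below: with c = adam_beta2_c = 0 the schedule presumably
-    # degenerates to a constant beta2 = 0.95; the exact formula lives in the scheduler
-    # implementation, not in this config.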
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/4096_none_ckpt_True.py b/configs/13B_train/4096_none_ckpt_True.py deleted file mode 100644 index d8f8ec7e..00000000 --- a/configs/13B_train/4096_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
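-    # A note on the parallel dict at the end of this file: zero1 size=-1 falls in the size <= 0
-    # branch described in the docstring there, so the zero process group spans the whole
-    # data-parallel group and parameters are partitioned across the full dp range.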
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/65536_flash-attn_ckpt_False.py b/configs/13B_train/65536_flash-attn_ckpt_False.py deleted file mode 100644 index 09367f5a..00000000 --- a/configs/13B_train/65536_flash-attn_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/65536_flash-attn_ckpt_True.py b/configs/13B_train/65536_flash-attn_ckpt_True.py deleted file mode 100644 index dc283a92..00000000 --- a/configs/13B_train/65536_flash-attn_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
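The `grad_scaler` block that appears in each of these configs parameterizes standard dynamic loss scaling: grow the scale by `growth_factor` after `growth_interval` overflow-free steps, back off by `backoff_factor` once `hysteresis` consecutive overflows accumulate, and clamp to `[min_scale, max_scale]`. A minimal sketch of that update rule, assuming the conventional semantics rather than InternLM's exact scaler code:

```python
# Sketch of the standard dynamic loss-scale update that the `grad_scaler`
# fields parameterize. Illustrative only; InternLM's scaler may differ in detail.
class DynamicLossScale:
    def __init__(self, initial_scale=2**16, growth_factor=2, backoff_factor=0.5,
                 growth_interval=1000, hysteresis=2, min_scale=1, max_scale=2**24):
        self.scale = initial_scale
        self.growth_factor = growth_factor
        self.backoff_factor = backoff_factor
        self.growth_interval = growth_interval
        self.hysteresis = hysteresis
        self.min_scale = min_scale
        self.max_scale = max_scale
        self._good_steps = 0
        self._overflows_left = hysteresis

    def update(self, found_overflow: bool) -> None:
        if found_overflow:
            self._good_steps = 0
            self._overflows_left -= 1
            if self._overflows_left <= 0:
                # Back off after `hysteresis` consecutive overflows.
                self.scale = max(self.scale * self.backoff_factor, self.min_scale)
                self._overflows_left = self.hysteresis
        else:
            self._good_steps += 1
            self._overflows_left = self.hysteresis
            if self._good_steps % self.growth_interval == 0:
                # Grow the scale after a long overflow-free stretch.
                self.scale = min(self.scale * self.growth_factor, self.max_scale)
```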
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/65536_flash_ckpt_False.py b/configs/13B_train/65536_flash_ckpt_False.py deleted file mode 100644 index 482d5114..00000000 --- a/configs/13B_train/65536_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
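One detail worth flagging in the `lr_scheduler` block: with `total_steps=20` and `warmup_ratio=0.01`, the warmup span rounds down to zero steps, so warmup only takes effect once `total_steps` is raised to a realistic value. Below is a sketch of the implied schedule shape, assuming `warmup_ratio` is a fraction of `total_steps` and a cosine decay to `eta_min`; both are assumptions, not something this diff confirms.

```python
import math

# Sketch of a warmup + cosine-decay schedule driven by the `lr_scheduler`
# fields above. The exact curve InternLM uses may differ.
def lr_at(step, lr=1e-4, total_steps=20, warmup_ratio=0.01, eta_min=1e-5):
    warmup_steps = int(warmup_ratio * total_steps)  # 0 for these toy settings
    if warmup_steps and step < warmup_steps:
        # Linear warmup from 0 to the base learning rate.
        return lr * (step + 1) / warmup_steps
    # Cosine decay from the base learning rate down to eta_min.
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return eta_min + 0.5 * (lr - eta_min) * (1 + math.cos(math.pi * progress))
```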
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/65536_flash_ckpt_True.py b/configs/13B_train/65536_flash_ckpt_True.py deleted file mode 100644 index 66051f83..00000000 --- a/configs/13B_train/65536_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
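The data-geometry comments in these configs (`packed_length = micro_bsz * SEQ_LEN`) fix the per-step token budget directly. A worked example with the values from the 65536-length configs; the dp world size of 4 is an assumed illustration, not something pinned down by the diff:

```python
# Worked example of the data geometry described in the config comments.
SEQ_LEN = 65536
micro_bsz = 2
micro_num = 4
dp_world_size = 4  # assumed value for illustration

packed_length = micro_bsz * SEQ_LEN             # 131072 tokens per micro-batch
tokens_per_rank = micro_num * packed_length     # 524288 tokens per dp rank per step
tokens_per_step = tokens_per_rank * dp_world_size
print(tokens_per_step)                          # 2097152 tokens per global step
```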
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/65536_intern_ckpt_False.py b/configs/13B_train/65536_intern_ckpt_False.py deleted file mode 100644 index f829652a..00000000 --- a/configs/13B_train/65536_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
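`CHECKPOINT_EVERY` and `oss_snapshot_freq = int(CHECKPOINT_EVERY / 2)` interleave full checkpoints with lighter snapshots. A hypothetical driver sketch of that cadence; the action names are invented for illustration and do not correspond to InternLM functions:

```python
# Sketch of how CHECKPOINT_EVERY and oss_snapshot_freq interleave full
# checkpoints with snapshots. Hypothetical loop, not InternLM's trainer code.
CHECKPOINT_EVERY = 50
oss_snapshot_freq = int(CHECKPOINT_EVERY / 2)  # 25

def ckpt_actions(step: int) -> list:
    actions = []
    if step % CHECKPOINT_EVERY == 0:
        actions.append("save_full_checkpoint")   # durable ckpt every 50 steps
    elif step % oss_snapshot_freq == 0:
        actions.append("save_snapshot")          # cheaper snapshot in between
    return actions

assert ckpt_actions(25) == ["save_snapshot"]
assert ckpt_actions(50) == ["save_full_checkpoint"]
```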
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/65536_intern_ckpt_True.py b/configs/13B_train/65536_intern_ckpt_True.py deleted file mode 100644 index 4e94d0e3..00000000 --- a/configs/13B_train/65536_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
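Across this benchmark matrix the `tensor` dict cycles `sp` through none/flash/flash-attn/megatron/intern, and only the `intern` variants enable `intern_overlap`. A small sanity-check sketch of that convention; the constraint set is inferred from the configs themselves and is an assumption, not an official validator:

```python
# Sanity-check sketch for the `tensor` parallel settings used across these
# benchmark configs. The rule that intern_overlap only pairs with sp="intern"
# is inferred from the configs, not from InternLM documentation.
VALID_SP_MODES = {"none", "flash", "flash-attn", "megatron", "intern"}

def check_tensor_cfg(tensor: dict) -> None:
    assert tensor["sp"] in VALID_SP_MODES, f"unknown sp mode: {tensor['sp']}"
    if tensor.get("intern_overlap", False):
        assert tensor["sp"] == "intern", "intern_overlap only pairs with sp='intern'"

check_tensor_cfg(dict(size=8, sp="intern", intern_overlap=True))
check_tensor_cfg(dict(size=8, sp="megatron", intern_overlap=False))
```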
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/65536_megatron_ckpt_False.py b/configs/13B_train/65536_megatron_ckpt_False.py deleted file mode 100644 index a9293334..00000000 --- a/configs/13B_train/65536_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/65536_megatron_ckpt_True.py b/configs/13B_train/65536_megatron_ckpt_True.py deleted file mode 100644 index 845e32bc..00000000 --- a/configs/13B_train/65536_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/65536_none_ckpt_False.py b/configs/13B_train/65536_none_ckpt_False.py deleted file mode 100644 index 52ce3c52..00000000 --- a/configs/13B_train/65536_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/65536_none_ckpt_True.py b/configs/13B_train/65536_none_ckpt_True.py deleted file mode 100644 index de5532e1..00000000 --- a/configs/13B_train/65536_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/8192_flash-attn_ckpt_False.py b/configs/13B_train/8192_flash-attn_ckpt_False.py deleted file mode 100644 index 3324c290..00000000 --- a/configs/13B_train/8192_flash-attn_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
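The grad_scaler block in these configs describes a conventional dynamic loss scaler: grow the scale after growth_interval clean steps, back off after hysteresis overflows, and clamp to [min_scale, max_scale]. A self-contained sketch under that reading; only the field names and defaults come from the config, the exact update policy is an assumption:

    class LossScaler:
        def __init__(self, initial_scale=2**16, growth_interval=1000, growth_factor=2,
                     backoff_factor=0.5, hysteresis=2, min_scale=1, max_scale=2**24):
            self.scale = initial_scale
            self.growth_interval, self.growth_factor = growth_interval, growth_factor
            self.backoff_factor, self.hysteresis = backoff_factor, hysteresis
            self.min_scale, self.max_scale = min_scale, max_scale
            self._good_steps = 0
            self._overflows = 0

        def update(self, found_overflow: bool) -> None:
            if found_overflow:
                self._good_steps = 0
                self._overflows += 1
                if self._overflows >= self.hysteresis:   # back off after N overflows
                    self.scale = max(self.scale * self.backoff_factor, self.min_scale)
                    self._overflows = 0
            else:
                self._good_steps += 1
                if self._good_steps >= self.growth_interval:  # grow after a clean run
                    self.scale = min(self.scale * self.growth_factor, self.max_scale)
                    self._good_steps = 0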
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/8192_flash-attn_ckpt_True.py b/configs/13B_train/8192_flash-attn_ckpt_True.py deleted file mode 100644 index 317e0f32..00000000 --- a/configs/13B_train/8192_flash-attn_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
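The data dict documents packed_length = micro_bsz * SEQ_LEN, with micro_num micro-batches per gradient update. A back-of-the-envelope throughput check built from those fields; dp_size is an assumption here, since the data-parallel degree is not part of this dict:

    seq_len, micro_bsz, micro_num, dp_size = 8192, 2, 4, 8
    packed_length = micro_bsz * seq_len               # as documented in the config
    tokens_per_step = packed_length * micro_num * dp_size
    print(tokens_per_step)                            # 524288 tokens per gradient update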
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/8192_flash_ckpt_False.py b/configs/13B_train/8192_flash_ckpt_False.py deleted file mode 100644 index d645dc1b..00000000 --- a/configs/13B_train/8192_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
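The lr_scheduler fields (init_steps, warmup_ratio, eta_min) suggest a linear warmup followed by a decay toward eta_min. A hedged sketch of that shape; the cosine form is an assumption, only the field names and values come from the config above:

    import math

    def lr_at(step, lr=1e-4, total_steps=20, init_steps=0,
              warmup_ratio=0.01, eta_min=1e-5):
        warmup = init_steps + max(1, int(warmup_ratio * total_steps))
        if step < warmup:
            return lr * (step + 1) / warmup           # linear warmup
        t = (step - warmup) / max(1, total_steps - warmup)
        return eta_min + 0.5 * (lr - eta_min) * (1 + math.cos(math.pi * t))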
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/8192_flash_ckpt_True.py b/configs/13B_train/8192_flash_ckpt_True.py deleted file mode 100644 index 425859c0..00000000 --- a/configs/13B_train/8192_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/8192_intern_ckpt_False.py b/configs/13B_train/8192_intern_ckpt_False.py deleted file mode 100644 index 0b4fb8a2..00000000 --- a/configs/13B_train/8192_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/8192_intern_ckpt_True.py b/configs/13B_train/8192_intern_ckpt_True.py deleted file mode 100644 index b42cb769..00000000 --- a/configs/13B_train/8192_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
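Across this deleted grid, tensor["sp"] takes the values "none", "flash", "flash-attn", "megatron", and "intern", and intern_overlap is enabled only together with sp="intern". A sketch of the consistency check those files imply; the function is hypothetical, the allowed values are just those that appear in this hunk:

    SP_MODES = {"none", "flash", "flash-attn", "megatron", "intern"}

    def check_tensor_cfg(tensor: dict) -> None:
        assert tensor["sp"] in SP_MODES, f"unknown sp mode: {tensor['sp']}"
        if tensor.get("intern_overlap", False):
            assert tensor["sp"] == "intern", "intern_overlap requires sp='intern'"

    check_tensor_cfg(dict(size=8, sp="intern", intern_overlap=True))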
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/8192_megatron_ckpt_False.py b/configs/13B_train/8192_megatron_ckpt_False.py deleted file mode 100644 index e2191937..00000000 --- a/configs/13B_train/8192_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/8192_megatron_ckpt_True.py b/configs/13B_train/8192_megatron_ckpt_True.py deleted file mode 100644 index 5123c412..00000000 --- a/configs/13B_train/8192_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/8192_none_ckpt_False.py b/configs/13B_train/8192_none_ckpt_False.py deleted file mode 100644 index c9d9c050..00000000 --- a/configs/13B_train/8192_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
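The model comment in these configs allows checkpoint to be True, False, or a float in [0, 1] read as the proportion of layers under activation checkpointing. A minimal sketch of that interpretation (the helper name is illustrative):

    def num_ckpt_layers(checkpoint, num_layers=40):
        if isinstance(checkpoint, bool):              # bool must be tested before float
            return num_layers if checkpoint else 0
        assert 0 <= checkpoint <= 1
        return int(num_layers * checkpoint)

    assert num_ckpt_layers(True) == 40 and num_ckpt_layers(0.5) == 20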
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/8192_none_ckpt_True.py b/configs/13B_train/8192_none_ckpt_True.py deleted file mode 100644 index 182ec21f..00000000 --- a/configs/13B_train/8192_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_template.py b/configs/30B_template.py index 7a32015e..4ac99bf0 100644 --- a/configs/30B_template.py +++ b/configs/30B_template.py @@ -2,7 +2,7 @@ DO_ALERT = False SEQ_LEN = {seq_len} -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({checkpoint}) +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({intern_overlap}) + "_" + str({checkpoint}) HIDDEN_SIZE = 6144 NUM_ATTENTION_HEAD = 48 MLP_RATIO = 8 / 3 @@ -50,9 +50,9 @@ data = dict( seq_len=SEQ_LEN, # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, + micro_num=1, # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, + micro_bsz=1, # defaults to the value of micro_num valid_micro_num=4, # defaults to 0, means disable evaluate @@ -91,7 +91,7 @@ hybrid_zero_optimizer = dict( # Enable low_level_optimzer overlap_communication overlap_sync_grad=True, - overlap_sync_param=True, + overlap_sync_param=False, # bucket size for nccl communication params reduce_bucket_size=512 * 1024 * 1024, # grad clipping diff --git a/configs/30B_train/131072_flash_ckpt_False.py b/configs/30B_train/131072_flash_ckpt_False.py deleted file mode 100644 index 3af48f3e..00000000 --- a/configs/30B_train/131072_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. 
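[Editorial note on the 30B_template.py hunk above: the job name now encodes the {sp}, {intern_overlap}, and {checkpoint} placeholders, which matches the naming scheme of the per-experiment files deleted in this patch (<seq_len>_<sp>_ckpt_<checkpoint>.py). The generator script itself is not part of the patch; the sketch below is a plausible reconstruction assuming plain placeholder substitution into the template. The sequence lengths and sp modes are taken from the deleted paths; everything else is hypothetical.]

# Hypothetical reconstruction: the script that produced configs/30B_train/*.py
# is not shipped in this patch, so the names and logic here are assumptions
# inferred from 30B_template.py and the deleted file names.
import itertools
from pathlib import Path

template = Path("configs/30B_template.py").read_text()
seq_lens = [16384, 131072]                        # lengths seen in the deleted paths
sp_modes = ["none", "flash", "megatron", "intern"]

for seq_len, sp, checkpoint in itertools.product(seq_lens, sp_modes, (False, True)):
    intern_overlap = sp == "intern"               # only sp="intern" overlaps comm. in these configs
    cfg = (
        template
        .replace("{seq_len}", str(seq_len))
        .replace("{sp}", f'"{sp}"')               # substituted into str({sp}) in the template
        .replace("{intern_overlap}", str(intern_overlap))
        .replace("{checkpoint}", str(checkpoint))
    )
    Path(f"configs/30B_train/{seq_len}_{sp}_ckpt_{checkpoint}.py").write_text(cfg)

[Plain str.replace is used instead of str.format because the template's commented boto3 lines contain literal braces like {BOTO3_IP} that format() would try to fill.]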
- # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. 
- * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/131072_flash_ckpt_True.py b/configs/30B_train/131072_flash_ckpt_True.py deleted file mode 100644 index 4bd249bc..00000000 --- a/configs/30B_train/131072_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. 
- auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. 
size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/131072_intern_ckpt_False.py b/configs/30B_train/131072_intern_ckpt_False.py deleted file mode 100644 index 77b176d2..00000000 --- a/configs/30B_train/131072_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
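[Editorial note: the docstring above still documents a `mode` field limited to 'origin_tp'/'fstp', while the `parallel` dicts in these files actually use `tensor=dict(size=..., sp=..., intern_overlap=...)` with sp one of "none", "flash", "megatron", "intern". A small sanity check illustrating the constraints as stated in these configs follows; `check_parallel_cfg` is our illustrative helper, not project code.]

# Illustrative sketch only: validates the parallel settings as they appear in
# the sweep configs deleted here. Helper name and messages are ours, not InternLM's.
VALID_SP_MODES = {"none", "flash", "megatron", "intern"}

def check_parallel_cfg(parallel: dict) -> None:
    tensor = parallel["tensor"]
    sp = tensor["sp"]
    if sp not in VALID_SP_MODES:
        raise ValueError(f"unknown sp mode: {sp!r}")
    # In these sweep configs, overlap is only enabled together with sp="intern".
    if tensor.get("intern_overlap", False) and sp != "intern":
        raise ValueError("intern_overlap=True requires sp='intern'")
    if parallel["pipeline"]["size"] < 1:
        raise ValueError("pipeline size must be >= 1")

check_parallel_cfg(dict(
    zero1=dict(size=-1, fsdp=False),
    tensor=dict(size=8, sp="flash", intern_overlap=False),
    pipeline=dict(size=1, interleaved_overlap=True),
))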
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/131072_intern_ckpt_True.py b/configs/30B_train/131072_intern_ckpt_True.py deleted file mode 100644 index 38a1db3b..00000000 --- a/configs/30B_train/131072_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
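[Editorial note: the grad_scaler fields repeated in these configs (initial_scale, growth_interval, growth_factor, backoff_factor, max_scale, min_scale, hysteresis) describe a standard dynamic loss-scaling loop. The toy class below is a sketch of those documented semantics only; it is not InternLM's scaler implementation.]

# Toy dynamic loss scaler mirroring the grad_scaler comments above; illustrative only.
class LossScaler:
    def __init__(self, initial_scale=2**16, growth_interval=1000, growth_factor=2.0,
                 backoff_factor=0.5, max_scale=2**24, min_scale=1.0, hysteresis=2):
        self.scale = float(initial_scale)
        self.growth_interval = growth_interval
        self.growth_factor = growth_factor
        self.backoff_factor = backoff_factor
        self.max_scale = max_scale
        self.min_scale = min_scale
        self.hysteresis = hysteresis
        self._good_steps = 0   # consecutive steps without overflow
        self._overflows = 0    # overflows seen since the last backoff

    def update(self, found_overflow: bool) -> float:
        if found_overflow:
            self._good_steps = 0
            self._overflows += 1
            if self._overflows >= self.hysteresis:      # back off after `hysteresis` overflows
                self.scale = max(self.scale * self.backoff_factor, self.min_scale)
                self._overflows = 0
        else:
            self._good_steps += 1
            if self._good_steps >= self.growth_interval:  # grow after a clean interval
                self.scale = min(self.scale * self.growth_factor, self.max_scale)
                self._good_steps = 0
        return self.scale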
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/131072_megatron_ckpt_False.py b/configs/30B_train/131072_megatron_ckpt_False.py deleted file mode 100644 index 49879303..00000000 --- a/configs/30B_train/131072_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/131072_megatron_ckpt_True.py b/configs/30B_train/131072_megatron_ckpt_True.py deleted file mode 100644 index d911d381..00000000 --- a/configs/30B_train/131072_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/131072_none_ckpt_False.py b/configs/30B_train/131072_none_ckpt_False.py deleted file mode 100644 index 78b3c9a8..00000000 --- a/configs/30B_train/131072_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/131072_none_ckpt_True.py b/configs/30B_train/131072_none_ckpt_True.py deleted file mode 100644 index 941279e7..00000000 --- a/configs/30B_train/131072_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-       the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-       defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="none", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/30B_train/16384_flash_ckpt_False.py b/configs/30B_train/16384_flash_ckpt_False.py
deleted file mode 100644
index 779a10bc..00000000
--- a/configs/30B_train/16384_flash_ckpt_False.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 16384
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; now only the 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
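# Hedged sketch of the zero1 size rule from the docstring above: size <= 0 falls
# back to the data-parallel world size, size == 1 disables ZeRO sharding, and
# 1 < size <= dp world size shards within a subset of dp ranks. The divisibility
# check is an assumption, not something the docstring states.
def resolve_zero1_size(size: int, dp_world_size: int) -> int:
    if size <= 0:
        return dp_world_size
    assert size <= dp_world_size, "zero1 size cannot exceed dp world size"
    assert dp_world_size % size == 0, "assumed: zero1 size divides dp world size"
    return size

assert resolve_zero1_size(-1, 8) == 8  # zero1=dict(size=-1) on 8 dp ranks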
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-       the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-       defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="flash", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/30B_train/16384_flash_ckpt_True.py b/configs/30B_train/16384_flash_ckpt_True.py
deleted file mode 100644
index 0498e2c4..00000000
--- a/configs/30B_train/16384_flash_ckpt_True.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 16384
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; now only the 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
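# This deleted sweep covers four sequence-parallel variants; a hypothetical
# validator whose mode list is inferred purely from the sp= values that appear
# in these config files ("none", "flash", "intern", "megatron"):
VALID_SP_MODES = ("none", "flash", "intern", "megatron")

def check_tensor_cfg(tensor_cfg: dict) -> None:
    assert tensor_cfg["sp"] in VALID_SP_MODES, f"unknown sp mode: {tensor_cfg['sp']}"
    if tensor_cfg.get("intern_overlap", False):
        # in every config of this sweep, intern_overlap=True pairs with sp="intern"
        assert tensor_cfg["sp"] == "intern"

check_tensor_cfg(dict(size=8, sp="flash", intern_overlap=False))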
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-       the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-       defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="flash", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/30B_train/16384_intern_ckpt_False.py b/configs/30B_train/16384_intern_ckpt_False.py
deleted file mode 100644
index 309a33f0..00000000
--- a/configs/30B_train/16384_intern_ckpt_False.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 16384
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; now only the 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
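# The model.checkpoint field in these configs takes True/False or a float in
# [0, 1]; a sketch of how such a proportion might map to a number of
# checkpointed layers (the exact rounding used by InternLM is an assumption):
def num_ckpt_layers(checkpoint, num_layers: int = 60) -> int:
    proportion = float(checkpoint)  # True -> 1.0, False -> 0.0, floats pass through
    assert 0.0 <= proportion <= 1.0
    return int(num_layers * proportion)

assert num_ckpt_layers(True) == 60 and num_ckpt_layers(False) == 0
assert num_ckpt_layers(0.5) == 30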
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-       the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-       defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="intern", intern_overlap=True),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/30B_train/16384_intern_ckpt_True.py b/configs/30B_train/16384_intern_ckpt_True.py
deleted file mode 100644
index 23c977a5..00000000
--- a/configs/30B_train/16384_intern_ckpt_True.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 16384
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; now only the 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
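# Following the data-dict comments in these configs (packed_length = micro_bsz *
# SEQ_LEN, with micro_num micro-batches per gradient update), the per-rank token
# budget of the 16K configs in this sweep works out as follows:
SEQ_LEN, micro_bsz, micro_num = 16384, 2, 4
packed_length = micro_bsz * SEQ_LEN          # 32768 tokens per micro-batch
tokens_per_step = micro_num * packed_length  # 131072 tokens per dp rank per step
assert (packed_length, tokens_per_step) == (32768, 131072)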
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-       the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-       defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="intern", intern_overlap=True),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/30B_train/16384_megatron_ckpt_False.py b/configs/30B_train/16384_megatron_ckpt_False.py
deleted file mode 100644
index 8576aa76..00000000
--- a/configs/30B_train/16384_megatron_ckpt_False.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 16384
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; now only the 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
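# A toy rendering of the dynamic loss-scale policy configured above, assuming
# the usual scheme: grow by growth_factor after growth_interval clean steps,
# back off by backoff_factor once `hysteresis` overflows accumulate, clamped to
# [min_scale, max_scale]. The real optimizer's bookkeeping may differ.
def next_scale(scale: int, clean_steps: int, overflows: int):
    if overflows >= 2:                              # hysteresis=2
        return max(int(scale * 0.5), 1), 0, 0       # backoff_factor=0.5, min_scale=1
    if clean_steps >= 1000:                         # growth_interval=1000
        return min(scale * 2, 2**24), 0, overflows  # growth_factor=2, max_scale=2**24
    return scale, clean_steps, overflows

assert next_scale(2**16, 0, 2)[0] == 2**15
assert next_scale(2**16, 1000, 0)[0] == 2**17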
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-       the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-       defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="megatron", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/30B_train/16384_megatron_ckpt_True.py b/configs/30B_train/16384_megatron_ckpt_True.py
deleted file mode 100644
index 460aba3b..00000000
--- a/configs/30B_train/16384_megatron_ckpt_True.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 16384
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; now only the 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
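# Sketch of the warmup arithmetic implied by the lr_scheduler dict in these
# configs: with total_steps=20 and warmup_ratio=0.01, warmup covers
# int(20 * 0.01) = 0 steps in this smoke-test setup, so decay toward eta_min
# starts immediately (scheduler internals are assumed; only the ratio math
# comes from the config).
total_steps, warmup_ratio = 20, 0.01
warmup_steps = int(total_steps * warmup_ratio)
assert warmup_steps == 0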
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-       the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-       defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="megatron", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/30B_train/16384_none_ckpt_False.py b/configs/30B_train/16384_none_ckpt_False.py
deleted file mode 100644
index 4ca50666..00000000
--- a/configs/30B_train/16384_none_ckpt_False.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 16384
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; now only the 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
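# The auto_resume comments above describe a precedence rule; a hypothetical
# restatement of it (function and argument names are illustrative only):
def pick_ckpt(auto_resume: bool, latest_in_save_folder, load_ckpt_info):
    if auto_resume:
        return latest_in_save_folder      # resume after an automatic restart
    if load_ckpt_info is not None:
        return load_ckpt_info["path"]     # warm-start from another model
    return None                           # train from scratch

assert pick_ckpt(False, None, dict(path="local:llm_ckpts/xxxx")) == "local:llm_ckpts/xxxx"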
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-       the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-       defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="none", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/30B_train/16384_none_ckpt_True.py b/configs/30B_train/16384_none_ckpt_True.py
deleted file mode 100644
index c7987e0d..00000000
--- a/configs/30B_train/16384_none_ckpt_True.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 16384
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; now only the 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
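# Per the data-dict comments, valid_micro_num falls back to micro_num and
# valid_every=0 disables evaluation; a small sketch of that defaulting logic:
def eval_plan(data_cfg: dict):
    valid_every = data_cfg.get("valid_every", 0)
    if valid_every == 0:
        return None                       # evaluation disabled
    return valid_every, data_cfg.get("valid_micro_num", data_cfg["micro_num"])

assert eval_plan(dict(micro_num=4, valid_every=50)) == (50, 4)
assert eval_plan(dict(micro_num=4)) is None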
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-       the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-       defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="none", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/30B_train/262144_flash_ckpt_False.py b/configs/30B_train/262144_flash_ckpt_False.py
deleted file mode 100644
index 10d71d9c..00000000
--- a/configs/30B_train/262144_flash_ckpt_False.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 262144
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; now only the 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
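# The data dict notes that datasets with fewer than min_length=50 rows are
# discarded; a hedged sketch of such a filter (the real dataloader's behavior
# may differ in detail):
def keep_dataset(num_rows: int, min_length: int = 50) -> bool:
    return num_rows >= min_length

assert keep_dataset(50) and not keep_dataset(49)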
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/262144_flash_ckpt_True.py b/configs/30B_train/262144_flash_ckpt_True.py deleted file mode 100644 index a1990dbb..00000000 --- a/configs/30B_train/262144_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
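# The configs in this hunk vary `tensor.sp` across "none", "flash", "megatron", and
# "intern", and the docstring above notes that the fstp-style mode requires sequence
# parallelism to be enabled. A hedged validation sketch; the helper name, the exact
# rule set, and the intern_overlap constraint are inferred from the visible configs,
# not taken from InternLM's code:
VALID_SP_MODES = ("none", "megatron", "flash", "intern")

def check_parallel_config(tensor_cfg: dict, sequence_parallel: bool = True) -> None:
    sp = tensor_cfg.get("sp", "none")
    if sp not in VALID_SP_MODES:
        raise ValueError(f"unknown sp mode: {sp!r}")
    if sp != "none" and not sequence_parallel:
        raise ValueError(f"sp mode {sp!r} requires sequence parallelism to be enabled")
    if tensor_cfg.get("intern_overlap", False) and sp != "intern":
        raise ValueError("intern_overlap=True is only meaningful with sp='intern'")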
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/262144_intern_ckpt_False.py b/configs/30B_train/262144_intern_ckpt_False.py deleted file mode 100644 index f8ec6a2f..00000000 --- a/configs/30B_train/262144_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
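# The `grad_scaler` comments above describe standard dynamic loss scaling: grow the
# scale after `growth_interval` overflow-free steps, shrink it by `backoff_factor`
# once `hysteresis` overflows accumulate, clamped to [min_scale, max_scale]. A
# minimal sketch of one update step; field names follow the config, but the class
# itself is illustrative rather than InternLM's actual scaler:
class DynamicLossScale:
    def __init__(self, scale=2**16, growth_factor=2, backoff_factor=0.5,
                 growth_interval=1000, hysteresis=2, min_scale=1, max_scale=2**24):
        self.scale = scale
        self.growth_factor, self.backoff_factor = growth_factor, backoff_factor
        self.growth_interval, self.hysteresis = growth_interval, hysteresis
        self.min_scale, self.max_scale = min_scale, max_scale
        self._good_steps, self._overflows = 0, 0

    def update(self, found_overflow: bool) -> None:
        if found_overflow:
            self._good_steps = 0
            self._overflows += 1
            if self._overflows >= self.hysteresis:
                self.scale = max(self.scale * self.backoff_factor, self.min_scale)
                self._overflows = 0
        else:
            self._good_steps += 1
            if self._good_steps % self.growth_interval == 0:
                self.scale = min(self.scale * self.growth_factor, self.max_scale)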
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/262144_intern_ckpt_True.py b/configs/30B_train/262144_intern_ckpt_True.py deleted file mode 100644 index c5afa46b..00000000 --- a/configs/30B_train/262144_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
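# `model.checkpoint` above is documented as a proportion: True/False or a float in
# [0, 1] selecting what fraction of layers use activation checkpointing. A sketch of
# how such a value could map onto a layer count (an assumed mapping, not the verbatim
# InternLM logic):
def num_checkpointed_layers(checkpoint, num_layers: int) -> int:
    proportion = float(checkpoint)  # True -> 1.0, False -> 0.0, floats pass through
    assert 0.0 <= proportion <= 1.0
    return int(num_layers * proportion)

# e.g. with NUM_LAYER = 60: checkpoint=True -> 60 layers, checkpoint=0.5 -> 30.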
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/262144_megatron_ckpt_False.py b/configs/30B_train/262144_megatron_ckpt_False.py deleted file mode 100644 index 412da179..00000000 --- a/configs/30B_train/262144_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
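# From the `data` comments above: packed_length = micro_bsz * SEQ_LEN, and micro_num
# micro-batches are accumulated per gradient update. Back-of-the-envelope token
# counts per optimizer step on one data-parallel rank, using this config's values:
SEQ_LEN, micro_bsz, micro_num = 262144, 2, 4
packed_length = micro_bsz * SEQ_LEN           # 524288 tokens per micro-batch
tokens_per_step = micro_num * packed_length   # 2097152 tokens per rank per step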
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/262144_megatron_ckpt_True.py b/configs/30B_train/262144_megatron_ckpt_True.py deleted file mode 100644 index 79affb19..00000000 --- a/configs/30B_train/262144_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
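# The `ckpt` comments above describe the resume policy: with auto_resume=True the
# trainer loads the latest checkpoint under save_ckpt_folder and ignores
# load_ckpt_info; initializing from another model requires auto_resume=False, and
# training from scratch additionally requires load_ckpt_info=None. A sketch of that
# decision; the function and its arguments are hypothetical:
def select_ckpt_to_load(auto_resume: bool, latest_saved_ckpt, load_ckpt_info):
    if auto_resume:
        # Resume from the newest checkpoint under save_ckpt_folder, if any;
        # load_ckpt_info is ignored in this branch.
        return latest_saved_ckpt
    # auto_resume=False: honor load_ckpt_info (None means train from scratch).
    return load_ckpt_info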
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/262144_none_ckpt_False.py b/configs/30B_train/262144_none_ckpt_False.py deleted file mode 100644 index e6fbe1eb..00000000 --- a/configs/30B_train/262144_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
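# The `lr_scheduler` block above combines a warmup phase (warmup_ratio * total_steps
# steps) with a decay toward eta_min. A common reading is linear warmup followed by
# cosine annealing; this sketch assumes that shape and is an interpretation, not
# InternLM's exact scheduler:
import math

def lr_at(step, total_steps=20, warmup_ratio=0.01, base_lr=1e-4, eta_min=1e-5):
    warmup_steps = max(1, int(total_steps * warmup_ratio))
    if step < warmup_steps:
        return base_lr * (step + 1) / warmup_steps  # linear warmup
    t = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return eta_min + 0.5 * (base_lr - eta_min) * (1 + math.cos(math.pi * t))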
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/262144_none_ckpt_True.py b/configs/30B_train/262144_none_ckpt_True.py deleted file mode 100644 index d507c30b..00000000 --- a/configs/30B_train/262144_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
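# `hybrid_zero_optimizer.reduce_bucket_size` above (512 MiB) bounds how many gradient
# bytes are fused into a single NCCL reduction. A generic bucketing sketch under that
# reading; this is not the actual optimizer code, and `grads` is assumed to be a list
# of tensors exposing numel()/element_size() (e.g. torch.Tensor):
def bucket_grads(grads, bucket_size_bytes=512 * 1024 * 1024):
    buckets, current, current_bytes = [], [], 0
    for g in grads:
        nbytes = g.numel() * g.element_size()
        if current and current_bytes + nbytes > bucket_size_bytes:
            buckets.append(current)  # bucket is full: flush it
            current, current_bytes = [], 0
        current.append(g)
        current_bytes += nbytes
    if current:
        buckets.append(current)
    # Each bucket would then be flattened and reduced in one collective call.
    return buckets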
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/32768_flash_ckpt_False.py b/configs/30B_train/32768_flash_ckpt_False.py deleted file mode 100644 index 6bac5b31..00000000 --- a/configs/30B_train/32768_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
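# The deleted files in configs/30B_train/ follow the naming pattern
# {seq_len}_{sp}_ckpt_{True|False}.py, i.e. a benchmark grid over sequence length,
# sequence-parallel mode, and activation checkpointing. A sketch that enumerates the
# grid; only the two sequence lengths visible in this hunk are listed:
from itertools import product

seq_lens = (32768, 262144)
sp_modes = ("none", "flash", "megatron", "intern")
for seq_len, sp, ckpt in product(seq_lens, sp_modes, (False, True)):
    print(f"configs/30B_train/{seq_len}_{sp}_ckpt_{ckpt}.py")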
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/32768_flash_ckpt_True.py b/configs/30B_train/32768_flash_ckpt_True.py deleted file mode 100644 index f21c9983..00000000 --- a/configs/30B_train/32768_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
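# `checkpoint_every` and `oss_snapshot_freq = int(CHECKPOINT_EVERY / 2)` above
# suggest lightweight snapshots interleaved between full checkpoint saves. A cadence
# sketch under that assumption (the precedence of full saves over snapshots is a
# guess, not confirmed by the source):
CHECKPOINT_EVERY = 50
OSS_SNAPSHOT_FREQ = int(CHECKPOINT_EVERY / 2)  # 25

def save_action(step: int):
    if step > 0 and step % CHECKPOINT_EVERY == 0:
        return "full_checkpoint"     # full save wins when both cadences coincide
    if step > 0 and step % OSS_SNAPSHOT_FREQ == 0:
        return "snapshot"
    return None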
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/32768_intern_ckpt_False.py b/configs/30B_train/32768_intern_ckpt_False.py deleted file mode 100644 index 79728d64..00000000 --- a/configs/30B_train/32768_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/32768_intern_ckpt_True.py b/configs/30B_train/32768_intern_ckpt_True.py deleted file mode 100644 index 6dc24c30..00000000 --- a/configs/30B_train/32768_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/32768_megatron_ckpt_False.py b/configs/30B_train/32768_megatron_ckpt_False.py deleted file mode 100644 index 37fd0986..00000000 --- a/configs/30B_train/32768_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/32768_megatron_ckpt_True.py b/configs/30B_train/32768_megatron_ckpt_True.py deleted file mode 100644 index 986b27dd..00000000 --- a/configs/30B_train/32768_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/32768_none_ckpt_False.py b/configs/30B_train/32768_none_ckpt_False.py deleted file mode 100644 index 9c6ca879..00000000 --- a/configs/30B_train/32768_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/32768_none_ckpt_True.py b/configs/30B_train/32768_none_ckpt_True.py deleted file mode 100644 index d4ab7f2d..00000000 --- a/configs/30B_train/32768_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/4096_flash_ckpt_False.py b/configs/30B_train/4096_flash_ckpt_False.py deleted file mode 100644 index 3dd8be56..00000000 --- a/configs/30B_train/4096_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/4096_flash_ckpt_True.py b/configs/30B_train/4096_flash_ckpt_True.py deleted file mode 100644 index 73150acf..00000000 --- a/configs/30B_train/4096_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/4096_intern_ckpt_False.py b/configs/30B_train/4096_intern_ckpt_False.py deleted file mode 100644 index cff6c5b6..00000000 --- a/configs/30B_train/4096_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
diff --git a/configs/30B_train/4096_intern_ckpt_False.py b/configs/30B_train/4096_intern_ckpt_False.py
deleted file mode 100644
index cff6c5b6..00000000
--- a/configs/30B_train/4096_intern_ckpt_False.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 4096
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; valid values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-       the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
-       defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="intern", intern_overlap=True),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
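# ---------------------------------------------------------------------------
# Editor's sketch (not part of the patch): roughly how a dynamic loss scaler
# consumes the grad_scaler settings in these configs. Class and attribute names
# are hypothetical, not InternLM's actual implementation.
class DynamicLossScaler:
    def __init__(self, initial_scale=2**16, growth_factor=2, backoff_factor=0.5,
                 growth_interval=1000, hysteresis=2, min_scale=1, max_scale=2**24):
        self.scale = float(initial_scale)
        self.growth_factor = growth_factor
        self.backoff_factor = backoff_factor
        self.growth_interval = growth_interval
        self.hysteresis = hysteresis
        self.min_scale = min_scale
        self.max_scale = max_scale
        self._good_steps = 0                # consecutive steps without overflow
        self._overflows_left = hysteresis   # overflows tolerated before backoff

    def update(self, found_overflow):
        if found_overflow:
            self._good_steps = 0
            self._overflows_left -= 1
            if self._overflows_left <= 0:   # back off only after `hysteresis` overflows
                self.scale = max(self.scale * self.backoff_factor, self.min_scale)
                self._overflows_left = self.hysteresis
        else:
            self._good_steps += 1
            if self._good_steps >= self.growth_interval:
                self.scale = min(self.scale * self.growth_factor, self.max_scale)
                self._good_steps = 0
# ---------------------------------------------------------------------------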
diff --git a/configs/30B_train/4096_intern_ckpt_True.py b/configs/30B_train/4096_intern_ckpt_True.py
deleted file mode 100644
index 1fb64257..00000000
--- a/configs/30B_train/4096_intern_ckpt_True.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 4096
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; valid values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-       the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
-       defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="intern", intern_overlap=True),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/30B_train/4096_megatron_ckpt_False.py b/configs/30B_train/4096_megatron_ckpt_False.py
deleted file mode 100644
index 79f718d0..00000000
--- a/configs/30B_train/4096_megatron_ckpt_False.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 4096
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; valid values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-       the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
-       defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="megatron", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
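# ---------------------------------------------------------------------------
# Editor's worked example (not part of the patch): the batch arithmetic implied
# by the data dicts in these configs, using their own documented relation
# packed_length = micro_bsz * SEQ_LEN. The dp_size value is illustrative.
seq_len, micro_bsz, micro_num, dp_size = 4096, 2, 4, 4
packed_length = micro_bsz * seq_len          # 8192 tokens per micro-batch
tokens_per_rank = micro_num * packed_length  # 32768 tokens per optimizer step per dp rank
tokens_per_step = tokens_per_rank * dp_size  # 131072 tokens per global optimizer step
# ---------------------------------------------------------------------------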
diff --git a/configs/30B_train/4096_megatron_ckpt_True.py b/configs/30B_train/4096_megatron_ckpt_True.py
deleted file mode 100644
index 502ae7f7..00000000
--- a/configs/30B_train/4096_megatron_ckpt_True.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 4096
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; valid values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-       the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
-       defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="megatron", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/30B_train/4096_none_ckpt_False.py b/configs/30B_train/4096_none_ckpt_False.py
deleted file mode 100644
index 981a0f23..00000000
--- a/configs/30B_train/4096_none_ckpt_False.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 4096
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; valid values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-       the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
-       defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="none", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/30B_train/4096_none_ckpt_True.py b/configs/30B_train/4096_none_ckpt_True.py
deleted file mode 100644
index dddea663..00000000
--- a/configs/30B_train/4096_none_ckpt_True.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 4096
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; valid values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-       the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
-       defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="none", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
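# ---------------------------------------------------------------------------
# Editor's sketch (not part of the patch): one plausible reading of the model
# dicts' `checkpoint` field above, which accepts True/False or a proportion in
# [0, 1]. The helper name is hypothetical.
def num_checkpointed_layers(checkpoint, num_layers=60):
    if isinstance(checkpoint, bool):
        return num_layers if checkpoint else 0
    return int(num_layers * float(checkpoint))  # proportion of layers to recompute
# e.g. num_checkpointed_layers(True) == 60 and num_checkpointed_layers(0.5) == 30.
# ---------------------------------------------------------------------------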
diff --git a/configs/30B_train/65536_flash_ckpt_False.py b/configs/30B_train/65536_flash_ckpt_False.py
deleted file mode 100644
index babebd95..00000000
--- a/configs/30B_train/65536_flash_ckpt_False.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 65536
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; valid values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-       the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
-       defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="flash", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
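# ---------------------------------------------------------------------------
# Editor's worked example (not part of the patch): the warmup length implied by
# the lr_scheduler dicts above for these short 20-step profiling runs.
total_steps, warmup_ratio = 20, 0.01
warmup_steps = int(total_steps * warmup_ratio)  # int(0.2) == 0, so effectively no warmup
# at a production total_steps of, say, 50000, the same ratio gives 500 warmup steps
# ---------------------------------------------------------------------------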
diff --git a/configs/30B_train/65536_flash_ckpt_True.py b/configs/30B_train/65536_flash_ckpt_True.py
deleted file mode 100644
index 064250e7..00000000
--- a/configs/30B_train/65536_flash_ckpt_True.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 65536
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; valid values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-       the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
-       defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="flash", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/30B_train/65536_intern_ckpt_False.py b/configs/30B_train/65536_intern_ckpt_False.py
deleted file mode 100644
index 64165f44..00000000
--- a/configs/30B_train/65536_intern_ckpt_False.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 65536
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; valid values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-       the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
-       defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="intern", intern_overlap=True),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/30B_train/65536_intern_ckpt_True.py b/configs/30B_train/65536_intern_ckpt_True.py
deleted file mode 100644
index 78b66213..00000000
--- a/configs/30B_train/65536_intern_ckpt_True.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 65536
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; valid values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-       the sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
-       defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="intern", intern_overlap=True),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/30B_train/65536_megatron_ckpt_False.py b/configs/30B_train/65536_megatron_ckpt_False.py
deleted file mode 100644
index e8c09548..00000000
--- a/configs/30B_train/65536_megatron_ckpt_False.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 65536
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; currently only the 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/65536_megatron_ckpt_True.py b/configs/30B_train/65536_megatron_ckpt_True.py deleted file mode 100644 index d3b64c41..00000000 --- a/configs/30B_train/65536_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/65536_none_ckpt_False.py b/configs/30B_train/65536_none_ckpt_False.py deleted file mode 100644 index ee4c7fb5..00000000 --- a/configs/30B_train/65536_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/65536_none_ckpt_True.py b/configs/30B_train/65536_none_ckpt_True.py deleted file mode 100644 index 2e84144c..00000000 --- a/configs/30B_train/65536_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/8192_flash_ckpt_False.py b/configs/30B_train/8192_flash_ckpt_False.py deleted file mode 100644 index b9eb6e65..00000000 --- a/configs/30B_train/8192_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/8192_flash_ckpt_True.py b/configs/30B_train/8192_flash_ckpt_True.py deleted file mode 100644 index c0dd5175..00000000 --- a/configs/30B_train/8192_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/8192_intern_ckpt_False.py b/configs/30B_train/8192_intern_ckpt_False.py deleted file mode 100644 index d915b6b8..00000000 --- a/configs/30B_train/8192_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/8192_intern_ckpt_True.py b/configs/30B_train/8192_intern_ckpt_True.py deleted file mode 100644 index a71693a1..00000000 --- a/configs/30B_train/8192_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/8192_megatron_ckpt_False.py b/configs/30B_train/8192_megatron_ckpt_False.py deleted file mode 100644 index dcacb9e5..00000000 --- a/configs/30B_train/8192_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro batches in one gradient update
-    micro_num=4,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=False,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-            so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
-        sequence_parallel should be True.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="megatron", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/30B_train/8192_megatron_ckpt_True.py b/configs/30B_train/8192_megatron_ckpt_True.py
deleted file mode 100644
index b6e4ba24..00000000
--- a/configs/30B_train/8192_megatron_ckpt_True.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 8192
-JOB_NAME = "7b_train_" + str(SEQ_LEN)
-HIDDEN_SIZE = 6144
-NUM_ATTENTION_HEAD = 48
-MLP_RATIO = 8 / 3
-NUM_LAYER = 60
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/8192_none_ckpt_False.py b/configs/30B_train/8192_none_ckpt_False.py deleted file mode 100644 index ce790dfa..00000000 --- a/configs/30B_train/8192_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/8192_none_ckpt_True.py b/configs/30B_train/8192_none_ckpt_True.py deleted file mode 100644 index e6afcd4e..00000000 --- a/configs/30B_train/8192_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-sequence parallel (bool): enable/disable sequence parallel, defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="none", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/7B_template.py b/configs/7B_template.py
index b9f76a51..d78fc884 100644
--- a/configs/7B_template.py
+++ b/configs/7B_template.py
@@ -2,7 +2,7 @@
 DO_ALERT = False
 
 SEQ_LEN = {seq_len}
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({checkpoint})
+JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({intern_overlap}) + "_" + str({checkpoint})
 HIDDEN_SIZE = 4096
 NUM_ATTENTION_HEAD = 32
 MLP_RATIO = 8 / 3
diff --git a/configs/7B_train/131072_flash-attn_ckpt_False.py b/configs/7B_train/131072_flash-attn_ckpt_False.py
deleted file mode 100644
index 047fb372..00000000
--- a/configs/7B_train/131072_flash-attn_ckpt_False.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 131072
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro batches in one gradient update
-    micro_num=1,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=True,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=False,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-            so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-        defaults to 'none', which disables sequence parallel.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using the
-        'intern' sp mode, defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="flash-attn", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/7B_train/131072_flash-attn_ckpt_True.py b/configs/7B_train/131072_flash-attn_ckpt_True.py
deleted file mode 100644
index 763627d6..00000000
--- a/configs/7B_train/131072_flash-attn_ckpt_True.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 131072
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/131072_flash_ckpt_False.py b/configs/7B_train/131072_flash_ckpt_False.py deleted file mode 100644 index 4307e9d1..00000000 --- a/configs/7B_train/131072_flash_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/131072_flash_ckpt_True.py b/configs/7B_train/131072_flash_ckpt_True.py deleted file mode 100644 index c110b256..00000000 --- a/configs/7B_train/131072_flash_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/131072_intern_ckpt_False.py b/configs/7B_train/131072_intern_ckpt_False.py deleted file mode 100644 index 1d728be7..00000000 --- a/configs/7B_train/131072_intern_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
diff --git a/configs/7B_train/131072_intern_ckpt_True.py b/configs/7B_train/131072_intern_ckpt_True.py
deleted file mode 100644
index 45d4aa01..00000000
--- a/configs/7B_train/131072_intern_ckpt_True.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 131072
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro_batches contained in one gradient update
-    micro_num=1,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=True,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=False,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-            so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallelism.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-        defaults to 'none', which disables sequence parallelism.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using the 'intern' sp mode,
-        defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallelism.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
-        defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="intern", intern_overlap=True),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
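As the data comments note, packed_length = micro_bsz * seq_len, and one gradient update consumes micro_num micro-batches per data-parallel rank. A small hypothetical helper to sanity-check token throughput for these configs (the dp_size factor is an assumption about how the global batch is counted, not something these files set):

def tokens_per_step(seq_len, micro_bsz, micro_num, dp_size=1):
    # packed_length = micro_bsz * seq_len, as in the data dict above
    packed_length = micro_bsz * seq_len
    return micro_num * packed_length * dp_size

# For this config, seq_len=131072 with micro_bsz=1 and micro_num=1 packs one
# 131072-token sequence per rank per step; with dp_size=4 that is 524288 tokens.
print(tokens_per_step(131072, 1, 1, dp_size=4))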
diff --git a/configs/7B_train/131072_megatron_ckpt_False.py b/configs/7B_train/131072_megatron_ckpt_False.py
deleted file mode 100644
index 0bd98459..00000000
--- a/configs/7B_train/131072_megatron_ckpt_False.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 131072
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/131072_megatron_ckpt_True.py b/configs/7B_train/131072_megatron_ckpt_True.py deleted file mode 100644 index 9200afbe..00000000 --- a/configs/7B_train/131072_megatron_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/131072_none_ckpt_False.py b/configs/7B_train/131072_none_ckpt_False.py deleted file mode 100644 index 16059fb1..00000000 --- a/configs/7B_train/131072_none_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/131072_none_ckpt_True.py b/configs/7B_train/131072_none_ckpt_True.py deleted file mode 100644 index 35b3f08e..00000000 --- a/configs/7B_train/131072_none_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/16384_flash-attn_ckpt_False.py b/configs/7B_train/16384_flash-attn_ckpt_False.py deleted file mode 100644 index 53a64b99..00000000 --- a/configs/7B_train/16384_flash-attn_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/16384_flash-attn_ckpt_True.py b/configs/7B_train/16384_flash-attn_ckpt_True.py deleted file mode 100644 index cdb051e5..00000000 --- a/configs/7B_train/16384_flash-attn_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/16384_flash_ckpt_False.py b/configs/7B_train/16384_flash_ckpt_False.py deleted file mode 100644 index 41b39515..00000000 --- a/configs/7B_train/16384_flash_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/16384_flash_ckpt_True.py b/configs/7B_train/16384_flash_ckpt_True.py deleted file mode 100644 index ca2c7f06..00000000 --- a/configs/7B_train/16384_flash_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
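# A minimal sketch of the dynamic loss-scaling policy that the grad_scaler
# fields in these configs describe (semantics assumed from the field comments,
# not taken from InternLM's actual scaler implementation):
scale, good_steps, overflows = 2.0**16, 0, 0

def update_scale(overflowed: bool):
    """One scaler update per training step."""
    global scale, good_steps, overflows
    if overflowed:
        overflows += 1
        good_steps = 0
        if overflows >= 2:                    # hysteresis: tolerate 2 overflows
            scale = max(scale * 0.5, 1.0)     # backoff_factor, floored by min_scale
            overflows = 0
    else:
        good_steps += 1
        if good_steps >= 1000:                # growth_interval of overflow-free steps
            scale = min(scale * 2.0, 2.0**24) # growth_factor, capped by max_scale
            good_steps = 0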
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/16384_intern_ckpt_False.py b/configs/7B_train/16384_intern_ckpt_False.py deleted file mode 100644 index 93abb682..00000000 --- a/configs/7B_train/16384_intern_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
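# Illustrative warmup arithmetic for the lr_scheduler fields in these configs
# (assuming the scheduler floors warmup_ratio * total_steps; the real scheduler
# may round differently): with total_steps = 20 and warmup_ratio = 0.01, the
# warmup phase is int(20 * 0.01) = 0 steps, so these short benchmark runs
# effectively skip warmup.
assert int(20 * 0.01) == 0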
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/16384_intern_ckpt_True.py b/configs/7B_train/16384_intern_ckpt_True.py deleted file mode 100644 index af9d9945..00000000 --- a/configs/7B_train/16384_intern_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
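# How zero1=dict(size=-1) resolves, per the rule stated in the docstring of
# these configs (the 32-GPU world size is a hypothetical example):
world_size, tensor_size, pipeline_size = 32, 8, 1
dp_size = world_size // (tensor_size * pipeline_size)  # data-parallel size = 4
zero1_size = dp_size  # size <= 0 falls back to the dp process-group size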
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/16384_megatron_ckpt_False.py b/configs/7B_train/16384_megatron_ckpt_False.py deleted file mode 100644 index d2c58d3a..00000000 --- a/configs/7B_train/16384_megatron_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
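# Note on beta2_scheduler in these configs: since adam_beta2_c = 0 here, the
# decay term the scheduler would apply vanishes and beta2 stays fixed at
# init_beta2 = 0.95 for the whole run (an observation from the config values,
# not from the scheduler code itself):
init_beta2, c = 0.95, 0
beta2 = init_beta2 if c == 0 else None  # constant schedule in this config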
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/16384_megatron_ckpt_True.py b/configs/7B_train/16384_megatron_ckpt_True.py deleted file mode 100644 index 6e372b8c..00000000 --- a/configs/7B_train/16384_megatron_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/16384_none_ckpt_False.py b/configs/7B_train/16384_none_ckpt_False.py deleted file mode 100644 index 0fd65900..00000000 --- a/configs/7B_train/16384_none_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
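# Token-throughput arithmetic implied by the data dict in these configs
# (illustrative, following the packed_length comment):
SEQ_LEN, micro_bsz, micro_num = 16384, 1, 1
packed_length = micro_bsz * SEQ_LEN                       # 16384 tokens per micro-batch
tokens_per_step_per_dp_rank = micro_num * packed_length   # 16384 tokens per optimizer step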
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/16384_none_ckpt_True.py b/configs/7B_train/16384_none_ckpt_True.py deleted file mode 100644 index 6ea5e1a9..00000000 --- a/configs/7B_train/16384_none_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/262144_flash-attn_ckpt_False.py b/configs/7B_train/262144_flash-attn_ckpt_False.py deleted file mode 100644 index 6dad9730..00000000 --- a/configs/7B_train/262144_flash-attn_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
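# The model.checkpoint field in these configs accepts True/False or a fraction
# in [0, 1]; a sketch of how a fractional value would map onto layers (rounding
# behavior assumed, not taken from the model code):
NUM_LAYER = 32

def checkpointed_layers(flag):
    if flag is True:
        return NUM_LAYER
    if flag is False:
        return 0
    return int(NUM_LAYER * flag)  # e.g. 0.5 -> 16 layers recompute activations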
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/262144_flash-attn_ckpt_True.py b/configs/7B_train/262144_flash-attn_ckpt_True.py deleted file mode 100644 index cacd9737..00000000 --- a/configs/7B_train/262144_flash-attn_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/262144_flash_ckpt_False.py b/configs/7B_train/262144_flash_ckpt_False.py deleted file mode 100644 index 0e9b0173..00000000 --- a/configs/7B_train/262144_flash_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
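# A minimal sketch of validating a `parallel` block like the ones in these
# configs; only the key names and the allowed sp modes come from the files,
# the function itself is hypothetical:
VALID_SP_MODES = ("none", "megatron", "flash-attn", "intern")

def check_parallel_cfg(parallel):
    sp = parallel["tensor"].get("sp", "none")
    if sp not in VALID_SP_MODES:
        raise ValueError(f"unknown sp mode {sp!r}, expected one of {VALID_SP_MODES}")
    if parallel["tensor"].get("intern_overlap", False) and sp != "intern":
        raise ValueError("intern_overlap only takes effect when sp == 'intern'")

check_parallel_cfg(dict(tensor=dict(size=8, sp="flash-attn", intern_overlap=False)))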
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/262144_flash_ckpt_True.py b/configs/7B_train/262144_flash_ckpt_True.py deleted file mode 100644 index ddacc8df..00000000 --- a/configs/7B_train/262144_flash_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
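# Note: unlike their siblings, the two *_flash_* configs above build JOB_NAME
# without the sp-mode tag (e.g. "7b_train_262144_True"). A sketch of a less
# error-prone pattern; the helper name is illustrative, not from the files:
def make_job_name(seq_len, sp_mode, ckpt):
    return f"7b_train_{seq_len}_{sp_mode}_{ckpt}"

assert make_job_name(262144, "flash", True) == "7b_train_262144_flash_True"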
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/262144_intern_ckpt_False.py b/configs/7B_train/262144_intern_ckpt_False.py deleted file mode 100644 index e5cf7694..00000000 --- a/configs/7B_train/262144_intern_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
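# A minimal sketch of the dynamic loss-scaling rule the grad_scaler block
# above parameterizes (a generic mixed-precision recipe with a hypothetical
# class, not InternLM's implementation):
class LossScaler:
    def __init__(self, initial_scale=2**16, growth_factor=2.0, backoff_factor=0.5,
                 growth_interval=1000, hysteresis=2, min_scale=1, max_scale=2**24):
        self.scale, self.good_steps, self.overflows = initial_scale, 0, 0
        self.growth_factor, self.backoff_factor = growth_factor, backoff_factor
        self.growth_interval, self.hysteresis = growth_interval, hysteresis
        self.min_scale, self.max_scale = min_scale, max_scale

    def update(self, found_overflow):
        if found_overflow:
            self.good_steps = 0
            self.overflows += 1
            if self.overflows >= self.hysteresis:  # tolerate brief overflow bursts
                self.scale = max(self.scale * self.backoff_factor, self.min_scale)
                self.overflows = 0
        else:
            self.good_steps += 1
            if self.good_steps % self.growth_interval == 0:  # stable long enough
                self.scale = min(self.scale * self.growth_factor, self.max_scale)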
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/262144_intern_ckpt_True.py b/configs/7B_train/262144_intern_ckpt_True.py deleted file mode 100644 index 76f9386a..00000000 --- a/configs/7B_train/262144_intern_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
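# The zero1 size rules quoted in the docstring above, restated as a runnable
# sketch (the function name and the divisibility check are assumptions):
def resolve_zero1_size(zero1_size, dp_world_size):
    if zero1_size <= 0:  # shard optimizer states across the whole dp group
        return dp_world_size
    if zero1_size == 1:  # ZeRO disabled; every dp rank keeps full states
        return 1
    assert dp_world_size % zero1_size == 0, "zero1 size must divide dp world size"
    return zero1_size

assert resolve_zero1_size(-1, 64) == 64
assert resolve_zero1_size(8, 64) == 8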
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/262144_megatron_ckpt_False.py b/configs/7B_train/262144_megatron_ckpt_False.py deleted file mode 100644 index b929f9a6..00000000 --- a/configs/7B_train/262144_megatron_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
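# A minimal sketch of one common reading of the lr_scheduler fields above:
# linear warmup over warmup_ratio * total_steps steps, then cosine decay to
# eta_min. InternLM's exact scheduler may differ; names here are assumptions.
import math

def lr_at(step, base_lr=1e-4, total_steps=20, warmup_ratio=0.01, eta_min=1e-5):
    warmup_steps = max(1, int(total_steps * warmup_ratio))
    if step < warmup_steps:
        return base_lr * (step + 1) / warmup_steps  # linear warmup
    t = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return eta_min + 0.5 * (base_lr - eta_min) * (1 + math.cos(math.pi * t))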
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/262144_megatron_ckpt_True.py b/configs/7B_train/262144_megatron_ckpt_True.py deleted file mode 100644 index 1655631c..00000000 --- a/configs/7B_train/262144_megatron_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
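# Quick arithmetic for the data block above, with assumed helper names: each
# rank packs micro_bsz * SEQ_LEN tokens per micro batch, so one optimizer step
# consumes packed_length * micro_num tokens per data-parallel rank.
def tokens_per_step(seq_len=262144, micro_bsz=1, micro_num=1, dp_world_size=1):
    packed_length = micro_bsz * seq_len  # matches the comment in `data`
    return packed_length * micro_num * dp_world_size

assert tokens_per_step() == 262144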
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/262144_none_ckpt_False.py b/configs/7B_train/262144_none_ckpt_False.py deleted file mode 100644 index 85512f07..00000000 --- a/configs/7B_train/262144_none_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
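# A minimal sketch of the `checkpoint` field semantics noted in `model` above
# (True/False or a proportion in [0, 1]); the helper and the "first k layers"
# policy are assumptions for illustration:
def layers_to_checkpoint(checkpoint, num_layers=32):
    if checkpoint is True:
        proportion = 1.0
    elif checkpoint is False:
        proportion = 0.0
    else:
        proportion = float(checkpoint)
    return list(range(int(num_layers * proportion)))

assert len(layers_to_checkpoint(True)) == 32
assert len(layers_to_checkpoint(0.5)) == 16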
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/262144_none_ckpt_True.py b/configs/7B_train/262144_none_ckpt_True.py deleted file mode 100644 index fef559bd..00000000 --- a/configs/7B_train/262144_none_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
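# A minimal sketch of the bucketing that reduce_bucket_size (in the
# hybrid_zero_optimizer block above) caps: gradients are grouped into
# ~512 MB buckets so each bucket needs one collective call. Generic pattern
# with assumed names, not InternLM's code.
def bucket_grads(grad_numels, elem_bytes=2, bucket_bytes=512 * 1024 * 1024):
    """Greedily pack gradient tensors (by element count) into size-capped buckets."""
    buckets, cur, cur_bytes = [], [], 0
    for idx, numel in enumerate(grad_numels):
        if cur and cur_bytes + numel * elem_bytes > bucket_bytes:
            buckets.append(cur)  # flush: the next tensor would overflow the cap
            cur, cur_bytes = [], 0
        cur.append(idx)
        cur_bytes += numel * elem_bytes
    if cur:
        buckets.append(cur)
    return buckets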
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/32768_flash-attn_ckpt_False.py b/configs/7B_train/32768_flash-attn_ckpt_False.py deleted file mode 100644 index f2664be8..00000000 --- a/configs/7B_train/32768_flash-attn_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
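# A minimal sketch of resolving the `dtype` strings that `model` above accepts
# (assumed helper; "torch.tf32" is shown as fp32 storage with TF32 matmul):
import torch

def resolve_dtype(name):
    table = {
        "torch.float16": torch.float16,
        "torch.half": torch.float16,
        "torch.bfloat16": torch.bfloat16,
        "torch.float32": torch.float32,
    }
    if name == "torch.tf32":
        torch.backends.cuda.matmul.allow_tf32 = True  # enable TF32 kernels
        return torch.float32
    return table[name]

assert resolve_dtype("torch.bfloat16") is torch.bfloat16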
diff --git a/configs/7B_train/32768_flash-attn_ckpt_False.py b/configs/7B_train/32768_flash-attn_ckpt_False.py
deleted file mode 100644
index f2664be8..00000000
--- a/configs/7B_train/32768_flash-attn_ckpt_False.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 32768
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=1,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=True,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=False,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-        defaults to 'none', which means sequence parallel is disabled.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using the 'intern' sp mode,
-        defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="flash-attn", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/7B_train/32768_flash-attn_ckpt_True.py b/configs/7B_train/32768_flash-attn_ckpt_True.py
deleted file mode 100644
index 232b5904..00000000
--- a/configs/7B_train/32768_flash-attn_ckpt_True.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 32768
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=1,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=True,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=False,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-        defaults to 'none', which means sequence parallel is disabled.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using the 'intern' sp mode,
-        defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="flash-attn", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
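The `auto_resume` comments repeated in each of these configs describe a three-way decision. A hedged sketch of that rule follows; `resolve_ckpt_load` is a hypothetical helper written only to restate the documented behaviour, not InternLM's actual checkpoint API:

```python
# Hypothetical illustration of the resume rule documented in the ckpt dicts;
# InternLM's real checkpoint manager is not structured like this.
def resolve_ckpt_load(ckpt: dict):
    if ckpt.get("auto_resume", True):
        # scheduler restart: resume from the latest ckpt under save_ckpt_folder,
        # ignoring load_ckpt_info
        return ("resume_latest", ckpt["save_ckpt_folder"])
    if ckpt.get("load_ckpt_info"):
        # cold start from the weights named in load_ckpt_info
        return ("load_initial", ckpt["load_ckpt_info"]["path"])
    # no auto_resume and no load_ckpt_info: train from scratch
    return ("from_scratch", None)
```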
diff --git a/configs/7B_train/32768_flash_ckpt_False.py b/configs/7B_train/32768_flash_ckpt_False.py
deleted file mode 100644
index 878b9ac1..00000000
--- a/configs/7B_train/32768_flash_ckpt_False.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 32768
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(False)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=1,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=True,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=False,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-        defaults to 'none', which means sequence parallel is disabled.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using the 'intern' sp mode,
-        defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="flash", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/7B_train/32768_flash_ckpt_True.py b/configs/7B_train/32768_flash_ckpt_True.py
deleted file mode 100644
index 27cffd02..00000000
--- a/configs/7B_train/32768_flash_ckpt_True.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 32768
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(True)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=1,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=True,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=False,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-        defaults to 'none', which means sequence parallel is disabled.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using the 'intern' sp mode,
-        defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="flash", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/7B_train/32768_intern_ckpt_False.py b/configs/7B_train/32768_intern_ckpt_False.py
deleted file mode 100644
index fcf84197..00000000
--- a/configs/7B_train/32768_intern_ckpt_False.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 32768
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=1,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=True,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=False,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-        defaults to 'none', which means sequence parallel is disabled.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using the 'intern' sp mode,
-        defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="intern", intern_overlap=True),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
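Note that only the 'intern' configs set `intern_overlap=True`, matching the docstring: the all_gather/reduce_scatter overlap applies only to the 'intern' sp mode. A hedged sketch of how a consumer might gate on this, assuming `gpc.config.parallel` mirrors the dict above (illustrative only, not code from this patch):

```python
from internlm.core.context import global_context as gpc

# Illustrative gating on the parallel config (assumes gpc.config.parallel
# mirrors the parallel dict defined in these configs).
tensor_cfg = gpc.config.parallel["tensor"]
overlap_comm = tensor_cfg.get("sp") == "intern" and tensor_cfg.get("intern_overlap", False)
```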
diff --git a/configs/7B_train/32768_intern_ckpt_True.py b/configs/7B_train/32768_intern_ckpt_True.py
deleted file mode 100644
index aec2b68b..00000000
--- a/configs/7B_train/32768_intern_ckpt_True.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 32768
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=1,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=True,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=False,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-        defaults to 'none', which means sequence parallel is disabled.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using the 'intern' sp mode,
-        defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="intern", intern_overlap=True),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/7B_train/32768_megatron_ckpt_False.py b/configs/7B_train/32768_megatron_ckpt_False.py
deleted file mode 100644
index 64caeeb5..00000000
--- a/configs/7B_train/32768_megatron_ckpt_False.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 32768
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=1,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=True,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=False,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-        defaults to 'none', which means sequence parallel is disabled.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using the 'intern' sp mode,
-        defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="megatron", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/7B_train/32768_megatron_ckpt_True.py b/configs/7B_train/32768_megatron_ckpt_True.py
deleted file mode 100644
index a736e7d0..00000000
--- a/configs/7B_train/32768_megatron_ckpt_True.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 32768
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=1,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=True,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=False,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-        defaults to 'none', which means sequence parallel is disabled.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using the 'intern' sp mode,
-        defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="megatron", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
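Every config in this family shares the same `grad_scaler` block. A minimal sketch of the dynamic loss-scaling schedule those fields describe follows; it is an illustration of the documented parameters, not the trainer's actual scaler class:

```python
# Illustrative dynamic loss scaling per the grad_scaler fields in these configs.
scale = 2.0**16        # initial_scale
good_steps = 0         # steps since the last overflow
overflow_count = 0     # overflows since the last backoff

def update_scale(overflowed: bool) -> None:
    global scale, good_steps, overflow_count
    if overflowed:
        good_steps = 0
        overflow_count += 1
        if overflow_count >= 2:                # hysteresis
            scale = max(scale * 0.5, 1.0)      # backoff_factor, floored at min_scale
            overflow_count = 0
    else:
        good_steps += 1
        if good_steps >= 1000:                 # growth_interval
            scale = min(scale * 2.0, 2.0**24)  # growth_factor, capped at max_scale
            good_steps = 0
```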
diff --git a/configs/7B_train/32768_none_ckpt_False.py b/configs/7B_train/32768_none_ckpt_False.py
deleted file mode 100644
index 3a31776e..00000000
--- a/configs/7B_train/32768_none_ckpt_False.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 32768
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro-batches contained in one gradient update
-    micro_num=1,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which disables evaluation
-    valid_every=50,
-    pack_sample_into_one=True,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=False,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-        defaults to 'none', which means sequence parallel is disabled.
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/32768_none_ckpt_True.py b/configs/7B_train/32768_none_ckpt_True.py deleted file mode 100644 index 4ac09249..00000000 --- a/configs/7B_train/32768_none_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/4096_flash-attn_ckpt_False.py b/configs/7B_train/4096_flash-attn_ckpt_False.py deleted file mode 100644 index b3de8990..00000000 --- a/configs/7B_train/4096_flash-attn_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/4096_flash-attn_ckpt_True.py b/configs/7B_train/4096_flash-attn_ckpt_True.py deleted file mode 100644 index b44b103f..00000000 --- a/configs/7B_train/4096_flash-attn_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/4096_flash_ckpt_False.py b/configs/7B_train/4096_flash_ckpt_False.py deleted file mode 100644 index 8ac542d6..00000000 --- a/configs/7B_train/4096_flash_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/4096_flash_ckpt_True.py b/configs/7B_train/4096_flash_ckpt_True.py deleted file mode 100644 index ec477f68..00000000 --- a/configs/7B_train/4096_flash_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/4096_intern_ckpt_False.py b/configs/7B_train/4096_intern_ckpt_False.py deleted file mode 100644 index f16f95ad..00000000 --- a/configs/7B_train/4096_intern_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/4096_intern_ckpt_True.py b/configs/7B_train/4096_intern_ckpt_True.py deleted file mode 100644 index 90fed7c8..00000000 --- a/configs/7B_train/4096_intern_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/4096_megatron_ckpt_False.py b/configs/7B_train/4096_megatron_ckpt_False.py deleted file mode 100644 index ca41fa28..00000000 --- a/configs/7B_train/4096_megatron_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/4096_megatron_ckpt_True.py b/configs/7B_train/4096_megatron_ckpt_True.py deleted file mode 100644 index 45183156..00000000 --- a/configs/7B_train/4096_megatron_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro_batches contained in one gradient update
-    micro_num=1,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=True,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=False,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-       defaults to 'none', which means sequence parallel is disabled.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
-       defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-       defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="megatron", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
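The `grad_scaler` dict above describes a conventional dynamic loss-scaling policy. A minimal sketch of such a policy (hypothetical helper, not InternLM's implementation), using the defaults from the config:

def step_loss_scale(scale, good_steps, overflow_count, growth_factor=2, backoff_factor=0.5,
                    growth_interval=1000, hysteresis=2, min_scale=1, max_scale=2**24):
    # Shrink after `hysteresis` overflows; grow after `growth_interval` clean steps.
    if overflow_count >= hysteresis:
        return max(scale * backoff_factor, min_scale), 0, 0
    if good_steps >= growth_interval:
        return min(scale * growth_factor, max_scale), 0, overflow_count
    return scale, good_steps, overflow_count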
diff --git a/configs/7B_train/4096_none_ckpt_False.py b/configs/7B_train/4096_none_ckpt_False.py
deleted file mode 100644
index c81bb5b9..00000000
--- a/configs/7B_train/4096_none_ckpt_False.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 4096
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro_batches contained in one gradient update
-    micro_num=1,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=True,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=False,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-       defaults to 'none', which means sequence parallel is disabled.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
-       defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-       defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="none", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
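The comments in the `data` dict above imply a simple relation between `micro_bsz`, `SEQ_LEN`, and the packed sample length. A quick illustrative check with this config's values:

SEQ_LEN, micro_bsz, micro_num = 4096, 1, 1
packed_length = micro_bsz * SEQ_LEN            # tokens per packed sample
tokens_per_step = micro_num * packed_length    # tokens per gradient update, per dp rank
assert (packed_length, tokens_per_step) == (4096, 4096)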
diff --git a/configs/7B_train/4096_none_ckpt_True.py b/configs/7B_train/4096_none_ckpt_True.py
deleted file mode 100644
index a25d222f..00000000
--- a/configs/7B_train/4096_none_ckpt_True.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 4096
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro_batches contained in one gradient update
-    micro_num=1,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=True,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=False,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-       defaults to 'none', which means sequence parallel is disabled.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
-       defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-       defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="none", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
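Per the comment on `checkpoint` in the `model` dict, the field accepts True, False, or a proportion in [0, 1] of layers to recompute. A hypothetical normalization helper illustrating that reading (not project code):

def ckpt_layer_count(checkpoint, num_layers=32):
    # True -> all layers, False -> none, float -> that proportion of layers.
    ratio = 1.0 if checkpoint is True else (0.0 if checkpoint is False else float(checkpoint))
    return int(num_layers * ratio)  # e.g. True -> 32, 0.5 -> 16, False -> 0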
diff --git a/configs/7B_train/65536_flash-attn_ckpt_False.py b/configs/7B_train/65536_flash-attn_ckpt_False.py
deleted file mode 100644
index 3d5a81eb..00000000
--- a/configs/7B_train/65536_flash-attn_ckpt_False.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 65536
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro_batches contained in one gradient update
-    micro_num=1,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=True,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=False,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-       defaults to 'none', which means sequence parallel is disabled.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
-       defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-       defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="flash-attn", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
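The `zero1` docstring above distinguishes three regimes for `size`. An illustrative restatement in code (assumed helper name, not part of this patch):

def zero1_group_size(size, dp_world_size):
    if size <= 0:
        return dp_world_size  # shard optimizer states across the whole dp group
    assert 1 <= size <= dp_world_size  # size == 1 keeps full params on every rank
    return size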
diff --git a/configs/7B_train/65536_flash-attn_ckpt_True.py b/configs/7B_train/65536_flash-attn_ckpt_True.py
deleted file mode 100644
index c6982c98..00000000
--- a/configs/7B_train/65536_flash-attn_ckpt_True.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 65536
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro_batches contained in one gradient update
-    micro_num=1,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=True,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=False,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-       defaults to 'none', which means sequence parallel is disabled.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
-       defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-       defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="flash-attn", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
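These config files are plain Python modules, so any loader that executes them and collects the module-level names would work. A minimal sketch under that assumption (the project's actual Config machinery may differ):

def load_config(path):
    ns = {}
    with open(path) as f:
        exec(compile(f.read(), path, "exec"), ns)  # configs are executable Python
    return {k: v for k, v in ns.items() if not k.startswith("__")}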
diff --git a/configs/7B_train/65536_flash_ckpt_False.py b/configs/7B_train/65536_flash_ckpt_False.py
deleted file mode 100644
index 0cfea813..00000000
--- a/configs/7B_train/65536_flash_ckpt_False.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 65536
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(False)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro_batches contained in one gradient update
-    micro_num=1,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=True,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=False,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-       defaults to 'none', which means sequence parallel is disabled.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
-       defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-       defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="flash", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/7B_train/65536_flash_ckpt_True.py b/configs/7B_train/65536_flash_ckpt_True.py
deleted file mode 100644
index abdeb49d..00000000
--- a/configs/7B_train/65536_flash_ckpt_True.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 65536
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(True)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro_batches contained in one gradient update
-    micro_num=1,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=True,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=False,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-       defaults to 'none', which means sequence parallel is disabled.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
-       defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-       defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="flash", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/7B_train/65536_intern_ckpt_False.py b/configs/7B_train/65536_intern_ckpt_False.py
deleted file mode 100644
index 2e0b27e1..00000000
--- a/configs/7B_train/65536_intern_ckpt_False.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 65536
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro_batches contained in one gradient update
-    micro_num=1,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=True,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=False,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-       defaults to 'none', which means sequence parallel is disabled.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
-       defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-       defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="intern", intern_overlap=True),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/7B_train/65536_intern_ckpt_True.py b/configs/7B_train/65536_intern_ckpt_True.py
deleted file mode 100644
index d1a8de7c..00000000
--- a/configs/7B_train/65536_intern_ckpt_True.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# JOB_NAME = "7b_train"
-DO_ALERT = False
-
-SEQ_LEN = 65536
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True)
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 32
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicates the ckpt path,
-    # 2. the 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro_batches contained in one gradient update
-    micro_num=1,
-    # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, which means evaluation is disabled
-    valid_every=50,
-    pack_sample_into_one=True,
-    total_steps=20,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with fewer than 50 rows will be discarded
-    min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
-    diag_outlier_ratio=1.1,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimizer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=False,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-          so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-       defaults to 'none', which means sequence parallel is disabled.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
-       defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-       defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="intern", intern_overlap=True),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/65536_megatron_ckpt_False.py b/configs/7B_train/65536_megatron_ckpt_False.py deleted file mode 100644 index 7de7b92d..00000000 --- a/configs/7B_train/65536_megatron_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
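
The ckpt comments repeated in these configs fully determine where a run resumes from: with auto_resume=True the newest checkpoint under save_ckpt_folder wins, otherwise load_ckpt_info (or None, for training from scratch) is used. A minimal sketch of that rule, assuming a local layout where each saved step gets a numeric subdirectory; the helper name and folder layout are illustrative, not the repository's actual API:

import os
from typing import Optional

def resolve_load_info(ckpt_cfg: dict) -> Optional[dict]:
    if ckpt_cfg.get("auto_resume", True):
        folder = ckpt_cfg["save_ckpt_folder"].split(":", 1)[-1]  # strip the 'local:' prefix
        if os.path.isdir(folder):
            steps = [d for d in os.listdir(folder) if d.isdigit()]
            if steps:  # resume from the highest saved step
                latest = max(steps, key=int)
                return dict(path=os.path.join(folder, latest), content=("all",), ckpt_type="internlm")
    # no auto-resume (or nothing saved yet): fall back to the explicit setting
    return ckpt_cfg.get("load_ckpt_info")
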
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/65536_megatron_ckpt_True.py b/configs/7B_train/65536_megatron_ckpt_True.py deleted file mode 100644 index b339c833..00000000 --- a/configs/7B_train/65536_megatron_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
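
Every config in this sweep repeats the same tensor-parallel docstring, so the allowed sp values and the coupling between sp and intern_overlap can be sanity-checked mechanically. A sketch under the stated constraints; the helper is illustrative, not part of the codebase:

VALID_SP_MODES = ("none", "megatron", "flash-attn", "intern")

def check_tensor_cfg(tensor_cfg: dict) -> None:
    sp = tensor_cfg.get("sp", "none")
    if sp not in VALID_SP_MODES:
        raise ValueError(f"sp must be one of {VALID_SP_MODES}, got {sp!r}")
    # intern_overlap is documented to matter only for 'intern' mode sp
    if tensor_cfg.get("intern_overlap", False) and sp != "intern":
        raise ValueError("intern_overlap=True only takes effect when sp='intern'")

check_tensor_cfg(dict(size=8, sp="megatron", intern_overlap=False))  # passes
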
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/65536_none_ckpt_False.py b/configs/7B_train/65536_none_ckpt_False.py deleted file mode 100644 index b8c44769..00000000 --- a/configs/7B_train/65536_none_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
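
The zero1 sizing rules in the docstring above reduce to a small amount of arithmetic. A sketch of that resolution; the divisibility assertion is an assumption of the illustration rather than a documented requirement:

def resolve_zero1_world_size(size: int, dp_world_size: int) -> int:
    if size <= 0:
        return dp_world_size  # shard optimizer states across the whole dp group
    if size == 1:
        return 1  # ZeRO disabled: every dp rank keeps the full parameters
    assert 1 < size <= dp_world_size and dp_world_size % size == 0
    return size  # shard within a subset, e.g. <= 8 to stay within one node

assert resolve_zero1_world_size(-1, 16) == 16
assert resolve_zero1_world_size(1, 16) == 1
assert resolve_zero1_world_size(8, 16) == 8
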
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/65536_none_ckpt_True.py b/configs/7B_train/65536_none_ckpt_True.py deleted file mode 100644 index b907e437..00000000 --- a/configs/7B_train/65536_none_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
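
The grad_scaler block that recurs in these configs describes a standard dynamic loss-scaling policy: grow the scale after growth_interval overflow-free steps, back off after hysteresis overflows, and clamp to [min_scale, max_scale]. A minimal sketch of that policy, not the trainer's actual scaler class:

class DynamicLossScale:
    def __init__(self, cfg: dict):
        self.scale = float(cfg["fp16"]["initial_scale"])
        self.min_scale = cfg["fp16"]["min_scale"]
        self.growth_interval = cfg["fp16"]["growth_interval"]
        self.growth_factor = cfg["growth_factor"]
        self.backoff_factor = cfg["backoff_factor"]
        self.max_scale = cfg["max_scale"]
        self.hysteresis = cfg["hysteresis"]
        self._good_steps = 0
        self._overflows_left = self.hysteresis

    def update(self, found_overflow: bool) -> None:
        if found_overflow:
            self._good_steps = 0
            self._overflows_left -= 1
            if self._overflows_left <= 0:  # enough overflows: decrease the scale
                self.scale = max(self.scale * self.backoff_factor, self.min_scale)
                self._overflows_left = self.hysteresis
        else:
            self._good_steps += 1
            if self._good_steps % self.growth_interval == 0:  # stable: increase the scale
                self.scale = min(self.scale * self.growth_factor, self.max_scale)

scaler = DynamicLossScale(dict(fp16=dict(initial_scale=2**16, min_scale=1, growth_interval=1000),
                               growth_factor=2, backoff_factor=0.5, max_scale=2**24, hysteresis=2))
scaler.update(found_overflow=True)   # first overflow: hysteresis absorbs it
scaler.update(found_overflow=True)   # second overflow: scale halves to 32768.0
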
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/8192_flash-attn_ckpt_False.py b/configs/7B_train/8192_flash-attn_ckpt_False.py deleted file mode 100644 index d0ddd438..00000000 --- a/configs/7B_train/8192_flash-attn_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
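
The lr_scheduler fields in these configs (warmup_ratio, total_steps, eta_min) are consistent with a linear-warmup plus cosine-decay schedule. A sketch under that assumption; the codebase's actual scheduler class may differ in details:

import math

def lr_at(step: int, sched: dict, base_lr: float = 1e-4) -> float:
    total = sched["total_steps"]
    warmup = max(1, int(sched["warmup_ratio"] * total))
    if step < warmup:
        return base_lr * (step + 1) / warmup  # linear warmup to base_lr
    t = (step - warmup) / max(1, total - warmup)  # cosine decay to eta_min
    return sched["eta_min"] + 0.5 * (base_lr - sched["eta_min"]) * (1 + math.cos(math.pi * t))
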
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/8192_flash-attn_ckpt_True.py b/configs/7B_train/8192_flash-attn_ckpt_True.py deleted file mode 100644 index d9e5b2f9..00000000 --- a/configs/7B_train/8192_flash-attn_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
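
model["checkpoint"] in these configs is documented as a proportion of layers for activation checkpointing, with True/False as shorthands for 1.0/0.0. One way such a proportion could be turned into a concrete layer set; which layers the trainer actually selects is not specified here:

def layers_to_checkpoint(checkpoint, num_layers: int) -> list:
    if checkpoint is True:
        ratio = 1.0
    elif checkpoint is False:
        ratio = 0.0
    else:
        ratio = float(checkpoint)  # a value in [0, 1]
    n = int(num_layers * ratio)
    return list(range(n))  # e.g. recompute activations for the first n blocks

assert layers_to_checkpoint(True, 32) == list(range(32))
assert layers_to_checkpoint(0.5, 32) == list(range(16))
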
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/8192_flash_ckpt_False.py b/configs/7B_train/8192_flash_ckpt_False.py deleted file mode 100644 index 69546d11..00000000 --- a/configs/7B_train/8192_flash_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
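
The data comments pin down the batch geometry: packed_length = micro_bsz * seq_len, and one gradient update consumes micro_num micro-batches per data-parallel rank. Multiplying in the dp world size (an assumption of this sketch, not stated in the config) gives the tokens seen per optimizer step:

def tokens_per_step(data_cfg: dict, dp_world_size: int) -> int:
    packed_length = data_cfg["micro_bsz"] * data_cfg["seq_len"]
    return data_cfg["micro_num"] * packed_length * dp_world_size

# with this file's settings (micro_num=1, micro_bsz=1, seq_len=8192) on 4 dp ranks:
assert tokens_per_step(dict(micro_num=1, micro_bsz=1, seq_len=8192), dp_world_size=4) == 32768
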
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/8192_flash_ckpt_True.py b/configs/7B_train/8192_flash_ckpt_True.py deleted file mode 100644 index 4c7f9864..00000000 --- a/configs/7B_train/8192_flash_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/8192_intern_ckpt_False.py b/configs/7B_train/8192_intern_ckpt_False.py deleted file mode 100644 index 9694ad81..00000000 --- a/configs/7B_train/8192_intern_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
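
Every file deleted under configs/7B_train follows one naming scheme, f"{SEQ_LEN}_{sp}_ckpt_{activation_ckpt}.py", varying only the sequence length, the sp mode, and the activation-checkpoint flag (note that the 8192_flash_* files use sp="flash", which is absent from the documented mode list). A sweep of this shape could be regenerated with a few lines; this is an illustration, not a script from the repository:

from itertools import product

for seq_len, sp, act_ckpt in product(
    (8192, 65536),
    ("none", "flash", "flash-attn", "megatron", "intern"),
    (False, True),
):
    print(f"configs/7B_train/{seq_len}_{sp}_ckpt_{act_ckpt}.py")
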
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/8192_intern_ckpt_True.py b/configs/7B_train/8192_intern_ckpt_True.py deleted file mode 100644 index 99a0fc18..00000000 --- a/configs/7B_train/8192_intern_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/8192_megatron_ckpt_False.py b/configs/7B_train/8192_megatron_ckpt_False.py deleted file mode 100644 index f18ee730..00000000 --- a/configs/7B_train/8192_megatron_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/8192_megatron_ckpt_True.py b/configs/7B_train/8192_megatron_ckpt_True.py deleted file mode 100644 index 1db58412..00000000 --- a/configs/7B_train/8192_megatron_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/8192_none_ckpt_False.py b/configs/7B_train/8192_none_ckpt_False.py deleted file mode 100644 index 95d686bb..00000000 --- a/configs/7B_train/8192_none_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/8192_none_ckpt_True.py b/configs/7B_train/8192_none_ckpt_True.py deleted file mode 100644 index a63b6f20..00000000 --- a/configs/7B_train/8192_none_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. 
intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/generate.py b/configs/generate.py index 6a58f098..a8a5898a 100644 --- a/configs/generate.py +++ b/configs/generate.py @@ -6,8 +6,8 @@ root_names = ["7B_train_", "13B_train_", "30B_train_"] model_size = ["7B", "13B", "30B"] seq_length = [4096, 8192, 16384, 32768, 65536, 131072, 262144] -sp = ["none", "megatron", "flash-attn", "intern"] -intern_overlap = [False, False, False, True] +sp = ["none", "megatron", "flash-attn", "intern", "intern"] +intern_overlap = [False, False, False, True, False] checkpoint = [False, True] for idx, root_name in enumerate(root_names): @@ -32,13 +32,29 @@ line = line.replace("{sp}", f"\"{sp_mode}\"") line = line.replace("{intern_overlap}", str(intern_overlap[i])) line = line.replace("{checkpoint}", str(ckpt)) - output_file_name = str(seq) + "_" + str(sp_mode) + "_ckpt_" + str(ckpt) + ".py" + output_file_name = str(seq) + "_" + str(sp_mode) + "_overlap_" + str(intern_overlap[i]) + "_ckpt_" + str(ckpt) + ".py" write_file = folder_path + "/" + output_file_name with open(write_file, "w") as file: file.write(line) log_name = root_name + "_" + output_file_name[:-3] - command = f"srun -p llm_s -N 8 -n 64 --ntasks-per-node=8 --gpus-per-task=1 --time=10 python train.py --config {write_file} --profiling 2>&1 | tee ./fstp_logs/{log_name}.log" + skip = True + + if idx == 0 and i == 4: # 7b, intern_overlap = False + skip = False + if idx == 0 and ckpt is True and i == 3: # 7b, ckpt = True + skip = False + if idx == 1: # 13b + skip = False + if idx == 2: # 30b + skip = False + + if skip: + import time; time.sleep(1) + print(f"skip {log_name}", flush=True) + continue + + command = f"srun -p llm_s -N 8 -n 64 --ntasks-per-node=8 --gpus-per-task=1 --time=20 python train.py --config {write_file} --profiling 2>&1 | tee ./fstp_logs/{log_name}.log" process = subprocess.Popen(command, shell=True, executable='/bin/bash') process.wait() \ No newline at end of file From 918dff72579baeb205ed8dc47bce9a2d7aba2c7d Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Wed, 25 Oct 2023 13:47:19 +0800 Subject: [PATCH 055/153] reset moe --- internlm/model/moe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/internlm/model/moe.py b/internlm/model/moe.py index 0865097f..28e5ae6e 100644 --- a/internlm/model/moe.py +++ b/internlm/model/moe.py @@ -53,6 +53,7 @@ def __init__( device=None, dtype=None, ): + super().__init__() assert ( From 363275b500e907cd21e25db4e3bcc54d6acabaf0 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Wed, 25 Oct 2023 14:31:00 +0800 Subject: [PATCH 056/153] add memory print --- internlm/model/overlap_handler.py | 3 ++- train.py | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/internlm/model/overlap_handler.py 
b/internlm/model/overlap_handler.py index 35d8a594..d2fef8db 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -316,7 +316,8 @@ def before_forward(self, scheduler, inputs) -> None: self._overlap_handler.set_forward_mode(True) def after_forward(self, scheduler, outputs) -> None: - pass + print("after forward allocated memory: ", torch.cuda.memory_allocated() / 1024 / 1024 /1024, flush=True) + print("after forward max memory: ", torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024, flush=True) def before_criterion(self, scheduler, outputs, label) -> None: pass diff --git a/train.py b/train.py index ae867287..e1b8dffd 100644 --- a/train.py +++ b/train.py @@ -255,6 +255,8 @@ def main(args): # update parameters, and returns (success_update, grad_norm) trainer_result = trainer.step() assert trainer_result is not None + print("after step: ", torch.cuda.memory_allocated() / 1024 / 1024 /1024, flush=True) + print("after step: ", torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024, flush=True) success_update, grad_norm_groups = trainer_result if success_update: # update parameters successfully From cc20fa271a74bd792476bfb96c3be18660580c1a Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Wed, 25 Oct 2023 16:48:02 +0800 Subject: [PATCH 057/153] reset print memory --- internlm/model/overlap_handler.py | 3 +-- train.py | 2 -- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index d2fef8db..35d8a594 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -316,8 +316,7 @@ def before_forward(self, scheduler, inputs) -> None: self._overlap_handler.set_forward_mode(True) def after_forward(self, scheduler, outputs) -> None: - print("after forward allocated memory: ", torch.cuda.memory_allocated() / 1024 / 1024 /1024, flush=True) - print("after forward max memory: ", torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024, flush=True) + pass def before_criterion(self, scheduler, outputs, label) -> None: pass diff --git a/train.py b/train.py index e1b8dffd..ae867287 100644 --- a/train.py +++ b/train.py @@ -255,8 +255,6 @@ def main(args): # update parameters, and returns (success_update, grad_norm) trainer_result = trainer.step() assert trainer_result is not None - print("after step: ", torch.cuda.memory_allocated() / 1024 / 1024 /1024, flush=True) - print("after step: ", torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024, flush=True) success_update, grad_norm_groups = trainer_result if success_update: # update parameters successfully From d831ddcc1d44a1ed4c8710a2f383529d31a6dc9d Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Thu, 26 Oct 2023 17:41:17 +0800 Subject: [PATCH 058/153] modify the config --- configs/13B_template.py | 4 ++-- configs/30B_template.py | 6 +++--- configs/7B_sft.py | 4 ++-- configs/generate.py | 19 ++----------------- train.py | 5 +++-- 5 files changed, 12 insertions(+), 26 deletions(-) diff --git a/configs/13B_template.py b/configs/13B_template.py index e0e016cc..849c5aa9 100644 --- a/configs/13B_template.py +++ b/configs/13B_template.py @@ -57,7 +57,7 @@ valid_micro_num=4, # defaults to 0, means disable evaluate valid_every=50, - pack_sample_into_one=False, + pack_sample_into_one=True, total_steps=20, skip_batches="", rampup_batch_size="", @@ -65,7 +65,7 @@ min_length=50, # train_folder=TRAIN_FOLDER, # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, + 
empty_cache_and_diag_interval=100, diag_outlier_ratio=1.1, ) diff --git a/configs/30B_template.py b/configs/30B_template.py index 4ac99bf0..d19ece6e 100644 --- a/configs/30B_template.py +++ b/configs/30B_template.py @@ -2,7 +2,7 @@ DO_ALERT = False SEQ_LEN = {seq_len} -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({intern_overlap}) + "_" + str({checkpoint}) +JOB_NAME = "30b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({intern_overlap}) + "_" + str({checkpoint}) HIDDEN_SIZE = 6144 NUM_ATTENTION_HEAD = 48 MLP_RATIO = 8 / 3 @@ -57,7 +57,7 @@ valid_micro_num=4, # defaults to 0, means disable evaluate valid_every=50, - pack_sample_into_one=False, + pack_sample_into_one=True, total_steps=20, skip_batches="", rampup_batch_size="", @@ -65,7 +65,7 @@ min_length=50, # train_folder=TRAIN_FOLDER, # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, + empty_cache_and_diag_interval=100, diag_outlier_ratio=1.1, ) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 4f482656..2d6a3bee 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -57,7 +57,7 @@ # defaults to 0, means disable evaluate valid_every=50, pack_sample_into_one=True, - total_steps=20, + total_steps=50, skip_batches="", rampup_batch_size="", # Datasets with less than 50 rows will be discarded @@ -163,7 +163,7 @@ """ parallel = dict( zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), + tensor=dict(size=8, sp="intern", intern_overlap=True), pipeline=dict(size=1, interleaved_overlap=True), ) diff --git a/configs/generate.py b/configs/generate.py index a8a5898a..038998c7 100644 --- a/configs/generate.py +++ b/configs/generate.py @@ -39,22 +39,7 @@ log_name = root_name + "_" + output_file_name[:-3] - skip = True - - if idx == 0 and i == 4: # 7b, intern_overlap = False - skip = False - if idx == 0 and ckpt is True and i == 3: # 7b, ckpt = True - skip = False - if idx == 1: # 13b - skip = False - if idx == 2: # 30b - skip = False - - if skip: - import time; time.sleep(1) - print(f"skip {log_name}", flush=True) - continue - - command = f"srun -p llm_s -N 8 -n 64 --ntasks-per-node=8 --gpus-per-task=1 --time=20 python train.py --config {write_file} --profiling 2>&1 | tee ./fstp_logs/{log_name}.log" + print(log_name) + command = f"srun -p llm_s -N 8 -n 64 --ntasks-per-node=8 --gpus-per-task=1 --time=30 python train.py --config {write_file} --profiling 2>&1 | tee ./fstp_logs/{log_name}.log" process = subprocess.Popen(command, shell=True, executable='/bin/bash') process.wait() \ No newline at end of file diff --git a/train.py b/train.py index ae867287..f4195964 100644 --- a/train.py +++ b/train.py @@ -309,8 +309,9 @@ def main(args): if memory_profiler is not None: memory_profiler.step() - - prof.step() + + if batch_count % 2 == 0: + prof.step() if gpc.fstp_handler is not None: gpc.fstp_handler.clear_memory_pool() From cbd4f042447ec71ee9523fd0ad646d3c074848cf Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Thu, 26 Oct 2023 20:04:01 +0800 Subject: [PATCH 059/153] add synchronize --- internlm/solver/optimizer/hybrid_zero_optim.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index 6f983f3e..19a79bfd 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -856,7 +856,7 @@ def broadcast_params(self): for handle in handles: handle.wait() - torch.cuda().synchronize() + 
torch.cuda.synchronize() ################## # FP16 Utilities # From 3253cbf48ef23c7e67e340533c16e1a372579f8e Mon Sep 17 00:00:00 2001 From: mwiacx <759046501@qq.com> Date: Thu, 26 Oct 2023 20:21:46 +0800 Subject: [PATCH 060/153] add a new get_tflops_func --- internlm/utils/common.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/internlm/utils/common.py b/internlm/utils/common.py index f3b58c0c..188a634d 100644 --- a/internlm/utils/common.py +++ b/internlm/utils/common.py @@ -220,6 +220,43 @@ def get_megatron_flops( return tflops +def get_megatron_flops_2( + elapsed_time_per_iter, + checkpoint=False, + seq_len=2048, + hidden_size=12, + num_layers=32, + vocab_size=12, + global_batch_size=4, + global_world_size=1, + mlp_ratio=4, + use_swiglu=True, +): + """ + Calc flops based on the paper of Megatron https://deepakn94.github.io/assets/papers/megatron-sc21.pdf + """ + + checkpoint_activations_factor = 4 if checkpoint else 3 + flashattn_activations_factor = 4.5 if checkpoint else 3.5 + + if use_swiglu: + mlp_ratio = mlp_ratio * 3 / 2 + + flops_per_iteration = ( + checkpoint_activations_factor + * (8 + mlp_ratio * 4) + * global_batch_size + * seq_len + * hidden_size**2 + * num_layers + + 4 * global_batch_size * seq_len**2 * hidden_size * num_layers * flashattn_activations_factor + + 6 * global_batch_size * seq_len * hidden_size * vocab_size + ) + + tflops = flops_per_iteration / (elapsed_time_per_iter * global_world_size * (10**12)) + return tflops + + class DummyProfile: """ Dummy Profile. From 8aefb74e02d6083d308a15b4d90309a24e1a093b Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Thu, 26 Oct 2023 20:33:12 +0800 Subject: [PATCH 061/153] add flash tflops --- internlm/train/training_internlm.py | 13 +++++++++++++ train.py | 14 ++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index df3fa88d..a4b2e598 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -406,11 +406,13 @@ def initialize_llm_profile(profiling: bool = False, start_time: str = None): tgs_list = [] tflops_list = [] +tflops_list_2 = [] @llm_timeout(func_name="record_current_batch_training_metrics") def record_current_batch_training_metrics( get_tflops_func, + get_tflops_func_2, logger, writer, success_update, @@ -495,6 +497,7 @@ def record_current_batch_training_metrics( tgs_SMA = round(tgs_statistic["SMA_tg_50"] / tgs_statistic["SMA_time_50"], 2) tflops = get_tflops_func((time.time() - start_time)) + tflops_2 = get_tflops_func_2((time.time() - start_time)) tgs_origin = round( num_tokens_in_batch @@ -506,6 +509,7 @@ def record_current_batch_training_metrics( infos = { "tflops": tflops, + "tflops2": tflops_2, "step": batch_count, "loss": loss.item() - moe_loss.item() if moe_loss is not None else loss.item(), "tgs (tokens/gpu/second)": tgs_origin, @@ -599,6 +603,7 @@ def record_current_batch_training_metrics( if batch_count >= 5: tgs_list.append(tgs_origin) tflops_list.append(tflops) + tflops_list_2.append(tflops_2) if batch_count == gpc.config.data.total_steps - 1: print(tgs_list, flush=True) avg_tgs = sum(tgs_list) / len(tgs_list) @@ -606,9 +611,17 @@ def record_current_batch_training_metrics( if abs(tgs - avg_tgs) > 400: tgs_list.remove(tgs) print(f"avg_tgs: {sum(tgs_list)/len(tgs_list)}", flush=True) + print(tflops_list, flush=True) avg_tflops = sum(tflops_list) / len(tflops_list) for tf in tflops_list.copy(): if abs(tf - avg_tflops) > 10: 
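                    # Steps whose TFLOPS estimate deviates from the running mean by more
                    # than 10 are treated as outliers and dropped before the average is
                    # reported, mirroring the tgs filtering (threshold 400) just above.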
                    tflops_list.remove(tf)
            print(f"avg_tflops: {sum(tflops_list)/len(tflops_list)}", flush=True)
+
+            print(tflops_list_2, flush=True)
+            avg_tflops_2 = sum(tflops_list_2) / len(tflops_list_2)
+            for tf in tflops_list_2.copy():
+                if abs(tf - avg_tflops_2) > 10:
+                    tflops_list_2.remove(tf)
+            print(f"avg_tflops: {sum(tflops_list_2)/len(tflops_list_2)}", flush=True)

diff --git a/train.py b/train.py
index f4195964..45117623 100644
--- a/train.py
+++ b/train.py
@@ -33,6 +33,7 @@ from internlm.utils.common import (
     BatchSkipper,
     get_megatron_flops,
+    get_megatron_flops_2,
     launch_time,
     parse_args,
 )
@@ -111,6 +112,18 @@ def main(args):
         global_world_size=gpc.get_world_size(ParallelMode.GLOBAL),
         mlp_ratio=gpc.config.MLP_RATIO,
     )
+
+    get_tflops_func_2 = partial(
+        get_megatron_flops_2,
+        checkpoint=gpc.config.model.checkpoint,
+        seq_len=gpc.config.SEQ_LEN,
+        hidden_size=gpc.config.model.hidden_size,
+        num_layers=gpc.config.model.num_layers,
+        vocab_size=gpc.config.model.vocab_size,
+        global_batch_size=gpc.config.data.micro_bsz * gpc.config.data.micro_num * gpc.get_world_size(ParallelMode.DATA),
+        global_world_size=gpc.get_world_size(ParallelMode.GLOBAL),
+        mlp_ratio=gpc.config.MLP_RATIO,
+    )
 
     # get and broadcast current time
     current_time = launch_time()
@@ -271,6 +284,7 @@ def main(args):
         # calculate and record the training metrics, eg. loss, accuracy and so on.
         record_current_batch_training_metrics(
             get_tflops_func=get_tflops_func,
+            get_tflops_func_2=get_tflops_func_2,
             logger=logger,
             writer=writer,
             success_update=success_update,

From aa3840fc3853185b03213fe43459f19d5fb80d53 Mon Sep 17 00:00:00 2001
From: yingtongxiong <974106207@qq.com>
Date: Thu, 26 Oct 2023 20:42:24 +0800
Subject: [PATCH 062/153] fix some bugs

---
 internlm/train/training_internlm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py
index a4b2e598..2b806926 100644
--- a/internlm/train/training_internlm.py
+++ b/internlm/train/training_internlm.py
@@ -624,4 +624,4 @@ def record_current_batch_training_metrics(
         for tf in tflops_list_2.copy():
             if abs(tf - avg_tflops_2) > 10:
                 tflops_list_2.remove(tf)
-        print(f"avg_tflops: {sum(tflops_list_2)/len(tflops_list_2)}", flush=True)
+        print(f"avg_tflops_2: {sum(tflops_list_2)/len(tflops_list_2)}", flush=True)

From d831ddcc1d44a1ed4c8710a2f383529d31a6dc9d Mon Sep 17 00:00:00 2001
From: huangting4201 <1538303371@qq.com>
Date: Fri, 27 Oct 2023 20:04:23 +0800
Subject: [PATCH 063/153] feat(model/overlap_handler.py): fix overlap handler to support pp(non-interleaved)

---
 internlm/model/overlap_handler.py | 40 ++++++++++++++++++++++++-------
 1 file changed, 31 insertions(+), 9 deletions(-)

diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py
index 35d8a594..8462def4 100644
--- a/internlm/model/overlap_handler.py
+++ b/internlm/model/overlap_handler.py
@@ -75,6 +75,8 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non
             if child.bias is not None:
                 setattr(child.bias, "_fstp_reduce_scatter_str", f"{_full_name}.bias")
 
+        self.num_blocks = len(self.index_to_fstp_modules)
+
         self._initialize_memory_pool()
         self._register_sync_parameters_hook()
@@ -219,15 +221,25 @@ def _pre_forward_hook_for_out_proj(module: nn.Module, inputs: Any):  # pylint: d
                     self._all_gather_block_weight_memory_pool(block_index - 1)
             else:
                 # start the all-gather for next block
-                if block_index + 1 < gpc.config.NUM_LAYER:
+                if block_index + 1 < self.num_blocks:
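+                    # `self.num_blocks` counts only the transformer blocks owned by this
+                    # rank (len(self.index_to_fstp_modules)), so this bound also holds
+                    # under non-interleaved pipeline parallelism, where each stage keeps
+                    # a subset of layers (gpc.config.NUM_LAYER counted every layer).
+                    # Launching the all-gather for the next block's sharded weights here
+                    # lets the communication overlap with the current block's compute.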
self._all_gather_block_weight_memory_pool(block_index + 1) def _pre_forward_hook_for_module(module: nn.Module, inputs: Any): # pylint: disable=W0613 - handle = self.fstp_global_handle[module] - handle.wait() - if module.bias is not None: - bias_handle = self.bias_global_handle[module] - bias_handle.wait() + if module in self.fstp_global_handle: + handle = self.fstp_global_handle[module] + handle.wait() + if module.bias is not None: + bias_handle = self.bias_global_handle[module] + bias_handle.wait() + else: + weight_handle = all_gather_raw_memory_pool( + module.weight, + self.process_group, + async_op=True, + module=module, + ) + self.fstp_global_handle[module] = weight_handle + weight_handle.wait() def _post_forward_hook_for_module(module: nn.Module, inputs: Any, output: Any): # pylint: disable=W0613 if module in self.fstp_global_handle: @@ -245,12 +257,22 @@ def _post_backward_hook_for_head(module: nn.Module, grad_input, grad_output): # def _pre_backward_hook_for_head(module: nn.Module, grad_output): if self.is_forward is False: - self._all_gather_block_weight_memory_pool(gpc.config.NUM_LAYER - 1) + self._all_gather_block_weight_memory_pool(self.num_blocks - 1) def _pre_backward_hook_for_module(module: nn.Module, grad_output): # pylint: disable=W0613 # wait handle for current module - weight_handle = self.fstp_global_handle[module] - weight_handle.wait() + if module in self.fstp_global_handle: + weight_handle = self.fstp_global_handle[module] + weight_handle.wait() + else: + weight_handle = all_gather_raw_memory_pool( + module.weight, + self.process_group, + async_op=True, + module=module, + ) + self.fstp_global_handle[module] = weight_handle + weight_handle.wait() # start the all-gather for next module module_index = self.fstp_modules.index(module) From 4c1cd5d49ba65fa903183bb1c6759a5e3f5f8b4b Mon Sep 17 00:00:00 2001 From: mwiacx <759046501@qq.com> Date: Tue, 31 Oct 2023 19:39:24 +0800 Subject: [PATCH 064/153] fix async reduce scatter --- internlm/model/overlap_handler.py | 11 +- .../solver/optimizer/hybrid_zero_optim.py | 136 ++++++++++-------- train.py | 17 +-- 3 files changed, 88 insertions(+), 76 deletions(-) diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index 8462def4..7805e111 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -328,13 +328,12 @@ class FSTPOverlapSchedulerHook(SchedulerHook): SchedulerHook for fstp overlap handler """ - def __init__(self, overlap_handler: FSTPOverlapHandler) -> None: - super().__init__() - + def __init__(self, overlap_handler: FSTPOverlapHandler, zero_optim) -> None: self._overlap_handler = overlap_handler + self._zero_optim = zero_optim def before_forward(self, scheduler, inputs) -> None: - if self._overlap_handler is not None: + if self._overlap_handler.model_checkpoint: self._overlap_handler.set_forward_mode(True) def after_forward(self, scheduler, outputs) -> None: @@ -347,11 +346,11 @@ def after_criterion(self, scheduler, loss) -> None: pass def before_backward(self, scheduler, outputs, outputs_grad) -> None: - if self._overlap_handler is not None: + if self._overlap_handler.model_checkpoint: self._overlap_handler.set_forward_mode(False) def after_backward(self, scheduler, inputs_grad) -> None: - pass + self._zero_optim.accumulate_left_grads_after_backward() def post_helper_func(self, scheduler, outputs, label) -> None: pass diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index 19a79bfd..2d04bc64 100644 --- 
a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -66,10 +66,6 @@ def __init__( hysteresis = grad_scal_cfg.hysteresis max_scale = grad_scal_cfg.max_scale - self._fstp_handler = None - if gpc.config.parallel["tensor"]["sp"] == "intern" and gpc.config.parallel["tensor"]["intern_overlap"] is True: - self._fstp_handler = gpc.fstp_handler - # Zero related args reduce_bucket_size = zero_cfg.reduce_bucket_size clip_grad_norm = zero_cfg.clip_grad_norm @@ -133,6 +129,12 @@ def __init__( if self._overlap_sync_param: assert self._param_bcast_sync_handler is not None + if gpc.config.parallel["tensor"]["sp"] == "intern" and gpc.config.parallel["tensor"]["intern_overlap"] is True: + self._fstp_handler = gpc.fstp_handler + else: + self._fstp_handler = None + self._accum_grad_buckets: List[BucketStore] = [] + # iterate over the param group in the optimizer # partition these param groups for data parallel training # and add buffers to parameter store for future access @@ -221,8 +223,7 @@ def __init__( # reduction hook is only used if overlapping communication # if it is stage 1 without overlapping, no hook will be attached - if self._overlap_sync_grad: - self._attach_reduction_hook() + self._attach_reduction_hook() @property def zero_local_rank(self): @@ -289,60 +290,79 @@ def _attach_reduction_hook(self): param_group = self._fp16_param_groups[group_id] for param in param_group: # we should not reduce the param in moe - if param.requires_grad: - reduce_rank = None - - def _define_and_attach(param, reduce_rank=None): - # get the AccumulateGrad object of the param itself - # If these objects are not kept, reduction hooks may not be attached successfully. - accum_grad_obj = get_grad_accumulate_object(param) - self._grad_store.add_accumulate_grad_object(accum_grad_obj) - - reduction_func = partial( - self._store_and_try_reduce_grads_by_bucket, - param=param, - reduce_rank=reduce_rank, - ) + if not param.requires_grad: + continue - reduce_scatter_checker = partial( - self._wait_reduce_scatter_and_accumulate_grads, - param=param, - reduce_rank=reduce_rank, - ) - def reduction_sp_func(): - handle = reduce_tensor( - param.grad, - dtype=None, - dst_rank=reduce_rank, - parallel_mode=ParallelMode.TENSOR, - ) - handle.wait() - - # define hook - # NOT IMPORTANT BUT GOOD TO KNOW: - # args here is not grad, but allow_unreacable and accumulate_grad - def reduce_grad_hook(*args): # pylint: disable=W0613 - if self._fstp_handler is not None: - reduce_scatter_checker() - - if self.skip_grad_reduce is False: - reduction_func() - - # define hook for sequence_parallel - def reduce_grad_hook_sp(*args): # pylint: disable=W0613 - if self.skip_grad_reduce is False: - reduction_sp_func() - - # if sequence_parallel is True, - # the grad of norm should be all-reduce across the tp process group - if gpc.config.parallel.sequence_parallel is True: - if hasattr(param, IS_SEQUENCE_PARALLEL) and getattr(param, IS_SEQUENCE_PARALLEL) is True: - accum_grad_obj_sp = get_grad_accumulate_object(param) - accum_grad_obj_sp.register_hook(reduce_grad_hook_sp) + reduce_rank = None + def _define_and_attach(param, reduce_rank=None): + reduction_func = partial( + self._store_and_try_reduce_grads_by_bucket, + param=param, + reduce_rank=reduce_rank, + ) + + reduce_scatter_checker = partial( + self._wait_reduce_scatter_and_accumulate_grads, + param=param, + reduce_rank=reduce_rank, + ) + + def reduction_sp_func(): + handle = reduce_tensor( + param.grad, + dtype=None, + dst_rank=reduce_rank, + 
parallel_mode=ParallelMode.TENSOR, + ) + handle.wait() + + # define hook + # NOT IMPORTANT BUT GOOD TO KNOW: + # args here is not grad, but allow_unreacable and accumulate_grad + def reduce_grad_hook(*args): # pylint: disable=W0613 + if self.skip_grad_reduce is False: + reduction_func() + + # define hook for real gradient accumulation. + def accum_grad_hook(*args): # pylint: disable=W0613 + reduce_scatter_checker() + + # define hook for sequence_parallel + def reduce_grad_hook_sp(*args): # pylint: disable=W0613 + if self.skip_grad_reduce is False: + reduction_sp_func() + + # get the AccumulateGrad object of the param itself + # If these objects are not kept, reduction hooks may not be attached successfully. + accum_grad_obj = get_grad_accumulate_object(param) + self._grad_store.add_accumulate_grad_object(accum_grad_obj) + + # if sequence_parallel is True, + # the grad of norm should be all-reduce across the tp process group + if ( + gpc.config.parallel.sequence_parallel is True + and hasattr(param, IS_SEQUENCE_PARALLEL) + and getattr(param, IS_SEQUENCE_PARALLEL) is True + ): + accum_grad_obj.register_hook(reduce_grad_hook_sp) + + # we should not only register for parameters which have _fstp_reduce_scatter_str attr. + # we must keep up with reduce_grad_hook. + if self._fstp_handler is not None: + accum_grad_obj.register_hook(accum_grad_hook) + + if self._overlap_sync_grad: accum_grad_obj.register_hook(reduce_grad_hook) - _define_and_attach(param, reduce_rank) + _define_and_attach(param, reduce_rank) + + def accumulate_left_grads_after_backward(self): + if self._fstp_handler is None: + return + + for group_id in range(self.num_param_groups): + self._accum_grads_store_in_bucket(self._accum_grad_buckets[group_id]) def belongs_to_current_rank(self, param) -> bool: """ @@ -633,10 +653,6 @@ def step(self, closure=None): if param.grad is not None: self._store_and_try_reduce_grads_by_bucket(param) - # we need to accumulate gradients left in the accumulate gardient bucket - for group_id in range(self.num_param_groups): - self._accum_grads_store_in_bucket(self._accum_grad_buckets[group_id], reduce_rank=None) - # we need to reduce the gradients left in the communication bucket for group_id in range(self.num_param_groups): self._reduce_grads_stored_in_bucket(self._bucket_store[group_id], reduce_rank=None, last_bucket=True) diff --git a/train.py b/train.py index 45117623..644bbebc 100644 --- a/train.py +++ b/train.py @@ -5,7 +5,7 @@ import time import traceback from functools import partial -from typing import List, Optional +from typing import List import torch import torch.distributed as dist @@ -70,9 +70,7 @@ def initialize_llm_logger(start_time: str): return uniscale_logger -def get_scheduler_hooks( - metric: Optional[AccPerplex] = None, activation_checkpoint: bool = False -) -> List[SchedulerHook]: +def get_scheduler_hooks(metric, zero_optim) -> List[SchedulerHook]: scheduler_hooks: List[SchedulerHook] = [] if metric is not None: @@ -87,9 +85,8 @@ def get_scheduler_hooks( ), ), ) - - if activation_checkpoint: - scheduler_hooks.append(FSTPOverlapSchedulerHook(gpc.fstp_handler)) + if gpc.fstp_handler is not None: + scheduler_hooks.append(FSTPOverlapSchedulerHook(gpc.fstp_handler, zero_optim)) return scheduler_hooks @@ -112,7 +109,7 @@ def main(args): global_world_size=gpc.get_world_size(ParallelMode.GLOBAL), mlp_ratio=gpc.config.MLP_RATIO, ) - + get_tflops_func_2 = partial( get_megatron_flops_2, checkpoint=gpc.config.model.checkpoint, @@ -196,7 +193,7 @@ def main(args): train_dataloader=train_dl, 
lr_scheduler=lr_scheduler, beta2_scheduler=beta2_scheduler, - scheduler_hooks=get_scheduler_hooks(metric, gpc.config.model.checkpoint), + scheduler_hooks=get_scheduler_hooks(metric, optimizer), ) # initialize simple memory profiler @@ -323,7 +320,7 @@ def main(args): if memory_profiler is not None: memory_profiler.step() - + if batch_count % 2 == 0: prof.step() From 6b843253eb7ef6829daa966ff06de0889c664b1c Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Tue, 31 Oct 2023 20:26:36 +0800 Subject: [PATCH 065/153] fix(optimizer/hybrid_zero_optim.py): remove redundant _accum_grad_buckets --- internlm/solver/optimizer/hybrid_zero_optim.py | 1 - 1 file changed, 1 deletion(-) diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index 2d04bc64..0ab63960 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -133,7 +133,6 @@ def __init__( self._fstp_handler = gpc.fstp_handler else: self._fstp_handler = None - self._accum_grad_buckets: List[BucketStore] = [] # iterate over the param group in the optimizer # partition these param groups for data parallel training From b3def4c1628dbba652ffb9b089eeb7be9de584af Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Tue, 31 Oct 2023 20:40:58 +0800 Subject: [PATCH 066/153] fix(optimizer/hybrid_zero_optim.py): add reduce_scatter_overlap switch --- configs/7B_sft.py | 4 ++-- internlm/model/overlap_handler.py | 9 +++++---- internlm/model/utils.py | 4 ++-- internlm/solver/optimizer/hybrid_zero_optim.py | 5 +++-- 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 2d6a3bee..b34a838b 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -57,7 +57,7 @@ # defaults to 0, means disable evaluate valid_every=50, pack_sample_into_one=True, - total_steps=50, + total_steps=10, skip_batches="", rampup_batch_size="", # Datasets with less than 50 rows will be discarded @@ -163,7 +163,7 @@ """ parallel = dict( zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), + tensor=dict(size=8, sp="intern", intern_overlap=True, reduce_scatter_overlap=True), pipeline=dict(size=1, interleaved_overlap=True), ) diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index 7805e111..418c4aa7 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -70,10 +70,11 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non setattr(child, "_fstp_name", name) - _full_name = f"{_chunk_name}.{idx}.{_sub_name}.{name}" - setattr(child.weight, "_fstp_reduce_scatter_str", f"{_full_name}.weight") - if child.bias is not None: - setattr(child.bias, "_fstp_reduce_scatter_str", f"{_full_name}.bias") + if gpc.config.parallel["tensor"].get("reduce_scatter_overlap", False): + _full_name = f"{_chunk_name}.{idx}.{_sub_name}.{name}" + setattr(child.weight, "_fstp_reduce_scatter_str", f"{_full_name}.weight") + if child.bias is not None: + setattr(child.bias, "_fstp_reduce_scatter_str", f"{_full_name}.bias") self.num_blocks = len(self.index_to_fstp_modules) diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 982c0e08..63dd09d7 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -568,7 +568,7 @@ def backward(ctx, grad_output, *args): total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2] ) if world_size > 1: - if 
overlap_handler is not None:
+            if overlap_handler is not None and gpc.config.parallel["tensor"].get("reduce_scatter_overlap", False):
                 grad_weight_async, handle_grad_weight = reduce_scatter_raw_memory_pool(
                     grad_weight, process_group, async_op=True
                 )
@@ -621,7 +621,7 @@ def backward(ctx, grad_output, *args):
         del total_weight
 
     if ctx.needs_input_grad[1]:
-        if world_size > 1 and overlap_handler is None:
+        if world_size > 1 and not (overlap_handler is not None and gpc.config.parallel["tensor"].get("reduce_scatter_overlap", False)):
             handle_grad_weight.wait()
             if grad_bias is not None:
                 handle_grad_bias.wait()
diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py
index 0ab63960..a8b524ac 100644
--- a/internlm/solver/optimizer/hybrid_zero_optim.py
+++ b/internlm/solver/optimizer/hybrid_zero_optim.py
@@ -133,6 +133,7 @@ def __init__(
             self._fstp_handler = gpc.fstp_handler
         else:
             self._fstp_handler = None
+        self._reduce_scatter_overlap = gpc.config.parallel["tensor"].get("reduce_scatter_overlap", False)
 
         # iterate over the param group in the optimizer
         # partition these param groups for data parallel training
@@ -348,7 +349,7 @@ def reduce_grad_hook_sp(*args):  # pylint: disable=W0613
 
             # we should not only register for parameters which have _fstp_reduce_scatter_str attr.
             # we must keep up with reduce_grad_hook.
-            if self._fstp_handler is not None:
+            if self._fstp_handler is not None and self._reduce_scatter_overlap is True:
                 accum_grad_obj.register_hook(accum_grad_hook)
@@ -357,7 +358,7 @@ def reduce_grad_hook_sp(*args):  # pylint: disable=W0613
         _define_and_attach(param, reduce_rank)
 
     def accumulate_left_grads_after_backward(self):
-        if self._fstp_handler is None:
+        if self._fstp_handler is None or self._reduce_scatter_overlap is False:
            return
 
        for group_id in range(self.num_param_groups):

From 10b5056e1ebfe540f1008c97f4b3bcdafe8b22da Mon Sep 17 00:00:00 2001
From: yingtongxiong <974106207@qq.com>
Date: Wed, 1 Nov 2023 12:31:52 +0800
Subject: [PATCH 067/153] fix all-gather overlap when model_checkpoint is 0

---
 configs/7B_sft.py                   | 8 ++++----
 internlm/model/overlap_handler.py   | 2 +-
 internlm/train/training_internlm.py | 6 +++++-
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/configs/7B_sft.py b/configs/7B_sft.py
index b34a838b..99285085 100644
--- a/configs/7B_sft.py
+++ b/configs/7B_sft.py
@@ -1,7 +1,7 @@
 JOB_NAME = "7b_train"
 DO_ALERT = False
 
-SEQ_LEN = 4096
+SEQ_LEN = 2048
 HIDDEN_SIZE = 4096
 NUM_ATTENTION_HEAD = 32
 MLP_RATIO = 8 / 3
@@ -49,9 +49,9 @@
 data = dict(
     seq_len=SEQ_LEN,
     # micro_num means the number of micro_batch contained in one gradient update
-    micro_num=1,
+    micro_num=4,
     # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
+    micro_bsz=2,
     # defaults to the value of micro_num
     valid_micro_num=4,
     # defaults to 0, means disable evaluate
@@ -163,7 +163,7 @@
 """
 parallel = dict(
     zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="intern", intern_overlap=True, reduce_scatter_overlap=True),
+    tensor=dict(size=4, sp="intern", intern_overlap=True, reduce_scatter_overlap=True),
     pipeline=dict(size=1, interleaved_overlap=True),
 )
diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py
index 418c4aa7..db811504 100644
--- a/internlm/model/overlap_handler.py
+++ b/internlm/model/overlap_handler.py
@@ -315,7 +315,7 @@ def _post_backward_hook_for_module(module, grad_input, grad_output):  # pylint:
         # 1.
register post_backward_hook @head module to prefetch for the last block's last module # 2. register pre_backward_hook @fstp_module to wait handle for current module and to prefetch for next module # 3. register post_backward_hook @fstp_module to release resource - if self.model_checkpoint is False: + if not self.model_checkpoint: for head in self.head: head.register_full_backward_hook(_post_backward_hook_for_head) diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 2b806926..2b5a1bb4 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -407,6 +407,7 @@ def initialize_llm_profile(profiling: bool = False, start_time: str = None): tgs_list = [] tflops_list = [] tflops_list_2 = [] +loss_list = [] @llm_timeout(func_name="record_current_batch_training_metrics") @@ -599,11 +600,12 @@ def record_current_batch_training_metrics( step_count=batch_count, cur_step_loss=loss.item(), ) - + loss_list.append(loss.item()) if batch_count >= 5: tgs_list.append(tgs_origin) tflops_list.append(tflops) tflops_list_2.append(tflops_2) + if batch_count == gpc.config.data.total_steps - 1: print(tgs_list, flush=True) avg_tgs = sum(tgs_list) / len(tgs_list) @@ -625,3 +627,5 @@ def record_current_batch_training_metrics( if abs(tf - avg_tflops_2) > 10: tflops_list_2.remove(tf) print(f"avg_tflops_2: {sum(tflops_list_2)/len(tflops_list_2)}", flush=True) + + print("loss: ", loss_list, flush=True) From 48512913567ba88b3280ba660e0c3b5ac60cef55 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Thu, 2 Nov 2023 10:30:16 +0800 Subject: [PATCH 068/153] fix(optimizer/hybrid_zero_optim.py): fix bucket size full judge condition when reduce scatter overlap --- internlm/solver/optimizer/hybrid_zero_optim.py | 2 +- internlm/train/training_internlm.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index a8b524ac..1472aa85 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -404,7 +404,7 @@ def _wait_reduce_scatter_and_accumulate_grads(self, param, reduce_rank: Optional # check if the bucket is full # if full, will reduce the grads already in the bucket # after reduction, the bucket will be empty - if current_bucket.num_elements_in_bucket(reduce_rank) >= self._reduce_bucket_size: + if current_bucket.num_elements_in_bucket(reduce_rank) + param_size > self._reduce_bucket_size: self._accum_grads_store_in_bucket(current_bucket, reduce_rank) # otherwise, add the parameter into bucket. 
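The key change above is that the bucket is now flushed *before* an incoming gradient would overflow it (`num_elements_in_bucket + param_size > bucket_size`), instead of only after the bucket had already reached capacity. A minimal standalone sketch of this pre-check pattern (hypothetical Bucket class, not the real BucketStore API):

    class Bucket:
        """Hypothetical stand-in for BucketStore, for illustration only."""

        def __init__(self, capacity: int):
            self.capacity = capacity
            self.num_elements = 0
            self.grads = []

        def add(self, grad_numel: int, flush):
            # Flush first if this gradient would not fit, mirroring the fixed
            # condition `num_elements_in_bucket(rank) + param_size > bucket_size`.
            if self.num_elements + grad_numel > self.capacity:
                flush(self.grads)
                self.grads.clear()
                self.num_elements = 0
            self.grads.append(grad_numel)
            self.num_elements += grad_numel

With the old `>=` post-check, one oversized gradient could push the reduce-scatter buffer past `reduce_bucket_size` before the next call flushed it; the pre-check keeps the buffer bounded.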
diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 2b5a1bb4..a05f62df 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -600,12 +600,12 @@ def record_current_batch_training_metrics( step_count=batch_count, cur_step_loss=loss.item(), ) + loss_list.append(loss.item()) if batch_count >= 5: tgs_list.append(tgs_origin) tflops_list.append(tflops) tflops_list_2.append(tflops_2) - if batch_count == gpc.config.data.total_steps - 1: print(tgs_list, flush=True) avg_tgs = sum(tgs_list) / len(tgs_list) @@ -627,5 +627,5 @@ def record_current_batch_training_metrics( if abs(tf - avg_tflops_2) > 10: tflops_list_2.remove(tf) print(f"avg_tflops_2: {sum(tflops_list_2)/len(tflops_list_2)}", flush=True) - + print("loss: ", loss_list, flush=True) From 5a18b3b6510ab1f79065c6c0e67ecaa0e581a1af Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Thu, 2 Nov 2023 16:05:07 +0800 Subject: [PATCH 069/153] fix(model/overlap_handler.py): fix last block hook when pp with activation --- internlm/model/overlap_handler.py | 64 ++++++++++++++++--------------- 1 file changed, 33 insertions(+), 31 deletions(-) diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index db811504..ed0a8d22 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -6,6 +6,7 @@ import torch from torch import nn +from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc from internlm.core.naive_amp import NaiveAMPModel from internlm.core.scheduler import SchedulerHook @@ -32,6 +33,7 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non self.bias_global_handle = dict() # key: fstp module; value: module bias global all-gather op handle self.module_to_index = dict() # key: fstp module; value: transformer block index self.index_to_fstp_modules = dict() # key: transformer block index; value: fsdp modules + self.last_block = None self.head = [] self.embedding = [] self.model_checkpoint = gpc.config.model.checkpoint @@ -54,6 +56,7 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non elif isinstance(children, Embedding1D): self.embedding.append(children) elif isinstance(children, nn.ModuleList): + self.last_block = children[len(children) - 1] for idx, block in enumerate(children): self.index_to_fstp_modules[idx] = [] for _sub_name, sub in block.named_children(): @@ -150,39 +153,23 @@ def get_bias_memory(self, module: nn.Module): return self.all_gather_bias_memory_pool[block_index % 2][module._fstp_name] def get_reduce_scatter_memory(self, key): - return_idx = 0 - # if key not in dict if key not in self.reduce_scatter_memory_pool: self.reduce_scatter_memory_pool[key] = [] - # if the data is empty - if len(self.reduce_scatter_memory_pool[key]) == 0: - self.reduce_scatter_memory_pool[key].append( - torch.zeros( - key, dtype=gpc.config.model.get("dtype", torch.half), device=get_current_device() - ).contiguous() - ) - setattr(self.reduce_scatter_memory_pool[key][return_idx], "idle", False) - setattr(self.reduce_scatter_memory_pool[key][return_idx], "index", return_idx) - return self.reduce_scatter_memory_pool[key][return_idx] - else: # if not empty - for index, mem_item in enumerate(self.reduce_scatter_memory_pool[key]): - if mem_item.idle is True: - self.reduce_scatter_memory_pool[key][index].idle = False - return_idx = index - return self.reduce_scatter_memory_pool[key][return_idx] - # if the 
memory pool is all used - cur_len = len(self.reduce_scatter_memory_pool[key]) - self.reduce_scatter_memory_pool[key].append( - torch.zeros( - key, dtype=gpc.config.model.get("dtype", torch.half), device=get_current_device() - ).contiguous() - ) - setattr(self.reduce_scatter_memory_pool[key][cur_len], "idle", False) - return_idx = cur_len - setattr(self.reduce_scatter_memory_pool[key][return_idx], "index", return_idx) - return self.reduce_scatter_memory_pool[key][return_idx] + for index, mem_item in enumerate(self.reduce_scatter_memory_pool[key]): + if mem_item.idle is True: + self.reduce_scatter_memory_pool[key][index].idle = False + return self.reduce_scatter_memory_pool[key][index] + + # if the memory pool is all used + cur_len = len(self.reduce_scatter_memory_pool[key]) + self.reduce_scatter_memory_pool[key].append( + torch.zeros(key, dtype=gpc.config.model.get("dtype", torch.half), device=get_current_device()).contiguous() + ) + setattr(self.reduce_scatter_memory_pool[key][cur_len], "idle", False) + setattr(self.reduce_scatter_memory_pool[key][cur_len], "index", cur_len) + return self.reduce_scatter_memory_pool[key][cur_len] def release_reduce_scatter_memory(self, key, index): self.reduce_scatter_memory_pool[key][index].idle = True @@ -242,6 +229,18 @@ def _pre_forward_hook_for_module(module: nn.Module, inputs: Any): # pylint: dis self.fstp_global_handle[module] = weight_handle weight_handle.wait() + def _pre_forward_hook_for_block(block: nn.Module, inputs: Any): # pylint: disable=W0613 + fstp_modules = self.index_to_fstp_modules[self.num_blocks - 1] + if module in fstp_modules: + weight_handle = all_gather_raw_memory_pool( + module.weight, + self.process_group, + async_op=True, + module=module, + ) + self.fstp_global_handle[module] = weight_handle + weight_handle.wait() + def _post_forward_hook_for_module(module: nn.Module, inputs: Any, output: Any): # pylint: disable=W0613 if module in self.fstp_global_handle: del self.fstp_global_handle[module] @@ -301,8 +300,11 @@ def _post_backward_hook_for_module(module, grad_input, grad_output): # pylint: embedding.register_forward_hook(_post_forward_hook_for_embedding) if self.model_checkpoint: - for head in self.head: - head.register_full_backward_pre_hook(_pre_backward_hook_for_head) + if gpc.is_last_rank(parallel_mode=ParallelMode.PIPELINE): + for head in self.head: + head.register_full_backward_pre_hook(_pre_backward_hook_for_head) + else: + self.last_block.register_forward_pre_hook(_pre_forward_hook_for_block) for out_proj in self.fstp_outs: out_proj.register_forward_pre_hook(_pre_forward_hook_for_out_proj) From 9b1265c59107edd44063684c96446af20892fd25 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Mon, 6 Nov 2023 10:45:08 +0800 Subject: [PATCH 070/153] modify the sp allreduce and support tf32 for fstp linear --- .gitignore | 2 + configs/generate.py | 8 ++ internlm/model/utils.py | 102 +++++++++++++++++- .../solver/optimizer/hybrid_zero_optim.py | 43 ++++---- 4 files changed, 129 insertions(+), 26 deletions(-) diff --git a/.gitignore b/.gitignore index 9bdc7ec7..ef18a4a4 100644 --- a/.gitignore +++ b/.gitignore @@ -152,6 +152,8 @@ fstp_logs/ configs/7B_train/* configs/13B_train/* configs/30B_train/* +configs/test_loss/* +loss_tensorboard/* atb pip diff --git a/configs/generate.py b/configs/generate.py index 038998c7..5f044e72 100644 --- a/configs/generate.py +++ b/configs/generate.py @@ -39,6 +39,14 @@ log_name = root_name + "_" + output_file_name[:-3] + skip = True + + if sp_mode == "intern" and intern_overlap[i] is 
True: + skip = False + + if skip: + continue + print(log_name) command = f"srun -p llm_s -N 8 -n 64 --ntasks-per-node=8 --gpus-per-task=1 --time=30 python train.py --config {write_file} --profiling 2>&1 | tee ./fstp_logs/{log_name}.log" process = subprocess.Popen(command, shell=True, executable='/bin/bash') diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 63dd09d7..4f197b1f 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -627,6 +627,104 @@ def backward(ctx, grad_output, *args): handle_grad_bias.wait() return grad_input, grad_weight, grad_bias, None, None, None, None, None, None +class FSTPFusedDenseFuncTorch(FSTPFusedDenseFunc): + "FusedDenseFunc for FSTP, which is optimized based on flash implementation." + @staticmethod + @custom_bwd + def backward(ctx, grad_output, *args): + grad_output = grad_output.contiguous() + if ctx.return_residual: + (grad_input,) = args + grad_input = grad_input.contiguous() + process_group = ctx.process_group + overlap_handler = ctx.overlap_handler + module = ctx.module + + if ctx.compute_weight_gradient: + x, weight, bias = ctx.saved_tensors + total_x = x + else: + weight, bias = ctx.saved_tensors + total_x = None + batch_shape = grad_output.shape[:-1] + batch_dim = batch_shape.numel() + grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1]) + + world_size = gpc.get_world_size(ParallelMode.TENSOR) + if world_size > 1: + if overlap_handler is not None: + total_weight = gpc.fstp_handler.get_all_gather_memory(module=module) + else: + total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) + handle_weight.wait() + else: + total_weight = weight + + # compute weight grad + if ctx.needs_input_grad[1]: + assert ctx.compute_weight_gradient + grad_weight, grad_bias = linear_bias_wgrad_torch( + total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2] + ) + if world_size > 1: + if overlap_handler is not None and gpc.config.parallel["tensor"].get("reduce_scatter_overlap", False): + grad_weight_async, handle_grad_weight = reduce_scatter_raw_memory_pool( + grad_weight, process_group, async_op=True + ) + assert hasattr(weight, "_fstp_reduce_scatter_str") + overlap_handler.reduce_scatter_handlers[weight._fstp_reduce_scatter_str] = ( + handle_grad_weight, + grad_weight_async, + ) + grad_weight = overlap_handler.get_zero_by_shape( + ( + grad_weight.shape[0] // torch.distributed.get_world_size(process_group), + *grad_weight.shape[1:], + ), + dtype=grad_weight.dtype, + device=grad_weight.device, + ) + if grad_bias is not None: + grad_bias_async, handle_grad_bias = reduce_scatter_raw_memory_pool( + grad_bias, process_group, async_op=True + ) + assert hasattr(bias, "_fstp_reduce_scatter_str") + overlap_handler.reduce_scatter_handlers[bias._fstp_reduce_scatter_str] = ( + handle_grad_bias, + grad_bias_async, + ) + grad_bias = overlap_handler.get_zero_by_shape( + ( + grad_bias.shape[0] // torch.distributed.get_world_size(process_group), + *grad_bias.shape[1:], + ), + dtype=grad_bias.dtype, + device=grad_bias.device, + ) + else: + grad_weight, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) + if grad_bias is not None: + grad_bias, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) + else: + grad_weight = None + grad_bias = grad_output if ctx.needs_input_grad[2] else None + + if ctx.needs_input_grad[0]: + if not ctx.return_residual: + grad_input = F.linear(grad_output, total_weight.t()) + else: + grad_input = 
torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), grad_output, total_weight) + grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) + else: + grad_input = None + del total_weight + + if ctx.needs_input_grad[1]: + if world_size > 1 and not (overlap_handler is not None and gpc.config.parallel["tensor"].get("reduce_scatter_overlap", False)): + handle_grad_weight.wait() + if grad_bias is not None: + handle_grad_bias.wait() + return grad_input, grad_weight, grad_bias, None, None, None, None, None, None def fused_dense_func_torch( x: Tensor, @@ -683,9 +781,7 @@ def fstp_fused_dense_func( if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible: return FSTPFusedDenseFunc.apply(x, weight, bias, return_residual, process_group, module, handler) else: - assert process_group is None - out = F.linear(x, weight, bias) - return out if not return_residual else (out, x) + return FSTPFusedDenseFuncTorch.apply(x, weight, bias, return_residual, process_group, module, handler) def try_import_RMSNorm(): diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index 1472aa85..b2b16dcc 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -308,15 +308,6 @@ def _define_and_attach(param, reduce_rank=None): reduce_rank=reduce_rank, ) - def reduction_sp_func(): - handle = reduce_tensor( - param.grad, - dtype=None, - dst_rank=reduce_rank, - parallel_mode=ParallelMode.TENSOR, - ) - handle.wait() - # define hook # NOT IMPORTANT BUT GOOD TO KNOW: # args here is not grad, but allow_unreacable and accumulate_grad @@ -328,25 +319,11 @@ def reduce_grad_hook(*args): # pylint: disable=W0613 def accum_grad_hook(*args): # pylint: disable=W0613 reduce_scatter_checker() - # define hook for sequence_parallel - def reduce_grad_hook_sp(*args): # pylint: disable=W0613 - if self.skip_grad_reduce is False: - reduction_sp_func() - # get the AccumulateGrad object of the param itself # If these objects are not kept, reduction hooks may not be attached successfully. accum_grad_obj = get_grad_accumulate_object(param) self._grad_store.add_accumulate_grad_object(accum_grad_obj) - # if sequence_parallel is True, - # the grad of norm should be all-reduce across the tp process group - if ( - gpc.config.parallel.sequence_parallel is True - and hasattr(param, IS_SEQUENCE_PARALLEL) - and getattr(param, IS_SEQUENCE_PARALLEL) is True - ): - accum_grad_obj.register_hook(reduce_grad_hook_sp) - # we should not only register for parameters which have _fstp_reduce_scatter_str attr. # we must keep up with reduce_grad_hook. 
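# For readers unfamiliar with the AccumulateGrad objects referenced above, a
# minimal standalone illustration of the mechanism behind
# get_grad_accumulate_object (the parameter `p` and the print hook are
# examples only, not part of the patch):
import torch

p = torch.nn.Parameter(torch.randn(4))
# expand_as records a graph node whose next_functions[0][0] is the
# AccumulateGrad object of `p`; a reference to it must be kept alive or the
# hooks registered on it can be garbage-collected, which is why the optimizer
# stores it via add_accumulate_grad_object.
accum_grad_obj = p.expand_as(p).grad_fn.next_functions[0][0]
accum_grad_obj.register_hook(lambda *args: print("p.grad accumulated"))
(p * 2.0).sum().backward()  # the hook fires once p.grad has been written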
if self._fstp_handler is not None and self._reduce_scatter_overlap is True: @@ -644,6 +621,26 @@ def step(self, closure=None): """ assert closure is None, "closure is not supported by step()" + # do all-reduce for layernorm when sequence_parallel is True + if gpc.config.parallel.sequence_parallel is True: + for group_id in range(len(self._fp16_param_groups)): + norm_bucket = TensorBucket(size=0) + for param in self._fp16_param_groups[group_id]: + if hasattr(param, IS_SEQUENCE_PARALLEL) and getattr(param, IS_SEQUENCE_PARALLEL) is True: + norm_bucket.add_to_bucket(param.grad, allow_oversize=True) + # import pdb; pdb.set_trace() + if not norm_bucket.is_empty(): + norm_bucket.flatten() + norm_bucket.commu_handle = reduce_tensor( + tensor=norm_bucket.get_flat_tensor(), + dtype=None, + dst_rank=None, + parallel_mode=ParallelMode.TENSOR, + ) + norm_bucket.commu_handle.wait() + norm_bucket.unflatten_and_copy() + # norm_bucket.empty() + # if not overlapping communication (no reduction hook is attached) # we need to manually reduce these gradients if not self._overlap_sync_grad: From c517ec5b8cdf9c675f97dcc615bfd39c2ffda010 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 6 Nov 2023 11:57:14 +0800 Subject: [PATCH 071/153] feat(model/overlap_handler.py): delete reduce_scatter_overlap switch --- configs/7B_sft.py | 2 +- internlm/model/overlap_handler.py | 9 ++++----- internlm/model/utils.py | 11 +++++++---- internlm/solver/optimizer/hybrid_zero_optim.py | 17 +++++++---------- 4 files changed, 19 insertions(+), 20 deletions(-) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 99285085..e85d2df8 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -163,7 +163,7 @@ """ parallel = dict( zero1=dict(size=-1, fsdp=False), - tensor=dict(size=4, sp="intern", intern_overlap=True, reduce_scatter_overlap=True), + tensor=dict(size=4, sp="intern", intern_overlap=True), pipeline=dict(size=1, interleaved_overlap=True), ) diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index ed0a8d22..e3198bb7 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -73,11 +73,10 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non setattr(child, "_fstp_name", name) - if gpc.config.parallel["tensor"].get("reduce_scatter_overlap", False): - _full_name = f"{_chunk_name}.{idx}.{_sub_name}.{name}" - setattr(child.weight, "_fstp_reduce_scatter_str", f"{_full_name}.weight") - if child.bias is not None: - setattr(child.bias, "_fstp_reduce_scatter_str", f"{_full_name}.bias") + _full_name = f"{_chunk_name}.{idx}.{_sub_name}.{name}" + setattr(child.weight, "_fstp_reduce_scatter_str", f"{_full_name}.weight") + if child.bias is not None: + setattr(child.bias, "_fstp_reduce_scatter_str", f"{_full_name}.bias") self.num_blocks = len(self.index_to_fstp_modules) diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 4f197b1f..556752aa 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -568,7 +568,7 @@ def backward(ctx, grad_output, *args): total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2] ) if world_size > 1: - if overlap_handler is not None and gpc.config.parallel["tensor"].get("reduce_scatter_overlap", False): + if overlap_handler is not None: grad_weight_async, handle_grad_weight = reduce_scatter_raw_memory_pool( grad_weight, process_group, async_op=True ) @@ -621,14 +621,16 @@ def backward(ctx, grad_output, *args): del total_weight if 
ctx.needs_input_grad[1]: - if world_size > 1 and not (overlap_handler is not None and gpc.config.parallel["tensor"].get("reduce_scatter_overlap", False)): + if world_size > 1 and overlap_handler is None: handle_grad_weight.wait() if grad_bias is not None: handle_grad_bias.wait() return grad_input, grad_weight, grad_bias, None, None, None, None, None, None + class FSTPFusedDenseFuncTorch(FSTPFusedDenseFunc): "FusedDenseFunc for FSTP, which is optimized based on flash implementation." + @staticmethod @custom_bwd def backward(ctx, grad_output, *args): @@ -667,7 +669,7 @@ def backward(ctx, grad_output, *args): total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2] ) if world_size > 1: - if overlap_handler is not None and gpc.config.parallel["tensor"].get("reduce_scatter_overlap", False): + if overlap_handler is not None: grad_weight_async, handle_grad_weight = reduce_scatter_raw_memory_pool( grad_weight, process_group, async_op=True ) @@ -720,12 +722,13 @@ def backward(ctx, grad_output, *args): del total_weight if ctx.needs_input_grad[1]: - if world_size > 1 and not (overlap_handler is not None and gpc.config.parallel["tensor"].get("reduce_scatter_overlap", False)): + if world_size > 1 and overlap_handler is None: handle_grad_weight.wait() if grad_bias is not None: handle_grad_bias.wait() return grad_input, grad_weight, grad_bias, None, None, None, None, None, None + def fused_dense_func_torch( x: Tensor, weight: Tensor, diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index b2b16dcc..9a277ae4 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -133,7 +133,6 @@ def __init__( self._fstp_handler = gpc.fstp_handler else: self._fstp_handler = None - self._reduce_scatter_overlap = gpc.config.parallel["tensor"].get("reduce_scatter_overlap", False) # iterate over the param group in the optimizer # partition these param groups for data parallel training @@ -326,7 +325,7 @@ def accum_grad_hook(*args): # pylint: disable=W0613 # we should not only register for parameters which have _fstp_reduce_scatter_str attr. # we must keep up with reduce_grad_hook. 
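# PATCH 070/071 route fp32 inputs (e.g. running under tf32) through
# FSTPFusedDenseFuncTorch, whose backward computes the weight gradient with a
# plain-PyTorch helper instead of fused_dense_cuda. A sketch of that helper's
# math, with an assumed signature consistent with how linear_bias_wgrad_torch
# is called above:
import torch

def linear_bias_wgrad_torch(my_input: torch.Tensor, grad_output: torch.Tensor, has_d_bias: bool):
    # my_input: (tokens, in_features); grad_output: (tokens, out_features).
    # For y = x @ W.t() + b, dW = dy.t() @ x and db = dy.sum(dim=0).
    grad_weight = torch.matmul(grad_output.t(), my_input)
    grad_bias = grad_output.sum(dim=0) if has_d_bias else None
    return grad_weight, grad_bias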
- if self._fstp_handler is not None and self._reduce_scatter_overlap is True: + if self._fstp_handler is not None: accum_grad_obj.register_hook(accum_grad_hook) if self._overlap_sync_grad: @@ -335,7 +334,7 @@ def accum_grad_hook(*args): # pylint: disable=W0613 _define_and_attach(param, reduce_rank) def accumulate_left_grads_after_backward(self): - if self._fstp_handler is None or self._reduce_scatter_overlap is False: + if self._fstp_handler is None: return for group_id in range(self.num_param_groups): @@ -628,18 +627,16 @@ def step(self, closure=None): for param in self._fp16_param_groups[group_id]: if hasattr(param, IS_SEQUENCE_PARALLEL) and getattr(param, IS_SEQUENCE_PARALLEL) is True: norm_bucket.add_to_bucket(param.grad, allow_oversize=True) - # import pdb; pdb.set_trace() if not norm_bucket.is_empty(): norm_bucket.flatten() norm_bucket.commu_handle = reduce_tensor( - tensor=norm_bucket.get_flat_tensor(), - dtype=None, - dst_rank=None, - parallel_mode=ParallelMode.TENSOR, - ) + tensor=norm_bucket.get_flat_tensor(), + dtype=None, + dst_rank=None, + parallel_mode=ParallelMode.TENSOR, + ) norm_bucket.commu_handle.wait() norm_bucket.unflatten_and_copy() - # norm_bucket.empty() # if not overlapping communication (no reduction hook is attached) # we need to manually reduce these gradients From 7c6d2936b352775443948010a9cfb9ba06080e85 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Mon, 6 Nov 2023 12:04:01 +0800 Subject: [PATCH 072/153] reset the sp allreduce in optimizer --- .../solver/optimizer/hybrid_zero_optim.py | 43 ++++++++++--------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index b2b16dcc..1472aa85 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -308,6 +308,15 @@ def _define_and_attach(param, reduce_rank=None): reduce_rank=reduce_rank, ) + def reduction_sp_func(): + handle = reduce_tensor( + param.grad, + dtype=None, + dst_rank=reduce_rank, + parallel_mode=ParallelMode.TENSOR, + ) + handle.wait() + # define hook # NOT IMPORTANT BUT GOOD TO KNOW: # args here is not grad, but allow_unreacable and accumulate_grad @@ -319,11 +328,25 @@ def reduce_grad_hook(*args): # pylint: disable=W0613 def accum_grad_hook(*args): # pylint: disable=W0613 reduce_scatter_checker() + # define hook for sequence_parallel + def reduce_grad_hook_sp(*args): # pylint: disable=W0613 + if self.skip_grad_reduce is False: + reduction_sp_func() + # get the AccumulateGrad object of the param itself # If these objects are not kept, reduction hooks may not be attached successfully. accum_grad_obj = get_grad_accumulate_object(param) self._grad_store.add_accumulate_grad_object(accum_grad_obj) + # if sequence_parallel is True, + # the grad of norm should be all-reduce across the tp process group + if ( + gpc.config.parallel.sequence_parallel is True + and hasattr(param, IS_SEQUENCE_PARALLEL) + and getattr(param, IS_SEQUENCE_PARALLEL) is True + ): + accum_grad_obj.register_hook(reduce_grad_hook_sp) + # we should not only register for parameters which have _fstp_reduce_scatter_str attr. # we must keep up with reduce_grad_hook. 
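# The restored reduce_grad_hook_sp boils down to a sum all-reduce of each
# IS_SEQUENCE_PARALLEL parameter's gradient over the tensor-parallel group,
# since norm weights are replicated across those ranks under sequence
# parallelism. A minimal sketch with plain torch.distributed (`tp_group` is
# assumed to be an already-initialized ProcessGroup):
import torch.distributed as dist

def allreduce_norm_grad(param, tp_group):
    # Each rank holds a partial gradient from its own sequence shard;
    # summing the partials reproduces the full-sequence gradient everywhere.
    dist.all_reduce(param.grad, op=dist.ReduceOp.SUM, group=tp_group)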
if self._fstp_handler is not None and self._reduce_scatter_overlap is True: @@ -621,26 +644,6 @@ def step(self, closure=None): """ assert closure is None, "closure is not supported by step()" - # do all-reduce for layernorm when sequence_parallel is True - if gpc.config.parallel.sequence_parallel is True: - for group_id in range(len(self._fp16_param_groups)): - norm_bucket = TensorBucket(size=0) - for param in self._fp16_param_groups[group_id]: - if hasattr(param, IS_SEQUENCE_PARALLEL) and getattr(param, IS_SEQUENCE_PARALLEL) is True: - norm_bucket.add_to_bucket(param.grad, allow_oversize=True) - # import pdb; pdb.set_trace() - if not norm_bucket.is_empty(): - norm_bucket.flatten() - norm_bucket.commu_handle = reduce_tensor( - tensor=norm_bucket.get_flat_tensor(), - dtype=None, - dst_rank=None, - parallel_mode=ParallelMode.TENSOR, - ) - norm_bucket.commu_handle.wait() - norm_bucket.unflatten_and_copy() - # norm_bucket.empty() - # if not overlapping communication (no reduction hook is attached) # we need to manually reduce these gradients if not self._overlap_sync_grad: From b5e4d04a9a410aec027a1273eae2d3687ae27834 Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Mon, 6 Nov 2023 12:08:31 +0800 Subject: [PATCH 073/153] fix conflicts --- .../solver/optimizer/hybrid_zero_optim.py | 21 ------------------- 1 file changed, 21 deletions(-) diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index e5927e65..b033539d 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -643,27 +643,6 @@ def step(self, closure=None): """ assert closure is None, "closure is not supported by step()" -<<<<<<< HEAD -======= - # do all-reduce for layernorm when sequence_parallel is True - if gpc.config.parallel.sequence_parallel is True: - for group_id in range(len(self._fp16_param_groups)): - norm_bucket = TensorBucket(size=0) - for param in self._fp16_param_groups[group_id]: - if hasattr(param, IS_SEQUENCE_PARALLEL) and getattr(param, IS_SEQUENCE_PARALLEL) is True: - norm_bucket.add_to_bucket(param.grad, allow_oversize=True) - if not norm_bucket.is_empty(): - norm_bucket.flatten() - norm_bucket.commu_handle = reduce_tensor( - tensor=norm_bucket.get_flat_tensor(), - dtype=None, - dst_rank=None, - parallel_mode=ParallelMode.TENSOR, - ) - norm_bucket.commu_handle.wait() - norm_bucket.unflatten_and_copy() - ->>>>>>> c517ec5b8cdf9c675f97dcc615bfd39c2ffda010 # if not overlapping communication (no reduction hook is attached) # we need to manually reduce these gradients if not self._overlap_sync_grad: From 74754397df336db3c9fd03fb297792f8c4b546d8 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 13 Nov 2023 21:09:59 +0800 Subject: [PATCH 074/153] feat(model/overlap_handler.py): add memory_pool switch and refactor overlap handler --- configs/7B_sft.py | 2 +- internlm/model/overlap_handler.py | 197 ++++++++++-------- internlm/model/utils.py | 23 +- .../solver/optimizer/hybrid_zero_optim.py | 4 +- train.py | 2 +- 5 files changed, 130 insertions(+), 98 deletions(-) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index e85d2df8..63fa67e4 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -163,7 +163,7 @@ """ parallel = dict( zero1=dict(size=-1, fsdp=False), - tensor=dict(size=4, sp="intern", intern_overlap=True), + tensor=dict(size=4, sp="intern", intern_overlap=True, memory_pool=True), pipeline=dict(size=1, interleaved_overlap=True), ) diff --git 
a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index e3198bb7..cb00d229 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -13,6 +13,7 @@ from internlm.model.embedding import Embedding1D from internlm.model.linear import FSTPLinear, ScaleColumnParallelLinear from internlm.model.utils import ( + all_gather_raw, all_gather_raw_bias_memory_pool, all_gather_raw_memory_pool, ) @@ -29,14 +30,17 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non self.fstp_outs = [] self.fstp_modules = [] self.module_name = ["Wqkv", "out_proj", "w1", "w2", "w3"] - self.fstp_global_handle = dict() # key: fstp module; value: module global all-gather op handle + self.weight_global_handle = dict() # key: fstp module; value: module global all-gather op handle self.bias_global_handle = dict() # key: fstp module; value: module bias global all-gather op handle + self.weight_global_output = dict() # key: fstp module; value: module global weight after all-gather op + self.bias_global_output = dict() # key: fstp module; value: module bias global weight after all-gather op self.module_to_index = dict() # key: fstp module; value: transformer block index self.index_to_fstp_modules = dict() # key: transformer block index; value: fsdp modules self.last_block = None self.head = [] self.embedding = [] self.model_checkpoint = gpc.config.model.checkpoint + self.enable_memory_pool = gpc.config.parallel["tensor"].get("memory_pool", False) self.is_forward = True self.reduce_scatter_handlers = {} @@ -60,34 +64,36 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non for idx, block in enumerate(children): self.index_to_fstp_modules[idx] = [] for _sub_name, sub in block.named_children(): - sub_modules = list(sub.children()) - if len(sub_modules) > 0: - for name, child in sub.named_children(): - if name == "out_proj": - self.fstp_outs.append(child) - self.module_to_index[child] = idx - if isinstance(child, FSTPLinear): - self.module_to_index[child] = idx - self.fstp_modules.append(child) - self.index_to_fstp_modules[idx].append(child) - - setattr(child, "_fstp_name", name) - - _full_name = f"{_chunk_name}.{idx}.{_sub_name}.{name}" - setattr(child.weight, "_fstp_reduce_scatter_str", f"{_full_name}.weight") - if child.bias is not None: - setattr(child.bias, "_fstp_reduce_scatter_str", f"{_full_name}.bias") + for name, child in sub.named_children(): + if name == "out_proj": + self.fstp_outs.append(child) + self.module_to_index[child] = idx + if isinstance(child, FSTPLinear): + self.module_to_index[child] = idx + self.fstp_modules.append(child) + self.index_to_fstp_modules[idx].append(child) + + setattr(child, "_fstp_name", name) + + _full_name = f"{_chunk_name}.{idx}.{_sub_name}.{name}" + setattr(child.weight, "_fstp_reduce_scatter_str", f"{_full_name}.weight") + if child.bias is not None: + setattr(child.bias, "_fstp_reduce_scatter_str", f"{_full_name}.bias") self.num_blocks = len(self.index_to_fstp_modules) - self._initialize_memory_pool() + if self.enable_memory_pool: + self._initialize_memory_pool() self._register_sync_parameters_hook() def get_zero_by_shape(self, size: tuple, dtype, device) -> torch.Tensor: - if size not in self.zero_const_pool: - self.zero_const_pool[size] = torch.zeros(*size, dtype=dtype, device=device).contiguous() + if self.enable_memory_pool: + if size not in self.zero_const_pool: + self.zero_const_pool[size] = torch.zeros(*size, dtype=dtype, device=device).contiguous() - return 
self.zero_const_pool[size] + return self.zero_const_pool[size] + else: + return torch.zeros(*size, dtype=dtype, device=device).contiguous() def set_forward_mode(self, flag): self.is_forward = flag @@ -122,14 +128,20 @@ def _initialize_memory_pool(self) -> None: self.all_gather_memory_pool.append(weight) # containing two groups of block weight def clear_memory_pool(self) -> None: + assert self.enable_memory_pool + self.zero_const_pool = {} self.reduce_scatter_memory_pool = {} - def get_all_gather_memory(self, module): + def _get_weight_from_memory_pool(self, module): + assert self.enable_memory_pool + block_index = self.module_to_index[module] return self.all_gather_memory_pool[block_index % 2][module._fstp_name] - def get_bias_memory(self, module: nn.Module): + def _get_bias_from_memory_pool(self, module: nn.Module): + assert self.enable_memory_pool + block_index = self.module_to_index[module] # if the bias memory pool is empty or module has been not allocated memory if len(self.all_gather_bias_memory_pool) == 0: @@ -151,7 +163,21 @@ def get_bias_memory(self, module: nn.Module): return self.all_gather_bias_memory_pool[block_index % 2][module._fstp_name] + def get_weight_all_gather(self, module): + if self.enable_memory_pool: + return self._get_weight_from_memory_pool(module) + else: + return self.weight_global_output[module] + + def get_bias_all_gather(self, module): + if self.enable_memory_pool: + return self._get_bias_from_memory_pool(module) + else: + return self.bias_global_output[module] + def get_reduce_scatter_memory(self, key): + assert self.enable_memory_pool + # if key not in dict if key not in self.reduce_scatter_memory_pool: self.reduce_scatter_memory_pool[key] = [] @@ -171,11 +197,11 @@ def get_reduce_scatter_memory(self, key): return self.reduce_scatter_memory_pool[key][cur_len] def release_reduce_scatter_memory(self, key, index): + assert self.enable_memory_pool self.reduce_scatter_memory_pool[key][index].idle = True - def _all_gather_block_weight_memory_pool(self, block_index: int): - fstp_modules = self.index_to_fstp_modules[block_index] - for module in fstp_modules: + def _all_gather_module_weight(self, module): + if self.enable_memory_pool: if module.bias is not None: bias_handle = all_gather_raw_bias_memory_pool( module.bias, @@ -191,103 +217,102 @@ def _all_gather_block_weight_memory_pool(self, block_index: int): async_op=True, module=module, ) - self.fstp_global_handle[module] = weight_handle + self.weight_global_handle[module] = weight_handle + else: + if module.bias is not None: + bias_output, bias_handle = all_gather_raw( + module.bias, + self.process_group, + async_op=True, + ) + self.bias_global_handle[module] = bias_handle + self.bias_global_output[module] = bias_output + + weight_output, weight_handle = all_gather_raw( + module.weight, + self.process_group, + async_op=True, + ) + self.weight_global_handle[module] = weight_handle + self.weight_global_output[module] = weight_output + + def _all_gather_block_weight(self, block_index: int): + fstp_modules = self.index_to_fstp_modules[block_index] + for module in fstp_modules: + self._all_gather_module_weight(module) def _register_sync_parameters_hook(self) -> None: """ register forward hooks and backward hooks for fstp modules. 
""" + def _wait_handle(module): + handle = self.weight_global_handle[module] + handle.wait() + if module.bias is not None: + bias_handle = self.bias_global_handle[module] + bias_handle.wait() + + def _clear_handle(module): + if module in self.weight_global_handle: + del self.weight_global_handle[module] + if module in self.bias_global_handle: + del self.bias_global_handle[module] + # if module in self.weight_global_output: + # del self.weight_global_output[module] + # if module in self.bias_global_output: + # del self.bias_global_output[module] + def _post_forward_hook_for_embedding(module: nn.Module, inputs: Any, output: Any): # pylint: disable=W0613 - self._all_gather_block_weight_memory_pool(0) + self._all_gather_block_weight(0) def _pre_forward_hook_for_out_proj(module: nn.Module, inputs: Any): # pylint: disable=W0613 block_index = self.module_to_index[module] if self.model_checkpoint and self.is_forward is False: if block_index - 1 >= 0: - self._all_gather_block_weight_memory_pool(block_index - 1) + self._all_gather_block_weight(block_index - 1) else: # start the all-gather for next block if block_index + 1 < self.num_blocks: - self._all_gather_block_weight_memory_pool(block_index + 1) + self._all_gather_block_weight(block_index + 1) def _pre_forward_hook_for_module(module: nn.Module, inputs: Any): # pylint: disable=W0613 - if module in self.fstp_global_handle: - handle = self.fstp_global_handle[module] - handle.wait() - if module.bias is not None: - bias_handle = self.bias_global_handle[module] - bias_handle.wait() - else: - weight_handle = all_gather_raw_memory_pool( - module.weight, - self.process_group, - async_op=True, - module=module, - ) - self.fstp_global_handle[module] = weight_handle - weight_handle.wait() + if module not in self.weight_global_handle: + self._all_gather_module_weight(module) + + _wait_handle(module) def _pre_forward_hook_for_block(block: nn.Module, inputs: Any): # pylint: disable=W0613 fstp_modules = self.index_to_fstp_modules[self.num_blocks - 1] if module in fstp_modules: - weight_handle = all_gather_raw_memory_pool( - module.weight, - self.process_group, - async_op=True, - module=module, - ) - self.fstp_global_handle[module] = weight_handle - weight_handle.wait() + self._all_gather_module_weight(module) + _wait_handle(module) def _post_forward_hook_for_module(module: nn.Module, inputs: Any, output: Any): # pylint: disable=W0613 - if module in self.fstp_global_handle: - del self.fstp_global_handle[module] + _clear_handle(module) def _post_backward_hook_for_head(module: nn.Module, grad_input, grad_output): # pylint: disable=W0613 - first_backward_module = self.fstp_modules[-1] - weight_handle = all_gather_raw_memory_pool( - first_backward_module.weight, - self.process_group, - async_op=True, - module=first_backward_module, - ) - self.fstp_global_handle[first_backward_module] = weight_handle + self._all_gather_module_weight(self.fstp_modules[-1]) def _pre_backward_hook_for_head(module: nn.Module, grad_output): if self.is_forward is False: - self._all_gather_block_weight_memory_pool(self.num_blocks - 1) + self._all_gather_block_weight(self.num_blocks - 1) def _pre_backward_hook_for_module(module: nn.Module, grad_output): # pylint: disable=W0613 # wait handle for current module - if module in self.fstp_global_handle: - weight_handle = self.fstp_global_handle[module] - weight_handle.wait() - else: - weight_handle = all_gather_raw_memory_pool( - module.weight, - self.process_group, - async_op=True, - module=module, - ) - self.fstp_global_handle[module] = 
weight_handle - weight_handle.wait() + if module not in self.weight_global_handle: + self._all_gather_module_weight(module) + + _wait_handle(module) # start the all-gather for next module module_index = self.fstp_modules.index(module) if module_index - 1 >= 0: next_module = self.fstp_modules[module_index - 1] - weight_handle = all_gather_raw_memory_pool( - next_module.weight, - self.process_group, - async_op=True, - module=next_module, - ) - self.fstp_global_handle[next_module] = weight_handle + self._all_gather_module_weight(next_module) def _post_backward_hook_for_module(module, grad_input, grad_output): # pylint: disable=W0613 - if module in self.fstp_global_handle: - del self.fstp_global_handle[module] + _clear_handle(module) # register forward hooks # 1. register post_forward_hook @embedding module to prefetch for block 0 diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 556752aa..45d2f51a 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -132,7 +132,7 @@ def all_gather_raw_memory_pool( module: nn.Module = None, ): handle = torch.distributed.all_gather_into_tensor( - gpc.fstp_handler.get_all_gather_memory(module=module), + gpc.fstp_handler.get_weight_all_gather(module=module), input_.contiguous(), group=process_group, async_op=async_op, @@ -147,7 +147,7 @@ def all_gather_raw_bias_memory_pool( module: nn.Module = None, ): handle = torch.distributed.all_gather_into_tensor( - gpc.fstp_handler.get_bias_memory(module=module), + gpc.fstp_handler.get_bias_all_gather(module=module), input_.contiguous(), group=process_group, async_op=async_op, @@ -177,8 +177,13 @@ def reduce_scatter_raw(input_: Tensor, process_group: ProcessGroup, async_op: bo def reduce_scatter_raw_memory_pool(input_: Tensor, process_group: ProcessGroup, async_op: bool = False): world_size = torch.distributed.get_world_size(process_group) assert input_.shape[0] % world_size == 0 - size = (input_.shape[0] // world_size, *input_.shape[1:]) - output = gpc.fstp_handler.get_reduce_scatter_memory(size) + if gpc.fstp_handler.enable_memory_pool: + size = (input_.shape[0] // world_size, *input_.shape[1:]) + output = gpc.fstp_handler.get_reduce_scatter_memory(size) + else: + output = torch.empty( + input_.shape[0] // world_size, *input_.shape[1:], dtype=input_.dtype, device=input_.device + ).contiguous() handle = torch.distributed.reduce_scatter_tensor( output, input_.contiguous(), group=process_group, async_op=async_op ) @@ -493,14 +498,14 @@ def forward( if world_size > 1: # do all_gather for weight and bias before actual computation if overlap_handler is not None: - total_weight = gpc.fstp_handler.get_all_gather_memory(module=module) + total_weight = gpc.fstp_handler.get_weight_all_gather(module=module) else: total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) handle_weight.wait() - # TODO memory pool for bias + if bias is not None: if overlap_handler is not None: - total_bias = gpc.fstp_handler.get_bias_memory(module=module) + total_bias = gpc.fstp_handler.get_bias_all_gather(module=module) else: total_bias, handle_bias = all_gather_raw(bias, process_group, async_op=True) handle_bias.wait() @@ -554,7 +559,7 @@ def backward(ctx, grad_output, *args): world_size = gpc.get_world_size(ParallelMode.TENSOR) if world_size > 1: if overlap_handler is not None: - total_weight = gpc.fstp_handler.get_all_gather_memory(module=module) + total_weight = gpc.fstp_handler.get_weight_all_gather(module=module) else: total_weight, handle_weight = all_gather_raw(weight, process_group, 
async_op=True) handle_weight.wait() @@ -655,7 +660,7 @@ def backward(ctx, grad_output, *args): world_size = gpc.get_world_size(ParallelMode.TENSOR) if world_size > 1: if overlap_handler is not None: - total_weight = gpc.fstp_handler.get_all_gather_memory(module=module) + total_weight = gpc.fstp_handler.get_weight_all_gather(module=module) else: total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) handle_weight.wait() diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index b033539d..3092a625 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -389,7 +389,9 @@ def _accum_grads_store_in_bucket(self, bucket: BucketStore, reduce_rank: Optiona _param.grad.add_(_grad) # release cuda memory. - self._fstp_handler.release_reduce_scatter_memory(key=tuple(_grad.size()), index=_grad.index) + if self._fstp_handler.enable_memory_pool: + self._fstp_handler.release_reduce_scatter_memory(key=tuple(_grad.size()), index=_grad.index) + _grad = None self._fstp_handler.reduce_scatter_handlers[_key] = None bucket.reset_by_rank(reduce_rank) diff --git a/train.py b/train.py index 644bbebc..5ea91e8c 100644 --- a/train.py +++ b/train.py @@ -324,7 +324,7 @@ def main(args): if batch_count % 2 == 0: prof.step() - if gpc.fstp_handler is not None: + if gpc.fstp_handler is not None and gpc.fstp_handler.enable_memory_pool: gpc.fstp_handler.clear_memory_pool() # torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") torch.cuda.reset_peak_memory_stats() From 3c07423151924f7350d8e7f7b93d8150721c61df Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Tue, 14 Nov 2023 11:30:26 +0800 Subject: [PATCH 075/153] feat(model/overlap_handler.py): release weight --- internlm/model/overlap_handler.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index cb00d229..715fa467 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -258,10 +258,12 @@ def _clear_handle(module): del self.weight_global_handle[module] if module in self.bias_global_handle: del self.bias_global_handle[module] - # if module in self.weight_global_output: - # del self.weight_global_output[module] - # if module in self.bias_global_output: - # del self.bias_global_output[module] + + def _clear_weight(module): + if module in self.weight_global_output: + del self.weight_global_output[module] + if module in self.bias_global_output: + del self.bias_global_output[module] def _post_forward_hook_for_embedding(module: nn.Module, inputs: Any, output: Any): # pylint: disable=W0613 self._all_gather_block_weight(0) @@ -290,6 +292,8 @@ def _pre_forward_hook_for_block(block: nn.Module, inputs: Any): # pylint: disab def _post_forward_hook_for_module(module: nn.Module, inputs: Any, output: Any): # pylint: disable=W0613 _clear_handle(module) + if not self.model_checkpoint: + _clear_weight(module) def _post_backward_hook_for_head(module: nn.Module, grad_input, grad_output): # pylint: disable=W0613 self._all_gather_module_weight(self.fstp_modules[-1]) @@ -313,6 +317,7 @@ def _pre_backward_hook_for_module(module: nn.Module, grad_output): # pylint: di def _post_backward_hook_for_module(module, grad_input, grad_output): # pylint: disable=W0613 _clear_handle(module) + _clear_weight(module) # register forward hooks # 1. 
register post_forward_hook @embedding module to prefetch for block 0 From a1fd8778288b0ab76d20fa39290c8fe62cd5e654 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Wed, 15 Nov 2023 14:40:06 +0800 Subject: [PATCH 076/153] fix(train.py): clear memory pool before optim step --- train.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/train.py b/train.py index 5ea91e8c..789094ac 100644 --- a/train.py +++ b/train.py @@ -220,7 +220,7 @@ def main(args): # start iterating the train data and begin training for batch_count in range(train_state.batch_count, total_steps): empty_cache_and_diag(batch_count, interval=gpc.config.data.empty_cache_and_diag_interval) - # torch.cuda.memory._record_memory_history() + torch.cuda.memory._record_memory_history() start_time = time.time() timer("one-batch").start() @@ -262,6 +262,9 @@ def main(args): ) timer("fwd-bwd").stop() + if gpc.fstp_handler is not None and gpc.fstp_handler.enable_memory_pool: + gpc.fstp_handler.clear_memory_pool() + # update parameters, and returns (success_update, grad_norm) trainer_result = trainer.step() assert trainer_result is not None @@ -324,9 +327,7 @@ def main(args): if batch_count % 2 == 0: prof.step() - if gpc.fstp_handler is not None and gpc.fstp_handler.enable_memory_pool: - gpc.fstp_handler.clear_memory_pool() - # torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") + torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") torch.cuda.reset_peak_memory_stats() ckpt_manager.wait_async_upload_finish() @@ -353,3 +354,5 @@ def main(args): mm.monitor_exception( alert_address=gpc.config.monitor.alert.feishu_alert_address, excp_info=traceback.format_exc() ) + + torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") From a80fcf8628bcfde37b65e09899679c814224a4e3 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Tue, 28 Nov 2023 19:33:55 +0800 Subject: [PATCH 077/153] feat(model): refactor weight and os and data patition strategy --- configs/7B_sft.py | 6 +- internlm/core/context/__init__.py | 1 + internlm/core/context/parallel_context.py | 39 +- .../core/context/process_group_initializer.py | 378 +++++-- internlm/initialize/launch.py | 7 + internlm/model/embedding.py | 56 +- internlm/model/linear.py | 23 + internlm/model/modeling_internlm.py | 31 +- internlm/model/multi_head_attention.py | 13 +- internlm/model/overlap_handler.py | 28 +- internlm/model/utils.py | 4 +- .../solver/optimizer/hybrid_zero_optim2.py | 983 ++++++++++++++++++ internlm/solver/optimizer/utils.py | 14 +- internlm/train/training_internlm.py | 6 +- internlm/utils/parallel.py | 12 +- train.py | 6 +- 16 files changed, 1498 insertions(+), 109 deletions(-) create mode 100644 internlm/solver/optimizer/hybrid_zero_optim2.py diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 63fa67e4..3c491660 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -162,9 +162,11 @@ defaults to False. 
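For orientation, the reworked layout configured below replaces the single tensor-parallel axis with two orthogonal factorizations of the world size: WEIGHT x WEIGHT_DATA for parameters and SEQUENCE x DATA for tokens. A minimal sketch of the derived sizes (illustrative only, not part of the patch; values taken from the docstring examples later in this commit):

    world_size = 32
    weight_size = 8       # parallel.weight.size: ranks that shard each parameter
    sequence_size = 4     # parallel.sequence: ranks that shard each sequence

    data_size = world_size // sequence_size         # 8, derived, never set by hand
    weight_data_size = world_size // weight_size    # 4, ZeRO states are sharded along this axis

    assert world_size == sequence_size * data_size == weight_size * weight_data_size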
""" parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=4, sp="intern", intern_overlap=True, memory_pool=True), + zero1=dict(size=2, fsdp=False), + tensor=dict(size=1, sp="intern", intern_overlap=False, memory_pool=False), pipeline=dict(size=1, interleaved_overlap=True), + weight=dict(size=8, overlap=True, memory_pool=True), + sequence=4, ) cudnn_deterministic = False diff --git a/internlm/core/context/__init__.py b/internlm/core/context/__init__.py index 6f1142cb..e17b4ba3 100644 --- a/internlm/core/context/__init__.py +++ b/internlm/core/context/__init__.py @@ -1,6 +1,7 @@ from .parallel_context import ( IS_SEQUENCE_PARALLEL, IS_TENSOR_PARALLEL, + IS_WEIGHT_PARALLEL, Config, ParallelContext, global_context, diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py index 633dfe40..8d34f608 100644 --- a/internlm/core/context/parallel_context.py +++ b/internlm/core/context/parallel_context.py @@ -26,6 +26,7 @@ IS_TENSOR_PARALLEL = "is_tensor_parallel" IS_SEQUENCE_PARALLEL = "is_sequence_parallel" +IS_WEIGHT_PARALLEL = "is_weight_parallel" logger = get_logger(__file__) @@ -289,10 +290,15 @@ def is_first_rank(self, parallel_mode: ParallelMode): def is_rank_for_log(self): """Returns a boolean value indicating whether the current device should print log.""" + # is_log_rank = ( + # self.is_first_rank(ParallelMode.DATA) + # and self.is_first_rank(ParallelMode.TENSOR) + # and self.is_last_rank(ParallelMode.PIPELINE) + # ) is_log_rank = ( - self.is_first_rank(ParallelMode.DATA) - and self.is_first_rank(ParallelMode.TENSOR) - and self.is_last_rank(ParallelMode.PIPELINE) + self.is_first_rank(ParallelMode.WEIGHT) + and self.is_first_rank(ParallelMode.DATA) + and self.is_first_rank(ParallelMode.WEIGHT_DATA) ) return is_log_rank @@ -426,11 +432,11 @@ def check_sanity(self): pps = self.pipeline_parallel_size tps = self.tensor_parallel_size ws = self.world_size - assert ws == dps * pps * tps, ( - f"Expected the world size {ws} to be equal to data" - f" parallel size ({dps}) * pipeline parallel size " - f"({pps}) * tensor parallel size ({tps})" - ) + # assert ws == dps * pps * tps, ( + # f"Expected the world size {ws} to be equal to data" + # f" parallel size ({dps}) * pipeline parallel size " + # f"({pps}) * tensor parallel size ({tps})" + # ) assert self.zero1_parallel_size > 0 assert self.data_parallel_size % self.zero1_parallel_size == 0 @@ -467,20 +473,23 @@ def init_parallel_groups(self): # set parallel size as attributes for global context parallel_config = self.config.get("parallel", None) if parallel_config is not None: + self._set_parallel_size_from_config(parallel_config, "weight", "weight_parallel_size") + self._set_parallel_size_from_config(parallel_config, "sequence", "sequence_parallel_size") self._set_parallel_size_from_config(parallel_config, "pipeline", "pipeline_parallel_size") self._set_parallel_size_from_config(parallel_config, "tensor", "tensor_parallel_size") self._set_parallel_size_from_config(parallel_config, "zero1", "zero1_parallel_size") # the user should not set the data parallel size manually # instead, it should be calculated based on other parallel config - self.data_parallel_size = self.world_size // (self.pipeline_parallel_size * self.tensor_parallel_size) + assert self.tensor_parallel_size == 1 + assert self.pipeline_parallel_size == 1 + assert self.zero1_parallel_size >= 1 + self.data_parallel_size = self.world_size // self.sequence_parallel_size + self.weight_data_parallel_size = self.world_size // 
self.weight_parallel_size # the recommended nettest_parallel_size is 32 GPUs self.nettest_parallel_size = 32 - if self.zero1_parallel_size <= 0: - self.zero1_parallel_size = self.data_parallel_size - assert ( self.data_parallel_size % self.config.model.get("num_experts", 1) == 0 or self.config.model.get("num_experts", 1) % self.data_parallel_size == 0 @@ -496,6 +505,8 @@ def init_parallel_groups(self): initializer_args = [ rank, world_size, + self.weight_parallel_size, + self.sequence_parallel_size, self.data_parallel_size, self.pipeline_parallel_size, self.tensor_parallel_size, @@ -506,7 +517,10 @@ def init_parallel_groups(self): # run initialization of different process groups initializers = [] + initializers.append(pgroup_initializer.Initializer_Weight(*initializer_args)) + initializers.append(pgroup_initializer.Initializer_Sequence(*initializer_args)) initializers.append(pgroup_initializer.Initializer_Data(*initializer_args)) + initializers.append(pgroup_initializer.Initializer_Weight_Data(*initializer_args)) initializers.append(pgroup_initializer.Initializer_Model(*initializer_args)) initializers.append(pgroup_initializer.Initializer_Tensor(*initializer_args)) initializers.append(pgroup_initializer.Initializer_Zero1(*initializer_args)) @@ -573,6 +587,7 @@ def set_seed(self, seed: int, dpseed_with_tpoffset: bool = False): if dpseed_with_tpoffset: dp_seed = seed + pipeline_offset * 1024 add_seed(ParallelMode.DATA, dp_seed) + add_seed(ParallelMode.WEIGHT_DATA, dp_seed) add_seed(ParallelMode.DUMMY, dp_seed) # model parallel seeds are different across ranks diff --git a/internlm/core/context/process_group_initializer.py b/internlm/core/context/process_group_initializer.py index e9afa2ec..ee81ac58 100644 --- a/internlm/core/context/process_group_initializer.py +++ b/internlm/core/context/process_group_initializer.py @@ -51,6 +51,15 @@ class ParallelMode(Enum): # dummy mode, only used during mode construction DUMMY = "dummy" + # weight parallel + WEIGHT = "weight" + + # weight data parallel + WEIGHT_DATA = "weight_data" + + # sequence parallel + SEQUENCE = "sequence" + class ProcessGroupInitializer(ABC): """An object, knowing the parallelism configuration, that initializes parallel groups. @@ -69,6 +78,8 @@ def __init__( self, rank: int, world_size: int, + weight_parallel_size: int, + sequence_parallel_size: int, data_parallel_size: int, pipeline_parallel_size: int, tensor_parallel_size: int, @@ -78,6 +89,8 @@ def __init__( ): self.rank = rank self.world_size = world_size + self.weight_parallel_size = weight_parallel_size + self.sequence_parallel_size = sequence_parallel_size self.data_parallel_size = data_parallel_size self.pipeline_parallel_size = pipeline_parallel_size self.tensor_parallel_size = tensor_parallel_size @@ -91,59 +104,59 @@ def init_dist_group(self, use_cpu: bool = False): pass -class Initializer_Data(ProcessGroupInitializer): - """A ProcessGroupInitializer for data parallelism. - - Args: - rank (int): The rank of current process. - world_size (int): Size of whole communication world. - data_parallel_size (int): Size of data parallel. - pipeline_parallel_size (int): Size of pipeline parallel. - tensor_parallel_size (int): Size of tensor parallel. - zero1_parallel_size (int): Size of zero1 parallel. - expert_parallel_size (int): Size of expert parallel. 
- """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.rank_num_per_dp_group = self.world_size // self.data_parallel_size - - assert self.world_size % self.data_parallel_size == 0 - - def init_dist_group(self, use_cpu: bool = False): - """Initialize data parallel groups, and assign local_ranks and groups to each gpu. - - Returns: - Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode): - A Data parallelism's information tuple. - """ - local_rank = None - ranks_in_group = None - process_group = None - cpu_group = None - group_world_size = None - mode = ParallelMode.DATA - - for i in range(self.rank_num_per_dp_group): - ranks = [i + j * self.rank_num_per_dp_group for j in range(self.data_parallel_size)] - group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT) - if use_cpu: - group_cpu = ( - dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT) - if dist.get_backend() != "gloo" - else group - ) - else: - group_cpu = None - - if self.rank in ranks: - local_rank = ranks.index(self.rank) - group_world_size = len(ranks) - process_group = group - cpu_group = group_cpu - ranks_in_group = ranks - - return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode +# class Initializer_Data(ProcessGroupInitializer): +# """A ProcessGroupInitializer for data parallelism. + +# Args: +# rank (int): The rank of current process. +# world_size (int): Size of whole communication world. +# data_parallel_size (int): Size of data parallel. +# pipeline_parallel_size (int): Size of pipeline parallel. +# tensor_parallel_size (int): Size of tensor parallel. +# zero1_parallel_size (int): Size of zero1 parallel. +# expert_parallel_size (int): Size of expert parallel. +# """ + +# def __init__(self, *args, **kwargs): +# super().__init__(*args, **kwargs) +# self.rank_num_per_dp_group = self.world_size // self.data_parallel_size + +# assert self.world_size % self.data_parallel_size == 0 + +# def init_dist_group(self, use_cpu: bool = False): +# """Initialize data parallel groups, and assign local_ranks and groups to each gpu. + +# Returns: +# Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode): +# A Data parallelism's information tuple. +# """ +# local_rank = None +# ranks_in_group = None +# process_group = None +# cpu_group = None +# group_world_size = None +# mode = ParallelMode.DATA + +# for i in range(self.rank_num_per_dp_group): +# ranks = [i + j * self.rank_num_per_dp_group for j in range(self.data_parallel_size)] +# group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT) +# if use_cpu: +# group_cpu = ( +# dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT) +# if dist.get_backend() != "gloo" +# else group +# ) +# else: +# group_cpu = None + +# if self.rank in ranks: +# local_rank = ranks.index(self.rank) +# group_world_size = len(ranks) +# process_group = group +# cpu_group = group_cpu +# ranks_in_group = ranks + +# return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode class Initializer_Model(ProcessGroupInitializer): @@ -329,6 +342,8 @@ class Initializer_Zero1(ProcessGroupInitializer): Args: rank (int): The rank of current process. world_size (int): Size of whole communication world. + weight_parallel_size (int): Size of model weight parallel. + sequence_parallel_size (int): Size of data sequence parallel. data_parallel_size (int): Size of data parallel. pipeline_parallel_size (int): Size of pipeline parallel. tensor_parallel_size (int): Size of tensor parallel. 
@@ -338,11 +353,12 @@ class Initializer_Zero1(ProcessGroupInitializer):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.rank_num_per_dp_group = self.world_size // self.data_parallel_size
-        self.num_zero1_parallel_group = self.data_parallel_size // self.zero1_parallel_size
+        self.num_zero1_parallel_group = self.world_size // self.zero1_parallel_size
+        self.weight_zero1_size = self.weight_parallel_size * self.zero1_parallel_size
+        self.num_weight_zero1_parallel_group = self.world_size // self.weight_zero1_size
 
-        assert self.world_size % self.data_parallel_size == 0
         assert self.world_size % self.zero1_parallel_size == 0
+        assert self.world_size % self.weight_zero1_size == 0
 
     def init_dist_group(self, use_cpu: bool = False):
         """Initialize zero1 parallel groups, and assign local_ranks and groups to each gpu.
@@ -350,6 +366,11 @@ def init_dist_group(self, use_cpu: bool = False):
         Returns:
             Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
                 A zero1 parallelism's information tuple.
+
+        n=32 wp=8 sp=4 zo1=2
+        wp groups: [0-7] [8-15] [16-23] [24-31]
+        zo1 groups: [0,8] [1,9] [2,10] [3,11] [4,12] [5,13] [6,14] [7,15]
+                    [16,24] [17,25] [18,26] [19,27] [20,28] [21,29] [22,30] [23,31]
         """
         local_rank = None
         ranks_in_group = None
@@ -358,10 +379,10 @@ def init_dist_group(self, use_cpu: bool = False):
         group_world_size = None
         mode = ParallelMode.ZERO1
 
-        for i in range(self.rank_num_per_dp_group):
-            for j in range(self.num_zero1_parallel_group):
+        for i in range(self.num_weight_zero1_parallel_group):
+            for j in range(self.weight_parallel_size):
                 ranks = [
-                    i + (j * self.zero1_parallel_size + k) * self.rank_num_per_dp_group
+                    i * self.weight_zero1_size + j + k * self.weight_parallel_size
                     for k in range(self.zero1_parallel_size)
                 ]
                 group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT)
@@ -658,3 +679,242 @@ def init_dist_group(self, use_cpu: bool = False):
             ranks_in_group = ranks
 
         return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode
+
+
+class Initializer_Weight(ProcessGroupInitializer):
+    """A ProcessGroupInitializer for model weight parallelism.
+
+    Args:
+        rank (int): The rank of current process.
+        world_size (int): Size of whole communication world.
+        weight_parallel_size (int): Size of model weight parallel.
+        sequence_parallel_size (int): Size of data sequence parallel.
+        data_parallel_size (int): Size of data parallel.
+        pipeline_parallel_size (int): Size of pipeline parallel.
+        tensor_parallel_size (int): Size of tensor parallel.
+        zero1_parallel_size (int): Size of zero1 parallel.
+        expert_parallel_size (int): Size of expert parallel.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.num_weight_parallel_group = self.world_size // self.weight_parallel_size
+
+        assert self.world_size % self.weight_parallel_size == 0
+
+    def init_dist_group(self, use_cpu: bool = False):
+        """Initialize model weight parallel groups, and assign local_ranks and groups to each gpu.
+
+        Returns:
+            Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
+                A Weight parallelism's information tuple.
+ """ + local_rank = None + ranks_in_group = None + process_group = None + cpu_group = None + group_world_size = None + mode = ParallelMode.WEIGHT + + for i in range(self.num_weight_parallel_group): + ranks = [i * self.weight_parallel_size + j for j in range(self.weight_parallel_size)] + group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT) + if use_cpu: + group_cpu = ( + dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT) + if dist.get_backend() != "gloo" + else group + ) + else: + group_cpu = None + + if self.rank in ranks: + local_rank = ranks.index(self.rank) + group_world_size = len(ranks) + process_group = group + cpu_group = group_cpu + ranks_in_group = ranks + + return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode + + +class Initializer_Sequence(ProcessGroupInitializer): + """A ProcessGroupInitializer for data sequence parallelism. + + Args: + rank (int): The rank of current process. + world_size (int): Size of whole communication world. + weight_parallel_size (int): Size of model weight parallel. + sequence_parallel_size (int): Size of data sequence parallel. + data_parallel_size (int): Size of data parallel. + pipeline_parallel_size (int): Size of pipeline parallel. + tensor_parallel_size (int): Size of tensor parallel. + zero1_parallel_size (int): Size of zero1 parallel. + expert_parallel_size (int): Size of expert parallel. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.num_sequence_parallel_group = self.world_size // self.sequence_parallel_size + + assert self.world_size % self.sequence_parallel_size == 0 + + def init_dist_group(self, use_cpu: bool = False): + """Initialize data sequence parallel groups, and assign local_ranks and groups to each gpu. + + Returns: + Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode): + A Sequence parallelism's information tuple. + """ + local_rank = None + ranks_in_group = None + process_group = None + cpu_group = None + group_world_size = None + mode = ParallelMode.SEQUENCE + + for i in range(self.num_sequence_parallel_group): + ranks = [i * self.sequence_parallel_size + j for j in range(self.sequence_parallel_size)] + group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT) + if use_cpu: + group_cpu = ( + dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT) + if dist.get_backend() != "gloo" + else group + ) + else: + group_cpu = None + + if self.rank in ranks: + local_rank = ranks.index(self.rank) + group_world_size = len(ranks) + process_group = group + cpu_group = group_cpu + ranks_in_group = ranks + + return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode + + +class Initializer_Data(ProcessGroupInitializer): + """A ProcessGroupInitializer for data parallelism. + + Args: + rank (int): The rank of current process. + world_size (int): Size of whole communication world. + weight_parallel_size (int): Size of model weight parallel. + sequence_parallel_size (int): Size of data sequence parallel. + data_parallel_size (int): Size of data parallel. + pipeline_parallel_size (int): Size of pipeline parallel. + tensor_parallel_size (int): Size of tensor parallel. + zero1_parallel_size (int): Size of zero1 parallel. + expert_parallel_size (int): Size of expert parallel. 
+ """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.num_dp_group = self.sequence_parallel_size + + assert self.world_size % self.data_parallel_size == 0 + assert self.world_size % self.sequence_parallel_size == 0 + + def init_dist_group(self, use_cpu: bool = False): + """Initialize data parallel groups, and assign local_ranks and groups to each gpu. + + Returns: + Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode): + A Data parallelism's information tuple. + + n=32 wp=8 sp=4 zo1=2 + wp grops: [0-7] [8-15] [16-23] [24-31] + data groups: [0,4,8,12,16,20,24,28] [1,5,9,13,17,21,25,29] [2,6,10,14,18,22,26,30] [3,7,11,15,19,23,27,31] + """ + local_rank = None + ranks_in_group = None + process_group = None + cpu_group = None + group_world_size = None + mode = ParallelMode.DATA + + for i in range(self.num_dp_group): + ranks = [i + j * self.sequence_parallel_size for j in range(self.data_parallel_size)] + group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT) + if use_cpu: + group_cpu = ( + dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT) + if dist.get_backend() != "gloo" + else group + ) + else: + group_cpu = None + + if self.rank in ranks: + local_rank = ranks.index(self.rank) + group_world_size = len(ranks) + process_group = group + cpu_group = group_cpu + ranks_in_group = ranks + + return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode + + +class Initializer_Weight_Data(ProcessGroupInitializer): + """A ProcessGroupInitializer for common weight's data parallelism. + + Args: + rank (int): The rank of current process. + world_size (int): Size of whole communication world. + weight_parallel_size (int): Size of model weight parallel. + sequence_parallel_size (int): Size of data sequence parallel. + data_parallel_size (int): Size of data parallel. + pipeline_parallel_size (int): Size of pipeline parallel. + tensor_parallel_size (int): Size of tensor parallel. + zero1_parallel_size (int): Size of zero1 parallel. + expert_parallel_size (int): Size of expert parallel. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.num_weight_dp_group = self.weight_parallel_size + self.weight_data_parallel_size = self.world_size // self.num_weight_dp_group + + assert self.world_size % self.weight_parallel_size == 0 + + def init_dist_group(self, use_cpu: bool = False): + """Initialize weight's data parallel groups, and assign local_ranks and groups to each gpu. + + Returns: + Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode): + A WEIGHT_DATA parallelism's information tuple. 
+ + n=32 wp=8 sp=4 zo1=2 + wp grops: [0-7] [8-15] [16-23] [24-31] + weight data groups: [0,8,16,24] [1,9,17,25] [2,10,18,26] [3,11,19,27] + [4,12,20,28] [5,13,21,29] [6,14,22,30] [7,15,23,31] + """ + local_rank = None + ranks_in_group = None + process_group = None + cpu_group = None + group_world_size = None + mode = ParallelMode.WEIGHT_DATA + + for i in range(self.num_weight_dp_group): + ranks = [i + j * self.weight_parallel_size for j in range(self.weight_data_parallel_size)] + group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT) + if use_cpu: + group_cpu = ( + dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT) + if dist.get_backend() != "gloo" + else group + ) + else: + group_cpu = None + + if self.rank in ranks: + local_rank = ranks.index(self.rank) + group_world_size = len(ranks) + process_group = group + cpu_group = group_cpu + ranks_in_group = ranks + + return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py index 4eef4ded..208af18f 100644 --- a/internlm/initialize/launch.py +++ b/internlm/initialize/launch.py @@ -8,9 +8,11 @@ from typing import Dict, Union import torch +from torch.distributed import get_rank from internlm.core.context import Config from internlm.core.context import global_context as gpc +from internlm.core.context.process_group_initializer import ParallelMode from internlm.monitor import initialize_light_monitor from internlm.utils.common import get_master_node from internlm.utils.logger import get_logger @@ -436,6 +438,11 @@ def launch( f"number of local experts: {gpc.config.model.num_experts//gpc.expert_parallel_size}" ) + print( + f"global_rank:{gpc.get_global_rank()} wp_rank:{gpc.get_local_rank(ParallelMode.WEIGHT)} sp_rank:{gpc.get_local_rank(ParallelMode.SEQUENCE)} zo1_rank:{gpc.get_local_rank(ParallelMode.ZERO1)} dp_rank:{gpc.get_local_rank(ParallelMode.DATA)} weight_dp_rank:{gpc.get_local_rank(ParallelMode.WEIGHT_DATA)}", + flush=True, + ) + def launch_from_slurm( config: Union[str, Path, Config, Dict], diff --git a/internlm/model/embedding.py b/internlm/model/embedding.py index d1770538..ad6823b6 100644 --- a/internlm/model/embedding.py +++ b/internlm/model/embedding.py @@ -17,6 +17,52 @@ from .utils import gather_forward_split_backward, split_forward_gather_backward +# class Embedding1D(nn.Module): +# """ +# 1D Embedding. + +# Args: +# num_embeddings (int): The size of vocab. +# embedding_dim (int): The dimention of model. +# padding_idx (int): If specified, the entries at :attr:`padding_idx` do not contribute to the gradient; +# therefore, the embedding vector at :attr:`padding_idx` is not updated during training, +# i.e. it remains as a fixed "pad". None by default. +# dtype (Optional[torch.dtype]): Data type None by default. 
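Putting the four layouts together, every rank's coordinates can be read off with modular arithmetic. This standalone sketch (not part of the patch) matches the docstring examples above and the debug print added to launch.py:

    world_size, wp, sp = 32, 8, 4
    for rank in range(world_size):
        wp_rank = rank % wp    # WEIGHT groups are contiguous blocks of wp ranks
        sp_rank = rank % sp    # SEQUENCE groups are contiguous blocks of sp ranks
        dp_rank = rank // sp   # DATA groups stride by sp: [0,4,...,28], [1,5,...,29], ...
        wdp_rank = rank // wp  # WEIGHT_DATA groups stride by wp: [0,8,16,24], ...
    # e.g. rank 9 -> wp_rank=1, sp_rank=1, dp_rank=2, wdp_rank=1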
+ +# """ + +# def __init__( +# self, +# num_embeddings: int, +# embedding_dim: int, +# *args, +# padding_idx: int = None, +# dtype: torch.dtype = None, +# **kwargs, +# ): +# super().__init__() + +# self.num_embeddings = num_embeddings +# self.embed_dim = embedding_dim +# embed_dim_per_partition = embedding_dim // gpc.tensor_parallel_size + +# self.padding_idx = padding_idx +# self.embed_args = args +# self.embed_kwargs = kwargs + +# self.weight = nn.Parameter(torch.empty((num_embeddings, embed_dim_per_partition), dtype=dtype)) + +# def forward(self, input_: Tensor) -> Tensor: +# output_parallel = F.embedding(input_, self.weight, self.padding_idx, *self.embed_args, **self.embed_kwargs) + +# output = gather_forward_split_backward(output_parallel, ParallelMode.TENSOR, dim=-1) + +# if gpc.config.parallel.sequence_parallel: +# output = split_forward_gather_backward(output, ParallelMode.TENSOR, dim=1) + +# return output + + class Embedding1D(nn.Module): """ 1D Embedding. @@ -44,7 +90,7 @@ def __init__( self.num_embeddings = num_embeddings self.embed_dim = embedding_dim - embed_dim_per_partition = embedding_dim // gpc.tensor_parallel_size + embed_dim_per_partition = embedding_dim // gpc.weight_parallel_size self.padding_idx = padding_idx self.embed_args = args @@ -53,12 +99,10 @@ def __init__( self.weight = nn.Parameter(torch.empty((num_embeddings, embed_dim_per_partition), dtype=dtype)) def forward(self, input_: Tensor) -> Tensor: - output_parallel = F.embedding(input_, self.weight, self.padding_idx, *self.embed_args, **self.embed_kwargs) - - output = gather_forward_split_backward(output_parallel, ParallelMode.TENSOR, dim=-1) + input_ = split_forward_gather_backward(input_, ParallelMode.SEQUENCE, dim=1) - if gpc.config.parallel.sequence_parallel: - output = split_forward_gather_backward(output, ParallelMode.TENSOR, dim=1) + weight = gather_forward_split_backward(self.weight, ParallelMode.WEIGHT, dim=-1) + output = F.embedding(input_, weight, self.padding_idx, *self.embed_args, **self.embed_kwargs) return output diff --git a/internlm/model/linear.py b/internlm/model/linear.py index b92b2ee5..0948ee9c 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -77,6 +77,29 @@ def forward(self, input, gather_dim=0): # pylint: disable=W0622 ) +class FSTPScaleColumnParallelLinear(BaseScaleColumnParallelLinear): + """ + ScaleColumnParallelLinear in flash implementation. + """ + + def forward(self, input, gather_dim=0): # pylint: disable=W0622 + # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism: + # we do an all_gather of x before doing the matmul. + # If not, then the input is already gathered. + if self.weight_scale != 1: + weight = self.weight * self.weight_scale + (1 - self.weight_scale) * self.weight.detach() + else: + weight = self.weight + return fstp_fused_dense_func( + input, + weight, + self.bias, + process_group=self.process_group, + module=self, + handler=gpc.fstp_handler, + ) + + class MegatronScaleColumnParallelLinear(BaseScaleColumnParallelLinear): """ ScaleColumnParallelLinear in megatron implementation. 
diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index bd335c1a..4cb20999 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -9,7 +9,7 @@ from flash_attn.modules.mlp import ParallelFusedMLP from torch import nn -from internlm.core.context import IS_SEQUENCE_PARALLEL, IS_TENSOR_PARALLEL, ParallelMode +from internlm.core.context import IS_SEQUENCE_PARALLEL, IS_TENSOR_PARALLEL, IS_WEIGHT_PARALLEL, ParallelMode from internlm.core.context.parallel_context import global_context as gpc from internlm.initialize.initialize_tensor import normal_, scaled_init_method_normal from internlm.model.embedding import Embedding1D @@ -17,6 +17,7 @@ MegatronScaleColumnParallelLinear, RewardModelLinear, ScaleColumnParallelLinear, + FSTPScaleColumnParallelLinear, get_mlp_cls, ) from internlm.model.multi_head_attention import MHA @@ -90,7 +91,8 @@ def __init__( self.mixer = MHA( embed_dim=hidden_size, num_heads=num_attention_heads, - process_group=gpc.get_group(ParallelMode.TENSOR), + process_group=gpc.get_group(ParallelMode.WEIGHT), + sequence_process_group=gpc.get_group(ParallelMode.SEQUENCE), dropout=attn_drop_rate, max_position_embeddings=max_position_embeddings, softmax_scale=1 / math.sqrt(head_dim), @@ -119,7 +121,7 @@ def __init__( hidden_size, int(hidden_size * mlp_ratio), out_features=hidden_size, - process_group=gpc.get_group(ParallelMode.TENSOR), + process_group=gpc.get_group(ParallelMode.WEIGHT), bias=False, device=device, dtype=dtype, @@ -142,6 +144,8 @@ def __init__( for _, param in self.mlp.named_parameters(): if gpc.get_world_size(ParallelMode.TENSOR) > 1: setattr(param, IS_TENSOR_PARALLEL, True) + if gpc.get_world_size(ParallelMode.WEIGHT) > 1: + setattr(param, IS_WEIGHT_PARALLEL, True) for param in self.norm1.parameters(): if gpc.config.parallel.sequence_parallel is True: setattr(param, IS_SEQUENCE_PARALLEL, True) @@ -312,11 +316,12 @@ def __init__( if is_reward: head_cls = RewardModelLinear else: - head_cls = ( - ScaleColumnParallelLinear - if self.sp_mode in ["flash-attn", "none", "intern"] - else MegatronScaleColumnParallelLinear - ) + # head_cls = ( + # ScaleColumnParallelLinear + # if self.sp_mode in ["flash-attn", "none", "intern"] + # else MegatronScaleColumnParallelLinear + # ) + head_cls = FSTPScaleColumnParallelLinear if first: if embed_split_hidden: self.embedding = Embedding1D(num_embeddings=vocab_size, embedding_dim=hidden_size) @@ -335,6 +340,8 @@ def __init__( normal_(std=0.0052)(param) if gpc.get_world_size(ParallelMode.TENSOR) > 1: setattr(param, IS_TENSOR_PARALLEL, True) + if gpc.get_world_size(ParallelMode.WEIGHT) > 1: + setattr(param, IS_WEIGHT_PARALLEL, True) self.embed_grad_scale = embed_grad_scale self.blocks = nn.ModuleList( [ @@ -370,7 +377,7 @@ def __init__( self.head = head_cls( in_features=hidden_size, out_features=gpc.get_world_size(ParallelMode.TENSOR) if is_reward else vocab_size, - process_group=gpc.get_group(ParallelMode.TENSOR), + process_group=gpc.get_group(ParallelMode.WEIGHT), bias=False, device=device, dtype=dtype, @@ -380,6 +387,8 @@ def __init__( normal_(std=0.0052)(param) if gpc.get_world_size(ParallelMode.TENSOR) > 1: setattr(param, IS_TENSOR_PARALLEL, True) + if gpc.get_world_size(ParallelMode.WEIGHT) > 1: + setattr(param, IS_WEIGHT_PARALLEL, True) for param in self.norm.parameters(): if gpc.config.parallel.sequence_parallel is True: setattr(param, IS_SEQUENCE_PARALLEL, True) @@ -410,6 +419,8 @@ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N 
# if the sequence parallel mode is 'intern', the indexes should also be split in sequence dimension. if gpc.config.parallel.sequence_parallel and self.sp_mode == "intern": indexes = split_forward_gather_backward(indexes, ParallelMode.TENSOR, dim=0) + if gpc.is_initialized(ParallelMode.SEQUENCE) and gpc.get_world_size(ParallelMode.SEQUENCE) > 1: + indexes = split_forward_gather_backward(indexes, ParallelMode.SEQUENCE, dim=0) max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() if cu_seqlens is not None else None @@ -431,6 +442,8 @@ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N else: # Training hidden_states = self.head(hidden_states, gather_dim=0) + hidden_states = gather_forward_split_backward(hidden_states, ParallelMode.SEQUENCE, dim=0) + if not self.parallel_output: hidden_states = gather_forward_split_backward(hidden_states, ParallelMode.TENSOR, dim=-1) return hidden_states diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index 93dbf010..d06cd967 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -10,6 +10,8 @@ import torch.nn.functional as F from einops import rearrange +from internlm.core.context import IS_WEIGHT_PARALLEL + try: from flash_attn.flash_attn_interface import flash_attn_unpadded_func except ImportError: @@ -160,6 +162,7 @@ def __init__( embed_dim: int, num_heads: int, process_group: Optional[torch.distributed.ProcessGroup], + sequence_process_group: Optional[torch.distributed.ProcessGroup], max_position_embeddings: int = 2048, dropout: float = 0.0, softmax_scale: float = None, @@ -216,8 +219,10 @@ def __init__( causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout ) if sp_mode == "intern": - self.inner_attn = DistributedAttention(self.inner_attn, sequence_process_group=process_group) - self.inner_cross_attn = DistributedAttention(self.inner_cross_attn, sequence_process_group=process_group) + self.inner_attn = DistributedAttention(self.inner_attn, sequence_process_group=sequence_process_group) + self.inner_cross_attn = DistributedAttention( + self.inner_cross_attn, sequence_process_group=sequence_process_group + ) # output projection always have the bias (for now) out_proj_cls = get_linear_cls(sp_mode, "row") @@ -234,6 +239,10 @@ def __init__( for name in ["out_proj", "Wqkv"]: for param in getattr(self, name).parameters(): setattr(param, IS_TENSOR_PARALLEL, True) + if gpc.get_world_size(ParallelMode.WEIGHT) > 1: + for name in ["out_proj", "Wqkv"]: + for param in getattr(self, name).parameters(): + setattr(param, IS_WEIGHT_PARALLEL, True) def forward(self, x, seqlen=None, inference_params=None, **kwargs): if kwargs.get("indexes", None) is not None: diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index 715fa467..b2131a74 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -11,7 +11,7 @@ from internlm.core.naive_amp import NaiveAMPModel from internlm.core.scheduler import SchedulerHook from internlm.model.embedding import Embedding1D -from internlm.model.linear import FSTPLinear, ScaleColumnParallelLinear +from internlm.model.linear import FSTPLinear, FSTPScaleColumnParallelLinear from internlm.model.utils import ( all_gather_raw, all_gather_raw_bias_memory_pool, @@ -55,7 +55,11 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non _chunk = _chunk.model for _chunk_name, children in _chunk.named_children(): - if 
isinstance(children, ScaleColumnParallelLinear): + if isinstance(children, FSTPScaleColumnParallelLinear): + setattr(children, "_fstp_name", "head") + setattr(children.weight, "_fstp_reduce_scatter_str", f"head.weight") + if children.bias is not None: + setattr(children.bias, "_fstp_reduce_scatter_str", f"head.bias") self.head.append(children) elif isinstance(children, Embedding1D): self.embedding.append(children) @@ -164,7 +168,7 @@ def _get_bias_from_memory_pool(self, module: nn.Module): return self.all_gather_bias_memory_pool[block_index % 2][module._fstp_name] def get_weight_all_gather(self, module): - if self.enable_memory_pool: + if self.enable_memory_pool and getattr(module, "_fstp_name") != "head": return self._get_weight_from_memory_pool(module) else: return self.weight_global_output[module] @@ -201,7 +205,7 @@ def release_reduce_scatter_memory(self, key, index): self.reduce_scatter_memory_pool[key][index].idle = True def _all_gather_module_weight(self, module): - if self.enable_memory_pool: + if self.enable_memory_pool and getattr(module, "_fstp_name") != "head": if module.bias is not None: bias_handle = all_gather_raw_bias_memory_pool( module.bias, @@ -319,6 +323,16 @@ def _post_backward_hook_for_module(module, grad_input, grad_output): # pylint: _clear_handle(module) _clear_weight(module) + def _pre_hook_for_head(module: nn.Module, inputs: Any): # pylint: disable=W0613 + if module not in self.weight_global_handle: + self._all_gather_module_weight(module) + + _wait_handle(module) + + def _post_hook_for_head(module, grad_input, grad_output): # pylint: disable=W0613 + _clear_handle(module) + _clear_weight(module) + # register forward hooks # 1. register post_forward_hook @embedding module to prefetch for block 0 # 2. register pre_forward_hook @out_proj module to prefetch for next block, @@ -354,6 +368,12 @@ def _post_backward_hook_for_module(module, grad_input, grad_output): # pylint: module.register_full_backward_pre_hook(_pre_backward_hook_for_module) module.register_full_backward_hook(_post_backward_hook_for_module) + for head in self.head: + head.register_forward_pre_hook(_pre_hook_for_head) + head.register_full_backward_pre_hook(_pre_hook_for_head) + head.register_forward_hook(_post_hook_for_head) + head.register_full_backward_hook(_post_hook_for_head) + class FSTPOverlapSchedulerHook(SchedulerHook): """ diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 45d2f51a..89980c07 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -494,7 +494,7 @@ def forward( x = x.to(dtype=torch.get_autocast_gpu_dtype()) total_x = x.contiguous() - world_size = gpc.get_world_size(ParallelMode.TENSOR) + world_size = gpc.get_world_size(ParallelMode.WEIGHT) if world_size > 1: # do all_gather for weight and bias before actual computation if overlap_handler is not None: @@ -556,7 +556,7 @@ def backward(ctx, grad_output, *args): batch_dim = batch_shape.numel() grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1]) - world_size = gpc.get_world_size(ParallelMode.TENSOR) + world_size = gpc.get_world_size(ParallelMode.WEIGHT) if world_size > 1: if overlap_handler is not None: total_weight = gpc.fstp_handler.get_weight_all_gather(module=module) diff --git a/internlm/solver/optimizer/hybrid_zero_optim2.py b/internlm/solver/optimizer/hybrid_zero_optim2.py new file mode 100644 index 00000000..7ab9823b --- /dev/null +++ b/internlm/solver/optimizer/hybrid_zero_optim2.py @@ -0,0 +1,983 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- + +import math +from 
functools import partial +from typing import List, Optional + +import torch +import torch.distributed as dist +from torch.optim import Optimizer + +from internlm.core.context import IS_SEQUENCE_PARALLEL, Config, ParallelMode +from internlm.core.context import global_context as gpc +from internlm.monitor import send_alert_message +from internlm.solver.optimizer.store import ( + BucketStore, + GradientStore, + ParameterStore, + TensorBucket, +) +from internlm.solver.optimizer.utils import ( + DynamicGradScaler, + ParamBcastSyncHandler, + flatten, + get_grad_accumulate_object, + has_inf_or_nan, + reduce_tensor, + release_param_grad, + split_half_float_double, + sync_param, +) +from internlm.utils.common import get_current_device +from internlm.utils.logger import get_logger +from internlm.utils.megatron_timers import megatron_timer as timer +from internlm.utils.timeout import llm_timeout + +from .base_optimizer import BaseOptimizer +from .utils import compute_layer_norm, compute_norm, compute_param_norm + +inf = math.inf +logger = get_logger(__file__) + + +class HybridZeroOptimizer2(BaseOptimizer): + """ + Hybrid Zero Optimizer. + """ + + def __init__( + self, + optimizer: Optimizer, + cpu_offload=False, + grad_scal_cfg: Config = None, + zero_cfg: Config = None, + param_bcast_sync_handler: ParamBcastSyncHandler = None, + ): + # DynamicGradScaler related args + if gpc.config.model.dtype is torch.float32: + initial_scale = 1 + else: + initial_scale = grad_scal_cfg.fp16.initial_scale + min_scale = grad_scal_cfg.fp16.min_scale + growth_interval = grad_scal_cfg.fp16.growth_interval + growth_factor = grad_scal_cfg.growth_factor + backoff_factor = grad_scal_cfg.backoff_factor + hysteresis = grad_scal_cfg.hysteresis + max_scale = grad_scal_cfg.max_scale + + # Zero related args + reduce_bucket_size = zero_cfg.reduce_bucket_size + clip_grad_norm = zero_cfg.clip_grad_norm + self._overlap_sync_grad = zero_cfg.overlap_sync_grad + self._overlap_sync_param = zero_cfg.overlap_sync_param + + super().__init__(optim=optimizer) + + self._cpu_offload = cpu_offload + self._zero_local_rank = [] + self._zero_world_size = [] + self._broadcast_parallel_mode = [] + + # ParameterStore will manage the tensor buffers used for zero + # it will not manage the tensors used by mixed precision training + self._param_store = ParameterStore(ParallelMode.ZERO1) + self._grad_store = GradientStore(ParallelMode.WEIGHT_DATA) + self._bucket_store: List[BucketStore] = [] + self._accum_grad_buckets: List[BucketStore] = [] + self._bucket_in_progress = [] + + # fp16 and fp32 params for mixed precision training + self._fp16_param_groups = dict() + self._fp32_flat_param_groups_of_current_rank = dict() + + # communication params + # self._overlap_communication = overlap_communication + self._reduce_bucket_size = reduce_bucket_size + + self._comm_bcast_stream = torch.cuda.Stream() + + # gradient scaler + self.grad_scaler = DynamicGradScaler( + initial_scale=initial_scale, + min_scale=min_scale, + growth_factor=growth_factor, + backoff_factor=backoff_factor, + growth_interval=growth_interval, + hysteresis=hysteresis, + max_scale=max_scale, + ) + self._found_overflow = torch.cuda.FloatTensor([0], device=get_current_device()) + + # gradient clipping + self._clip_grad_norm = clip_grad_norm + + # need to record the rank in which parameter groups are not assigned parameters. 
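The fields read above parameterize a standard dynamic loss-scaling policy. A behavioral sketch (simplified, hysteresis omitted; this is not the DynamicGradScaler implementation):

    def update_scale(scale, found_overflow, good_steps, growth_factor=2.0,
                     backoff_factor=0.5, growth_interval=1000,
                     min_scale=1.0, max_scale=2**24):
        if found_overflow:                     # skip this step and back off
            return max(scale * backoff_factor, min_scale), 0
        good_steps += 1
        if good_steps % growth_interval == 0:  # long overflow-free run: grow
            return min(scale * growth_factor, max_scale), good_steps
        return scale, good_steps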
+ self.param_group_has_params = [] + self.param_group_no_params_ranks = [] + self.padding_grad = torch.zeros([32], dtype=gpc.config.model.dtype, device=get_current_device()) + self.padding_tensor = torch.zeros([32], dtype=gpc.config.model.dtype, device=get_current_device()) + + self.rank_unique_id = ( + f"gpus-{gpc.get_world_size(ParallelMode.GLOBAL)}_" + + f"wp-{gpc.get_local_rank(ParallelMode.WEIGHT)}_" + + f"sp-{gpc.get_local_rank(ParallelMode.SEQUENCE)}_" + + f"dp-{gpc.get_local_rank(ParallelMode.DATA)}_" + + f"wdp-{gpc.get_local_rank(ParallelMode.WEIGHT_DATA)}_" + + f"zo-{gpc.get_local_rank(ParallelMode.ZERO1)}.pt" + ) + self.params_per_rank_id_dict = [] + self._param_bcast_sync_handler = param_bcast_sync_handler + if self._overlap_sync_param: + assert self._param_bcast_sync_handler is not None + + if gpc.config.parallel.weight >= 1 and gpc.config.parallel["weight"]["overlap"] is True: + self._fstp_handler = gpc.fstp_handler + else: + self._fstp_handler = None + + # iterate over the param group in the optimizer + # partition these param groups for data parallel training + # and add buffers to parameter store for future access + for group_id, param_group in enumerate(self.optim.param_groups): + group_params = param_group["params"] + + # set the dtype for each param group + param_group["dtype"] = group_params[0].dtype if len(group_params) != 0 else None + + # add the fp16 params to fp16_param_groups for bookkeeping + self._fp16_param_groups[group_id] = group_params + + # to find real zero mode. if zero is not used, set all param group as ParallelMode.ZERO1 + # if zero is used, expert dp group will use ParallelMode.EXPERT_DATA as the real zero mode + zero_mode = ( + ParallelMode.ZERO1 + if param_group["dp_mode"] == gpc.get_world_size(ParallelMode.ZERO1) == 1 or ParallelMode.DATA + else ParallelMode.EXPERT_DATA + ) + self._zero_local_rank.append(gpc.get_local_rank(zero_mode)) + self._zero_world_size.append(gpc.get_world_size(zero_mode)) + # TODO _broadcast_parallel_mode is not only used in broadcast, maybe can change its name + self._broadcast_parallel_mode.append(zero_mode) + self._bucket_store.append(BucketStore(group_id, ParallelMode.WEIGHT_DATA)) + self._accum_grad_buckets.append(BucketStore(group_id, ParallelMode.WEIGHT_DATA)) + + # assign parameters to ranks the params in the list are sorted + params_per_rank, no_params_ranks = self._partition_param_list(group_id, param_group) + self.param_group_no_params_ranks.append(no_params_ranks) + self.param_group_has_params.append(self._zero_local_rank[group_id] not in no_params_ranks) + + # store the mapping between param to rank each param should belong to only one rank. + # we can skip the moe param and do not keep them in _param_store to save memory + # (means we need to deal with moe param in a different way), but it will increase + # complexity and reduce code readablity. + for rank, params in enumerate(params_per_rank): + # check whether any rank is not assigned params. + if len(params) != 0: + self._param_store.add_fp16_param_list_by_rank_group(rank, group_id, params) + for param in params: + setattr(param, "group_id", group_id) + self._param_store.set_param_to_rank(param, rank) + + # move to cpu to make room to create the flat tensor + for param in group_params: + param.data = param.data.cpu() + + # flatten the reordered tensors + for rank in range(self._zero_world_size[group_id]): + # No flat fp16 buffer is allocated if the process has no parameters. 
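For reference, the flat-buffer bookkeeping that the loop below performs can be reduced to a few lines. A simplified sketch (the real logic lives in ParameterStore; the helper name is illustrative):

    from torch._utils import _flatten_dense_tensors

    def build_rank_buffers(fp16_params):
        # one contiguous fp16 buffer per (rank, group) ...
        flat_fp16 = _flatten_dense_tensors(fp16_params).cuda()
        # ... plus a trainable fp32 master copy that the optimizer steps on
        master_fp32 = flat_fp16.float()
        master_fp32.requires_grad = True
        return flat_fp16, master_fp32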
+ if rank not in self.param_group_no_params_ranks[group_id]: + tensor_list = self._param_store.get_fp16_params_by_rank_group(rank, group_id) + with torch.no_grad(): + flat_tensor = flatten(tensor_list) + flat_tensor = flat_tensor.data.cuda() + self._param_store.add_flat_fp16_param_by_rank_group(rank, group_id, flat_tensor) + sync_param(flat_tensor=flat_tensor, tensor_list=tensor_list) + + # create a copy of fp32 weights of the parameters for which this rank is responsible + # No flat fp32 buffer is allocated if the process has no parameters. + if self.param_group_has_params[group_id]: + fp16_flat_current_rank = self._param_store.get_flat_fp16_param_by_rank_group( + self._zero_local_rank[group_id], group_id + ) + fp32_flat_current_rank = fp16_flat_current_rank.float() + device = "cpu" if self._cpu_offload else get_current_device() + fp32_flat_current_rank = fp32_flat_current_rank.to(device) + fp32_flat_current_rank.requires_grad = True + self._fp32_flat_param_groups_of_current_rank[group_id] = fp32_flat_current_rank + + # need to replace the params in the `params` field in the optimizer + # so that when the optimizer calls step(), it only updates the tensors + # managed by this data parallel rank + param_group["params"] = [fp32_flat_current_rank] + + # set reduction state + for param in self._fp16_param_groups[group_id]: + self._param_store.set_param_reduction_state(param, False) + + assert len(self._fp16_param_groups) != 0 + + # If a rank is not assigned any arguments, 'has_params' is False. + self.has_params = sum(self.param_group_has_params) != 0 + # flag used to skip unnecessary gradient reduce operation when gradient accumulation is enabled. + self.skip_grad_reduce = False + + # reduction hook is only used if overlapping communication + # if it is stage 1 without overlapping, no hook will be attached + self._attach_reduction_hook() + + @property + def zero_local_rank(self): + return self._zero_local_rank + + @property + def zero_world_size(self): + return self._zero_world_size + + @property + def loss_scale(self): + return self.grad_scaler.scale + + @property + def num_param_groups(self): + return len(self._fp16_param_groups) + + def _partition_param_list(self, group_id, param_group): + no_params_ranks = [] + params_per_rank = [[] for _ in range(self._zero_world_size[group_id])] + numel_per_rank = [0 for _ in range(self._zero_world_size[group_id])] + self.params_per_rank_id_dict.append([[] for _ in range(self._zero_world_size[group_id])]) + param_list = param_group["params"] + + sorted_params = sorted(param_list, key=lambda x: x.numel(), reverse=True) + for i, param in enumerate(sorted_params): + global_id = str(i) + for j in range(len(param.size())): + global_id = "_".join([global_id, str(param.size()[j])]) + if self._overlap_sync_param: + rank_to_go = self._param_bcast_sync_handler.get_rank_by_param(param) + else: + rank_to_go = numel_per_rank.index(min(numel_per_rank)) + params_per_rank[rank_to_go].append(param) + self.params_per_rank_id_dict[-1][rank_to_go].append(global_id) + numel_per_rank[rank_to_go] += param.numel() + + # check whether any rank is not assigned to parameters. 
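_partition_param_list above is a greedy balanced partition: largest parameters first, each assigned to the currently lightest ZeRO rank. An equivalent standalone sketch (illustrative only):

    def partition(params, world_size):
        buckets = [[] for _ in range(world_size)]
        load = [0] * world_size
        for p in sorted(params, key=lambda x: x.numel(), reverse=True):
            rank = load.index(min(load))   # lightest rank so far
            buckets[rank].append(p)
            load[rank] += p.numel()
        return buckets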
+ for rank, params in enumerate(params_per_rank): + if len(params) == 0: + no_params_ranks.append(rank) + + if gpc.is_rank_for_log(): + logger.info( # pylint: disable=W1203 + f"Number of elements on ranks: {numel_per_rank}, rank:{gpc.get_global_rank()}" + ) + + return params_per_rank, set(no_params_ranks) + + def _is_moe_group(self, param_group): + return "moe" in param_group.keys() and param_group["moe"] + + def _is_norm_group(self, param_group): + return "norm" in param_group.keys() and param_group["norm"] + + def _is_gate_group(self, param_group): + return "gate" in param_group.keys() and param_group["gate"] + + # TODO check expert dp is correct when enable moe and overlap both + def _attach_reduction_hook(self): + # we iterate over the fp16 params + # on each param, we register a hook to its AccumulateGrad object + for group_id in range(self.num_param_groups): + param_group = self._fp16_param_groups[group_id] + for param in param_group: + # we should not reduce the param in moe + if not param.requires_grad: + continue + + reduce_rank = None + + def _define_and_attach(param, reduce_rank=None): + reduction_func = partial( + self._store_and_try_reduce_grads_by_bucket, + param=param, + reduce_rank=reduce_rank, + ) + + reduce_scatter_checker = partial( + self._wait_reduce_scatter_and_accumulate_grads, + param=param, + reduce_rank=reduce_rank, + ) + + def reduction_sp_func(): + handle = reduce_tensor( + param.grad, + dtype=None, + dst_rank=reduce_rank, + parallel_mode=ParallelMode.GLOBAL, + ) + handle.wait() + + # define hook + # NOT IMPORTANT BUT GOOD TO KNOW: + # args here is not grad, but allow_unreacable and accumulate_grad + def reduce_grad_hook(*args): # pylint: disable=W0613 + if self.skip_grad_reduce is False: + reduction_func() + + # define hook for real gradient accumulation. + def accum_grad_hook(*args): # pylint: disable=W0613 + reduce_scatter_checker() + + # define hook for sequence_parallel + def reduce_grad_hook_sp(*args): # pylint: disable=W0613 + if self.skip_grad_reduce is False: + reduction_sp_func() + + # get the AccumulateGrad object of the param itself + # If these objects are not kept, reduction hooks may not be attached successfully. + accum_grad_obj = get_grad_accumulate_object(param) + self._grad_store.add_accumulate_grad_object(accum_grad_obj) + + # if sequence_parallel is True, + # the grad of norm should be all-reduce across the tp process group + if ( + gpc.config.parallel.sequence_parallel is True + and hasattr(param, IS_SEQUENCE_PARALLEL) + and getattr(param, IS_SEQUENCE_PARALLEL) is True + ): + accum_grad_obj.register_hook(reduce_grad_hook_sp) + + # we should not only register for parameters which have _fstp_reduce_scatter_str attr. + # we must keep up with reduce_grad_hook. + if self._fstp_handler is not None: + accum_grad_obj.register_hook(accum_grad_hook) + + if self._overlap_sync_grad: + accum_grad_obj.register_hook(reduce_grad_hook) + + _define_and_attach(param, reduce_rank) + + def accumulate_left_grads_after_backward(self): + if self._fstp_handler is None: + return + + for group_id in range(self.num_param_groups): + self._accum_grads_store_in_bucket(self._accum_grad_buckets[group_id]) + + def belongs_to_current_rank(self, param) -> bool: + """ + Check whether a parameter is supposed to be updated by the process of the current rank + + :param tensor: A :class:`torch.Tensor` object + :type tensor: torch.Tensor + + :return: True if the parameter should be updated by the current rank. Otherwise false. 
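The hooks above hang off each parameter's AccumulateGrad node so they fire after .grad is materialized. The usual trick behind get_grad_accumulate_object looks like this (a common pattern, shown here for context; the print is a stand-in for the reduce hooks):

    import torch

    param = torch.nn.Parameter(torch.randn(4, 4))
    # expand_as adds a graph edge whose next function is the leaf's
    # AccumulateGrad node; hooks on it run once param.grad is written.
    accum_obj = param.expand_as(param).grad_fn.next_functions[0][0]
    accum_obj.register_hook(lambda *args: print("grad ready"))  # keep accum_obj alive
    param.sum().backward()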
+        :rtype: bool
+        """
+        tensor_rank = self._param_store.get_param_rank(param)
+        group_id = getattr(param, "group_id")
+        return tensor_rank == gpc.get_local_rank(self._broadcast_parallel_mode[group_id])
+
+    def _accum_grads_store_in_bucket(self, bucket: BucketStore, reduce_rank: Optional[int] = None) -> None:
+        for _param in bucket.get_param(reduce_rank):
+            if not hasattr(_param, "_fstp_reduce_scatter_str"):
+                continue
+
+            # wait and accumulate gradient.
+            _key = getattr(_param, "_fstp_reduce_scatter_str")
+            _comm_handle, _grad = self._fstp_handler.reduce_scatter_handlers[_key]
+            _comm_handle.wait()
+            _param.grad.add_(_grad)
+
+            # release cuda memory.
+            if self._fstp_handler.enable_memory_pool:
+                self._fstp_handler.release_reduce_scatter_memory(key=tuple(_grad.size()), index=_grad.index)
+            _grad = None
+            self._fstp_handler.reduce_scatter_handlers[_key] = None
+
+        bucket.reset_by_rank(reduce_rank)
+
+    def _wait_reduce_scatter_and_accumulate_grads(self, param, reduce_rank: Optional[int] = None):
+        param_size = param.numel()
+
+        group_id = getattr(param, "group_id")
+        current_bucket = self._accum_grad_buckets[group_id]
+
+        # check if the bucket is full
+        # if full, will reduce the grads already in the bucket
+        # after reduction, the bucket will be empty
+        if current_bucket.num_elements_in_bucket(reduce_rank) + param_size > self._reduce_bucket_size:
+            self._accum_grads_store_in_bucket(current_bucket, reduce_rank)
+
+        # otherwise, add the parameter into bucket.
+        current_bucket.add_num_elements_in_bucket(param_size, reduce_rank)
+        current_bucket.add_param(param, reduce_rank)
+
+    def _store_and_try_reduce_grads_by_bucket(self, param, reduce_rank=None):
+        param_size = param.numel()
+
+        # check if the bucket is full
+        # if full, will reduce the grads already in the bucket
+        # after reduction, the bucket will be empty
+        group_id = getattr(param, "group_id")
+        current_bucket = self._bucket_store[group_id]
+
+        if current_bucket.num_elements_in_bucket(reduce_rank) + param_size > self._reduce_bucket_size:
+            self._reduce_grads_stored_in_bucket(current_bucket, reduce_rank, last_bucket=False)
+
+        # the param must not be reduced to ensure correctness
+        is_param_reduced = self._param_store.is_param_reduced(param)
+        if is_param_reduced:
+            msg = (
+                f"Parameter of size ({param.size()}) has already been reduced, "
+                + "duplicate reduction will lead to arithmetic incorrectness"
+            )
+            raise RuntimeError(msg)
+
+        # the param must have grad for reduction
+        assert param.grad is not None, f"Parameter of size ({param.size()}) has None grad, cannot be reduced"
+
+        current_bucket.add_num_elements_in_bucket(param_size, reduce_rank)
+        current_bucket.add_grad(param.grad, reduce_rank)
+        current_bucket.add_param(param, reduce_rank)
+
+    def _reduce_grads_stored_in_bucket(self, current_bucket, reduce_rank=None, last_bucket=False):
+        # reduce grads
+        self._reduce_grads_by_rank(
+            reduce_rank=reduce_rank,
+            grads=current_bucket.get_grad(reduce_rank=reduce_rank),
+            bucket_size=current_bucket.num_elements_in_bucket(reduce_rank),
+            group_id=current_bucket.get_param_group_id(),
+            dp_parallel_mode=current_bucket.get_dp_parallel_mode(),
+        )
+
+        params_in_bucket = current_bucket.get_param(reduce_rank=reduce_rank)
+
+        for param in params_in_bucket:
+            # the is_param_reduced flag should be False showing that
+            # this param is not reduced before calling self._reduce_grads_by_rank
+            is_param_reduced = self._param_store.is_param_reduced(param)
+
+            if is_param_reduced:
+                msg = (
+                    f"Parameter of size ({param.size()}) has been reduced, "
+                    + "duplicate reduction will lead to arithmetic incorrectness"
+                )
+                raise RuntimeError(msg)
+
+            # update the flag
+            self._param_store.set_param_reduction_state(param, True)
+
+            if self.belongs_to_current_rank(param):
+                self._param_store.add_reduced_param_for_compute_norm(param, last_bucket)
+            else:
+                self._param_store.add_previous_reduced_param(param)
+
+        current_bucket.reset_by_rank(reduce_rank)
+
+    def _reduce_grads_by_rank(self, reduce_rank, grads, bucket_size, group_id, dp_parallel_mode):
+        grad_buckets_by_dtype = split_half_float_double(grads)
+        next_bucket_list = []
+        # add parameters into bucket for reduction
+        for tensor_list in grad_buckets_by_dtype:
+            param_bucket = TensorBucket(size=bucket_size)
+            for tensor in tensor_list:
+                param_bucket.add_to_bucket(tensor, allow_oversize=True)
+            if not param_bucket.is_empty():
+                self._reduce_and_copy(
+                    bucket=param_bucket, reduce_rank=reduce_rank, group_id=group_id, dp_parallel_mode=dp_parallel_mode
+                )
+                next_bucket_list.append(param_bucket)
+
+        # wait for the completion of previous bucket list reduction, and do unflatten_and_copy()
+        # here we can also overlap the communication with some memcpy operation caused by bucket.flatten()
+        for bucket in self._bucket_in_progress:
+            bucket.commu_handle.wait()
+            bucket.unflatten_and_copy()
+            bucket.empty()
+        self._bucket_in_progress = []
+        self._param_store.clear_grads_of_previous_reduced_params()
+
+        # after the completion of bucket list reduction, add new buckets into _bucket_in_progress
+        self._bucket_in_progress = next_bucket_list.copy()
+
+    def _reduce_and_copy(self, bucket: TensorBucket, reduce_rank, group_id, dp_parallel_mode):
+        # flatten the tensors and do allreduce
+        bucket.flatten()
+        bucket.commu_handle = reduce_tensor(
+            tensor=bucket.get_flat_tensor(),
+            dtype=None,
+            dst_rank=reduce_rank,
+            parallel_mode=dp_parallel_mode,
+        )
+
+        # update the reduced tensor
+        if reduce_rank is None or reduce_rank == self._zero_local_rank[group_id]:
+            bucket.set_unflatten_and_copy_flag(flag=True)
+
+    def _has_inf_or_nan(self, tensor):
+        try:
+            tensor_mean = float(tensor.mean())
+        except RuntimeError as instance:
+            # We want to check if instance is actually an overflow exception.
+            # RuntimeError could come from a different error.
+            # If so, we still want the exception to propagate.
+            if "value cannot be converted" not in instance.args[0]:
+                raise
+            return True
+        else:
+            if tensor_mean == float("inf") or tensor_mean == -float("inf"):
+                return True
+            return False
+
+    def _sync_grad(self):
+        # update param already reduced flag
+        reduction_states = self._param_store.get_param_reduction_states()
+        for tensor, _ in reduction_states.items():
+            reduction_states[tensor] = False
+        self._param_store.reset_reduced_data_for_compute_norm()
+
+        # accumulate gradient
+        avg_gradients = self._grad_store._averaged_gradients
+        for group_id in range(self.num_param_groups):
+            # the following operations are performed only on the rank to which parameters are assigned.
+    def _has_inf_or_nan(self, tensor):
+        try:
+            tensor_mean = float(tensor.mean())
+        except RuntimeError as instance:
+            # We want to check if instance is actually an overflow exception.
+            # RuntimeError could come from a different error.
+            # If so, we still want the exception to propagate.
+            if "value cannot be converted" not in instance.args[0]:
+                raise
+            return True
+        else:
+            if tensor_mean == float("inf") or tensor_mean == -float("inf"):
+                return True
+            return False
+
+    def _sync_grad(self):
+        # update param already reduced flag
+        reduction_states = self._param_store.get_param_reduction_states()
+        for tensor, _ in reduction_states.items():
+            reduction_states[tensor] = False
+        self._param_store.reset_reduced_data_for_compute_norm()
+
+        # accumulate gradient
+        avg_gradients = self._grad_store._averaged_gradients
+        for group_id in range(self.num_param_groups):
+            # the following operations are performed only on the rank to which parameters are assigned.
+            if self._zero_local_rank[group_id] not in self.param_group_no_params_ranks[group_id]:
+                param_group = self._param_store.get_fp16_params_by_rank_group(self._zero_local_rank[group_id], group_id)
+
+                if group_id not in avg_gradients:
+                    avg_gradients[group_id] = []
+
+                param_idx = 0
+                for param in param_group:
+                    if param.grad is not None:
+                        if len(avg_gradients[group_id]) == param_idx:
+                            avg_gradients[group_id].append(param.grad)
+                        else:
+                            avg_gradients[group_id][param_idx].add_(param.grad)
+                        param_idx += 1
+
+        # the gradients needed are stored in the avg_gradients buffer; thus, we can clear the originals
+        self.zero_grad()
+
+    def zero_grad(self, set_to_none=True):
+        """
+        Set parameter gradients to zero. If set_to_none = True, gradient
+        will be set to None to save memory.
+
+        :param set_to_none: Whether to set the gradients to None. Default value is True.
+        :type set_to_none: bool
+        """
+        for _, param_group in self._fp16_param_groups.items():
+            for param in param_group:
+                if set_to_none:
+                    param.grad = None
+                elif param.grad is not None:
+                    param.grad.detach()
+                    param.grad.zero_()
+                else:
+                    pass
+
+    def backward(self, loss, retain_graph=False):
+        loss = self.loss_scale * loss
+        loss.backward(retain_graph=retain_graph)
+
+        # Gradients may not be fully synchronized here.
+
+    def _compute_norm_with_stage(
+        self,
+        group_id: int = 0,
+        last_bucket: bool = False,
+        last_stage: bool = False,
+        previous_norm=None,
+    ):
+        # compute norm for gradients that have been reduced
+        params, grads = self._param_store.get_reduced_param_for_compute_norm(group_id=group_id, last_bucket=last_bucket)
+        if len(params) == 0:
+            dtype = self.param_groups[group_id]["dtype"]
+            grads = [self.padding_grad.to(dtype)]
+            params = [self.padding_tensor.to(dtype)]
+
+        norm = 0
+        if self._clip_grad_norm > 0:
+            # this norm is before scaling, it will be very large
+            norm = compute_norm(
+                gradients=grads,
+                parameters=params,
+                last_stage=last_stage,
+                previous_norm=previous_norm,
+                zero_mode=self._broadcast_parallel_mode[group_id],
+            )
+
+        return norm
+
+    def _compute_param_norm_stage(
+        self, group_id: int = 0, last_bucket: bool = False, last_stage: bool = False, previous_param_norms=None
+    ):
+        # compute norm for gradients that have been reduced
+        params, grads = self._param_store.get_reduced_param_for_compute_norm(group_id=group_id, last_bucket=last_bucket)
+
+        total_param_norms = {}
+        if len(params) == 0:
+            dtype = self.param_groups[group_id]["dtype"]
+            grads = [self.padding_grad.to(dtype)]
+            params = [self.padding_tensor.to(dtype)]
+
+        if self._clip_grad_norm > 0:
+            total_param_norms = compute_param_norm(
+                grads,
+                params,
+                last_stage=last_stage,
+                previous_param_norms=previous_param_norms,
+                zero_mode=self._broadcast_parallel_mode[group_id],
+                is_moe_group=self._is_moe_group(self.optim.param_groups[group_id]),
+            )
+        return total_param_norms
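`_compute_norm_with_stage` is called once per group while buckets are still
being reduced and once more with last_stage=True for the final bucket,
threading the partial result through `previous_norm`; only the last stage pays
for a cross-rank reduction, and the square root is applied later in `_step`
(the `norm**0.5` there). A minimal sketch of that two-stage pattern, assuming
a flat list of gradient tensors and a single process group rather than the
store objects used above:

    import torch
    import torch.distributed as dist

    def staged_sq_norm(grads, previous_sq=0.0, last_stage=False, group=None):
        # stay in the squared domain so partial results from stages simply add up
        partial = sum(float(g.float().pow(2).sum()) for g in grads) + previous_sq
        if not last_stage:
            return partial  # fed back in as previous_sq on the next call
        total = torch.tensor(partial, dtype=torch.float32, device="cuda")
        dist.all_reduce(total, op=dist.ReduceOp.SUM, group=group)
        return total.item()  # caller applies **0.5 to get the global L2 norm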
+ """ + assert closure is None, "closure is not supported by step()" + + # if not overlapping communication (no reduction hook is attached) + # we need to manually reduce these gradients + if not self._overlap_sync_grad: + for group_id in range(len(self._fp16_param_groups)): + for param in self._fp16_param_groups[group_id]: + # we should not reduce the param in moe + if param.grad is not None: + self._store_and_try_reduce_grads_by_bucket(param) + + # we need to reduce the gradients left in the communication bucket + for group_id in range(self.num_param_groups): + self._reduce_grads_stored_in_bucket(self._bucket_store[group_id], reduce_rank=None, last_bucket=True) + + # compute norm for gradients in the before bucket + groups_norms = [] + groups_param_norms = [] + for group_id in range(self.num_param_groups): + groups_norms.append(self._compute_norm_with_stage(group_id=group_id)) + if gpc.config.get("grad_norm_profiling", False): + groups_param_norms.append(self._compute_param_norm_stage(group_id=group_id)) + + # clear reduced grads + # grads in the last bucket is reduced + for bucket in self._bucket_in_progress: + bucket.commu_handle.wait() + bucket.unflatten_and_copy() + bucket.empty() + self._bucket_in_progress = [] + self._param_store.clear_grads_of_previous_reduced_params() + # compute norm for gradients in the last bucket + total_norms = {} + total_param_norms = {} + total_layer_norms = {} + for group_id in range(self.num_param_groups): + group_name = self.param_groups[group_id]["name"] if "name" in self.param_groups[group_id] else "default" + group_name = f"{group_id}_{group_name}" + total_norms[group_name] = self._compute_norm_with_stage( + group_id=group_id, + last_bucket=True, + last_stage=True, + previous_norm=groups_norms[group_id], + ) + if gpc.config.get("grad_norm_profiling", False): + param_norms = self._compute_param_norm_stage( + group_id=group_id, + last_bucket=True, + last_stage=True, + previous_param_norms=groups_param_norms[group_id], + ) + total_layer_norms[group_name], total_param_norms[group_name] = compute_layer_norm( + param_norms=param_norms, loss_scale=self.loss_scale.item() + ) + + # Need to allreduce(avg) the norms across different ranks because moe params will not be synced + # during allreduce + if self._is_moe_group(self.optim.param_groups[group_id]): + # model and zero have been reduced!!! 
+    def _step(self, closure=None, norms=None):
+        assert closure is None, "closure is not supported by step()"
+
+        # check for overflow
+        found_inf = False
+        found_nan = False
+        # if there are INF values in grads, the compute_norm func would also return -1
+        # thus, we try to avoid calling _check_overflow here
+        # found_inf = self._check_overflow()
+        # Because you may encounter inf when computing norm
+
+        if -1 in norms.values():
+            found_inf = True
+
+        if -2 in norms.values():
+            found_nan = True
+
+        loss_scale = float(self.loss_scale.item())  # backup
+        if gpc.config.model.dtype is not torch.float32:
+            self.grad_scaler.update(found_inf)
+
+        # update loss scale if overflow occurs
+        if found_inf:
+            if gpc.is_rank_for_log():
+                logger.warning("Overflow occurs, please check it.")
+                send_alert_message(
+                    address=gpc.config.monitor.alert.feishu_alert_address,
+                    message="Overflow occurs, please check it.",
+                )
+            self._grad_store._averaged_gradients = dict()
+            self.zero_grad()
+            return False, norms
+
+        if found_nan:
+            if gpc.is_rank_for_log():
+                logger.warning("Nan grad norm occurs, please check it.")
+                send_alert_message(
+                    address=gpc.config.monitor.alert.feishu_alert_address,
+                    message="Nan grad norm occurs, please check it.",
+                )
+            self._grad_store._averaged_gradients = dict()
+            self.zero_grad()
+            return False, norms
+        # copy the grad of fp16 param to fp32 param
+        single_grad_partition_groups = []
+        for group_id in range(self.num_param_groups):
+            # The following operations are performed only on the rank to which parameters are assigned.
+            if not self.param_group_has_params[group_id]:
+                continue
+
+            # create flat gradient for the flat fp32 params
+            gradients = self._grad_store.get_averaged_gradients_by_group(group_id)
+            with torch.no_grad():
+                flat_fp16_avg_grads = flatten(gradients)
+            self._grad_store.reset_average_gradients_by_group(group_id)
+            gradients = None  # release cuda memory
+
+            dtype = self._fp32_flat_param_groups_of_current_rank[group_id].dtype
+            flat_fp32_avg_grads = flat_fp16_avg_grads.to(dtype)
+            flat_fp16_avg_grads = None  # release cuda memory
+
+            param_shape = self._fp32_flat_param_groups_of_current_rank[group_id].shape
+            assert (
+                param_shape == flat_fp32_avg_grads.shape
+            ), f"fp32 param and grad have different shape {param_shape} vs {flat_fp32_avg_grads.shape}"
+
+            # Parameters shared within a TP group, such as norm and moe gate, have precision inconsistency in gradients.
+            # Therefore, it is recommended to synchronize gradients within the TP group to eliminate accumulated errors.
+            is_tp_sync_groups = (
+                self._is_norm_group(self.optim.param_groups[group_id]),
+                self._is_gate_group(self.optim.param_groups[group_id]),
+            )
+            if any(is_tp_sync_groups):
+                dist.all_reduce(
+                    flat_fp32_avg_grads,
+                    op=dist.ReduceOp.AVG,
+                    group=gpc.get_group(ParallelMode.TENSOR),
+                )
+
+            single_grad_partition_groups.append(flat_fp32_avg_grads)
+            device = self._fp32_flat_param_groups_of_current_rank[group_id].device
+            self._fp32_flat_param_groups_of_current_rank[group_id].grad = flat_fp32_avg_grads.to(device)
+        # unscale and clip grads
+        # get the global norm
+        global_norm_groups = {}
+        if self._clip_grad_norm > 0:
+            for group_name, norm in norms.items():
+                global_norm_groups[group_name] = norm**0.5
+
+        # the following operations are performed only on the rank to which parameters are assigned.
+        if gpc.config.model.dtype is not torch.float32:
+            if len(single_grad_partition_groups) != 0 and self._clip_grad_norm > 0:
+                self._unscale_and_clip_grads(
+                    single_grad_partition_groups,
+                    list(global_norm_groups.values()),
+                    loss_scale,
+                )
+
+        # update the parameters
+        timer("step").start()
+
+        # For those ranks that are not assigned parameters, we just wait for other ranks
+        # to broadcast their updated parameters.
+        if self.has_params:
+            self.optim.step()
+            # release the fp32 grad
+            release_param_grad(self._fp32_flat_param_groups_of_current_rank.values())
+            # update fp16 partition updated by the current rank
+            for group_id in range(len(self._fp16_param_groups)):
+                if self.param_group_has_params[group_id]:
+                    fp16_param = self._param_store.get_flat_fp16_param_by_rank_group(
+                        rank=self._zero_local_rank[group_id], group_id=group_id
+                    )
+                    fp32_param = self._fp32_flat_param_groups_of_current_rank[group_id]
+                    fp16_param.data.copy_(fp32_param)
+        torch.cuda.synchronize()
+        with torch.cuda.stream(self._comm_bcast_stream):
+            self.broadcast_params()
+
+        timer("step").stop()
+
+        # re-syncing gradients is not needed here, because the sync_params function is used in
+        # initialization, so synchronization is maintained
+        for group_name, global_norm in global_norm_groups.items():
+            global_norm_groups[group_name] = global_norm / loss_scale
+        return True, global_norm_groups
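The clipping path in `_step` folds loss-scale removal and norm clipping into a
single multiply per flat gradient (see `_unscale_and_clip_grads` below). A
worked sketch with made-up numbers, assuming loss_scale=1024, max_norm=1.0 and
a true (unscaled) gradient norm of 2.0:

    def combined_scale(total_norm: float, loss_scale: float, max_norm: float) -> float:
        # total_norm is still inflated by loss_scale at this point
        clip = ((total_norm / loss_scale) + 1e-6) / max_norm
        return clip * loss_scale if clip > 1.0 else loss_scale

    # combined_scale(2048.0, 1024.0, 1.0) ~= 2048.0, so grad *= 1/2048 removes the
    # 1024x loss scale and shrinks the norm from 2.0 down to ~1.0 in one pass;
    # if the norm is already under max_norm, only the plain 1/loss_scale applies.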
+    def broadcast_params(self):
+        handles = []
+
+        for group_id in range(self.num_param_groups):
+            for rank in range(self._zero_world_size[group_id]):
+                # The following operations are performed only on the rank to which parameters are assigned.
+                if rank in self.param_group_no_params_ranks[group_id]:
+                    continue
+                fp16_param = self._param_store.get_flat_fp16_param_by_rank_group(rank=rank, group_id=group_id)
+                # grank = gpc.get_ranks_in_group(group_type)[rank]  # need to convert to the global rank
+                # assert grank == rank, f"{grank} == {rank}"
+                g_rank = gpc.get_ranks_in_group(self._broadcast_parallel_mode[group_id])[rank]
+                handle = dist.broadcast(
+                    fp16_param,
+                    src=g_rank,
+                    group=gpc.get_group(self._broadcast_parallel_mode[group_id]),
+                    async_op=True,
+                )
+
+                if self._overlap_sync_param:
+                    self._param_bcast_sync_handler.add_bcast_handle(rank, handle)
+                else:
+                    handles.append(handle)
+
+        for handle in handles:
+            handle.wait()
+
+        torch.cuda.synchronize()
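`broadcast_params` above issues every broadcast with async_op=True and waits
only once at the end (or hands the handles to the overlap handler), letting
the per-rank shard broadcasts pipeline on the interconnect. A minimal sketch
of that issue-all-then-wait pattern, assuming torch >= 2.0 for
`dist.get_process_group_ranks`:

    import torch.distributed as dist

    def broadcast_all_shards(shards, group):
        # shards: list indexed by group-local rank; rank i owns shards[i]
        global_ranks = dist.get_process_group_ranks(group)
        handles = [
            dist.broadcast(shard, src=global_ranks[rank], group=group, async_op=True)
            for rank, shard in enumerate(shards)
        ]
        for handle in handles:  # a single wait point instead of one wait per shard
            handle.wait()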
+    ##################
+    # FP16 Utilities #
+    ##################
+
+    def _check_overflow(self):
+        # clear previous overflow record
+        self._found_overflow.fill_(0.0)
+
+        # check for overflow
+        for group_id in range(len(self._fp16_param_groups)):
+            # The following operations are performed only on the rank to which parameters are assigned.
+            if self._zero_local_rank[group_id] not in self.param_group_no_params_ranks[group_id]:
+                for avg_grad in self._grad_store.get_averaged_gradients_by_group(group_id):
+                    if avg_grad is not None and has_inf_or_nan(avg_grad):
+                        self._found_overflow.fill_(1.0)
+                        break
+        dist.all_reduce(
+            self._found_overflow,
+            op=dist.ReduceOp.MAX,
+            group=gpc.get_group(ParallelMode.GLOBAL),
+        )
+
+        return self._found_overflow.item() > 0
+
+    def _unscale_and_clip_grads(self, grad_groups_flat, total_norm_groups, loss_scale):
+        # compute the combined scale factor for each group
+        combined_scale_groups = []
+
+        if self._clip_grad_norm > 0.0:
+            # the norm passed in is in fact norm * loss_scale
+            for group_id, total_norm in enumerate(total_norm_groups):
+                combined_scale_groups.append(loss_scale)
+                clip = ((total_norm / loss_scale) + 1e-6) / self._clip_grad_norm
+                if clip > 1.0:
+                    combined_scale_groups[group_id] = clip * loss_scale
+
+        for group_id, grad in enumerate(grad_groups_flat):
+            grad.data.mul_(1.0 / combined_scale_groups[group_id])
+
+    def clip_grad_norm(self, model, max_norm):
+        # will be conducted in step()
+        pass
+
+    def state_dict(self):
+        states = {}
+        grad_scaler = self.grad_scaler.state_dict()
+        states["grad_scaler"] = grad_scaler
+        optim_states = self.optim.state_dict()
+        states["base_optim_states"] = optim_states
+
+        flat_fp32_weights = {}
+        for group_id, param in self._fp32_flat_param_groups_of_current_rank.items():
+            if self._zero_local_rank[group_id] not in self.param_group_no_params_ranks[group_id]:
+                assert param.grad is None
+                flat_fp32_weights[group_id] = param
+        states["flat_fp32_weights"] = flat_fp32_weights
+        states["zero_devide_optim_plan"] = self.params_per_rank_id_dict
+
+        return states
+
+    def load_state_dict(self, states):
+        # TODO: Need to take into account the change in the number of DP.
+        assert "grad_scaler" in states, "grad_scaler state not found!"
+        grad_scaler = states["grad_scaler"]
+        self.grad_scaler.load_state_dict(grad_scaler)
+        optim_states = states["base_optim_states"]
+        self.optim.load_state_dict(optim_states)
+
+        # load the fp32 model weights.
+        flat_fp32_weights = states["flat_fp32_weights"]
+        assert set(flat_fp32_weights.keys()) == set(self._fp32_flat_param_groups_of_current_rank)
+        for group_id, param in flat_fp32_weights.items():
+            if self._zero_local_rank[group_id] not in self.param_group_no_params_ranks[group_id]:
+                self_param = self._fp32_flat_param_groups_of_current_rank[group_id]
+                assert (
+                    self_param.shape == param.shape
+                ), f"The loaded parameter shape is inconsistent, {self_param.shape} != {param.shape}"
+                self_param.data.copy_(param.data)
+
+        # load the fp16 model weights.
+        for group_id in range(len(self._fp16_param_groups)):
+            if self._zero_local_rank[group_id] not in self.param_group_no_params_ranks[group_id]:
+                fp16_param = self._param_store.get_flat_fp16_param_by_rank_group(
+                    rank=self._zero_local_rank[group_id], group_id=group_id
+                )
+                fp32_param = self._fp32_flat_param_groups_of_current_rank[group_id]
+                fp16_param.data.copy_(fp32_param)
+
+        if "zero_devide_optim_plan" in states:
+            self.params_per_rank_id_dict = states["zero_devide_optim_plan"]
+
+
+def reload_zero_fp32_buff(optimizer):
+    # If we use an AMP optimizer, we need to update its fp32 buffer with the newly loaded weight values.
+    # Otherwise, we must ensure that model weights are loaded before zero is initialized.
+ if isinstance(optimizer, HybridZeroOptimizer): + for group_id, param_group in enumerate(optimizer.optim.param_groups): + if optimizer.param_group_has_params[group_id]: + # flatten fp16 params have already been updated by 'load_model_checkpoint' + fp16_flat_current_rank = optimizer._param_store.get_flat_fp16_param_by_rank_group( + optimizer._zero_local_rank[group_id], group_id + ) + # param_group["params"] is fp32 flatten optimizer states of this zero rank. + param_group["params"][0].data.copy_(fp16_flat_current_rank.float()) diff --git a/internlm/solver/optimizer/utils.py b/internlm/solver/optimizer/utils.py index 982a2466..b2ea0391 100644 --- a/internlm/solver/optimizer/utils.py +++ b/internlm/solver/optimizer/utils.py @@ -17,7 +17,7 @@ from internlm.core.naive_amp import NaiveAMPModel from internlm.utils.common import get_current_device, get_tensor_norm, move_norm_to_cuda from internlm.utils.logger import get_logger -from internlm.utils.parallel import is_model_parallel_parameter +from internlm.utils.parallel import is_model_parallel_parameter, is_weight_parallel_parameter logger = get_logger(__file__) @@ -243,6 +243,14 @@ def append_grad(g, p): and gpc.get_local_rank(ParallelMode.TENSOR) == 0 ): # if not used in each chunk, such as layernorm append_grad(g, p) + elif ( + gpc.is_initialized(ParallelMode.WEIGHT) + and not is_weight_parallel_parameter(p) + and gpc.get_local_rank(ParallelMode.WEIGHT) == 0 + ): # if not used in each chunk, such as layernorm + append_grad(g, p) + elif is_weight_parallel_parameter(p): + append_grad(g, p) elif is_model_parallel_parameter(p): append_grad(g, p) elif gpc.get_local_rank(ParallelMode.TENSOR) != 0: @@ -312,11 +320,11 @@ def compute_norm( total_norm = total_norm + previous_norm # Sum across all model-parallel GPUs. - if gpc.is_initialized(ParallelMode.MODEL): + if gpc.is_initialized(ParallelMode.WEIGHT): dist.all_reduce( total_norm, op=dist.ReduceOp.SUM, - group=gpc.get_group(ParallelMode.MODEL), + group=gpc.get_group(ParallelMode.WEIGHT), ) # This is because we use zero1, so we need to use this reduction. diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index a05f62df..96548f71 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -108,14 +108,14 @@ def initialize_model(): # Change random state mode to ParallelMode.DATA after model is built, guaranteeing the random # state in the same dp group are all the same. 
- set_mode(ParallelMode.DATA) + set_mode(ParallelMode.WEIGHT_DATA) # if fsdp enabled, wrap the model model = wrap_FSDP_model(model) gpc.fstp_handler = None - if gpc.config.parallel["tensor"]["sp"] == "intern" and gpc.config.parallel["tensor"]["intern_overlap"] is True: - gpc.fstp_handler = FSTPOverlapHandler(model, gpc.get_group(ParallelMode.TENSOR)) + if gpc.config.parallel["weight"]["size"] >= 1 and gpc.config.parallel["weight"]["overlap"] is True: + gpc.fstp_handler = FSTPOverlapHandler(model, gpc.get_group(ParallelMode.WEIGHT)) return model diff --git a/internlm/utils/parallel.py b/internlm/utils/parallel.py index 6e5384f5..f6e72cff 100644 --- a/internlm/utils/parallel.py +++ b/internlm/utils/parallel.py @@ -4,7 +4,7 @@ import torch.distributed as dist from torch import nn -from internlm.core.context import IS_TENSOR_PARALLEL, ParallelMode +from internlm.core.context import IS_TENSOR_PARALLEL, IS_WEIGHT_PARALLEL, ParallelMode from internlm.core.context import global_context as gpc from internlm.core.naive_amp import NaiveAMPModel @@ -13,13 +13,17 @@ def is_model_parallel_parameter(p): return hasattr(p, IS_TENSOR_PARALLEL) and getattr(p, IS_TENSOR_PARALLEL) +def is_weight_parallel_parameter(p): + return hasattr(p, IS_WEIGHT_PARALLEL) and getattr(p, IS_WEIGHT_PARALLEL) + + def sync_model_param(model): r"""Make sure data parameters are consistent during Data Parallel Mode. Args: model (:class:`torch.nn.Module`): A pyTorch model on whose parameters you check the consistency. """ - if gpc.is_initialized(ParallelMode.DATA) and gpc.get_world_size(ParallelMode.DATA) > 1: + if gpc.is_initialized(ParallelMode.WEIGHT_DATA) and gpc.get_world_size(ParallelMode.WEIGHT_DATA) > 1: sync_moe_param = ( gpc.is_initialized(ParallelMode.EXPERT_DATA) and gpc.get_world_size(ParallelMode.EXPERT_DATA) > 1 ) @@ -28,8 +32,8 @@ def sync_model_param(model): ranks = gpc.get_ranks_in_group(ParallelMode.EXPERT_DATA) dist.broadcast(param, src=ranks[0], group=gpc.get_group(ParallelMode.EXPERT_DATA)) else: - ranks = gpc.get_ranks_in_group(ParallelMode.DATA) - dist.broadcast(param, src=ranks[0], group=gpc.get_group(ParallelMode.DATA)) + ranks = gpc.get_ranks_in_group(ParallelMode.WEIGHT_DATA) + dist.broadcast(param, src=ranks[0], group=gpc.get_group(ParallelMode.WEIGHT_DATA)) def sync_model_param_within_tp(model): diff --git a/train.py b/train.py index 789094ac..996d7465 100644 --- a/train.py +++ b/train.py @@ -220,7 +220,7 @@ def main(args): # start iterating the train data and begin training for batch_count in range(train_state.batch_count, total_steps): empty_cache_and_diag(batch_count, interval=gpc.config.data.empty_cache_and_diag_interval) - torch.cuda.memory._record_memory_history() + # torch.cuda.memory._record_memory_history() start_time = time.time() timer("one-batch").start() @@ -327,7 +327,7 @@ def main(args): if batch_count % 2 == 0: prof.step() - torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") + # torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") torch.cuda.reset_peak_memory_stats() ckpt_manager.wait_async_upload_finish() @@ -355,4 +355,4 @@ def main(args): alert_address=gpc.config.monitor.alert.feishu_alert_address, excp_info=traceback.format_exc() ) - torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") + # torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") From cab9abd2589bd782af2ecf7b9e68967087ceb937 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Wed, 29 Nov 2023 
16:59:01 +0800 Subject: [PATCH 078/153] fix(training_internlm.py): fix loss accuracy(optim init and seed set) --- internlm/core/context/parallel_context.py | 6 ++++++ internlm/model/overlap_handler.py | 2 +- internlm/solver/optimizer/__init__.py | 1 + .../solver/optimizer/hybrid_zero_optim2.py | 2 +- internlm/train/training_internlm.py | 16 +++++++++++++-- internlm/utils/parallel.py | 20 +++++++++++++++++++ 6 files changed, 43 insertions(+), 4 deletions(-) diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py index 8d34f608..e25b3de6 100644 --- a/internlm/core/context/parallel_context.py +++ b/internlm/core/context/parallel_context.py @@ -596,12 +596,18 @@ def set_seed(self, seed: int, dpseed_with_tpoffset: bool = False): tp_seed = seed + tp_rank + pipeline_offset * 1024 add_seed(ParallelMode.TENSOR, tp_seed) + if self.is_initialized(ParallelMode.WEIGHT): + wp_rank = self.get_local_rank(ParallelMode.WEIGHT) + wp_seed = seed + wp_rank + pipeline_offset * 1024 + add_seed(ParallelMode.WEIGHT, wp_seed) + # we do not set the random state mode to ParallelMode.DATA until model is built (instead, we use a dummy mode # during model construction), this is because the random state will be different in different tensor parallel # device of the same data parallel group. The underlying reason is that the device of tp_rank = 0 will perform # additional random operations during the RowParallelLinear module building process. # set_mode(ParallelMode.DUMMY) set_mode(ParallelMode.TENSOR) + set_mode(ParallelMode.WEIGHT) seeds = get_seeds() seed_str = ", ".join([f"{k}: {v}" for k, v in seeds.items()]) diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index b2131a74..3d02e5d4 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -40,7 +40,7 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non self.head = [] self.embedding = [] self.model_checkpoint = gpc.config.model.checkpoint - self.enable_memory_pool = gpc.config.parallel["tensor"].get("memory_pool", False) + self.enable_memory_pool = gpc.config.parallel["weight"].get("memory_pool", False) self.is_forward = True self.reduce_scatter_handlers = {} diff --git a/internlm/solver/optimizer/__init__.py b/internlm/solver/optimizer/__init__.py index 7c6a1c64..309f2295 100644 --- a/internlm/solver/optimizer/__init__.py +++ b/internlm/solver/optimizer/__init__.py @@ -3,5 +3,6 @@ from .fsdp_optimizer import FSDPadaptOptimizer from .hybrid_zero_optim import HybridZeroOptimizer, reload_zero_fp32_buff +from .hybrid_zero_optim2 import HybridZeroOptimizer2 __all__ = ["FSDPadaptOptimizer", "HybridZeroOptimizer", "reload_zero_fp32_buff"] diff --git a/internlm/solver/optimizer/hybrid_zero_optim2.py b/internlm/solver/optimizer/hybrid_zero_optim2.py index 7ab9823b..e2b3995a 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim2.py +++ b/internlm/solver/optimizer/hybrid_zero_optim2.py @@ -131,7 +131,7 @@ def __init__( if self._overlap_sync_param: assert self._param_bcast_sync_handler is not None - if gpc.config.parallel.weight >= 1 and gpc.config.parallel["weight"]["overlap"] is True: + if gpc.config.parallel["weight"]["size"] >= 1 and gpc.config.parallel["weight"]["overlap"] is True: self._fstp_handler = gpc.fstp_handler else: self._fstp_handler = None diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 96548f71..f0421cd4 100644 --- a/internlm/train/training_internlm.py +++ 
b/internlm/train/training_internlm.py
@@ -48,7 +48,7 @@
 from internlm.monitor.monitor import monitor_manager as mm
 from internlm.solver.beta2_scheduler import Beta2Scheduler
 from internlm.solver.lr_scheduler import FineTuneCosineAnnealingWarmupLR
-from internlm.solver.optimizer import FSDPadaptOptimizer, HybridZeroOptimizer
+from internlm.solver.optimizer import FSDPadaptOptimizer, HybridZeroOptimizer, HybridZeroOptimizer2
 from internlm.solver.optimizer.utils import ParamBcastSyncHandler
 from internlm.train.utils import create_param_groups
 from internlm.utils.common import DummyProfile
@@ -58,6 +58,7 @@
     set_model_params_layer_name,
     sync_model_param,
     sync_model_param_within_tp,
+    sync_model_param_within_wp,
 )
 from internlm.utils.registry import MODEL_INITIALIZER
 from internlm.utils.timeout import llm_timeout
@@ -106,6 +107,8 @@ def initialize_model():
     # the same across tensor parallelism.
     sync_model_param_within_tp(model)
 
+    sync_model_param_within_wp(model)
+
     # Change random state mode to ParallelMode.DATA after model is built, guaranteeing the random
     # state in the same dp group are all the same.
     set_mode(ParallelMode.WEIGHT_DATA)
@@ -182,7 +185,14 @@ def initialize_optimizer(model: Union[nn.Module, nn.ModuleList]):
         eps=adam_cfg.adam_eps,
     )
 
-    if not gpc.config.parallel.zero1.fsdp:
+    if gpc.config.parallel.weight.size > 1:
+        optimizer = HybridZeroOptimizer2(
+            naive_optimizer,
+            grad_scal_cfg=gpc.config.grad_scaler,
+            zero_cfg=gpc.config.hybrid_zero_optimizer,
+            param_bcast_sync_handler=param_bcast_sync_handler,
+        )
+    elif not gpc.config.parallel.zero1.fsdp:
         optimizer = HybridZeroOptimizer(
             naive_optimizer,
             grad_scal_cfg=gpc.config.grad_scaler,
@@ -608,6 +618,8 @@ def record_current_batch_training_metrics(
                 tflops_list_2.append(tflops_2)
             if batch_count == gpc.config.data.total_steps - 1:
                 print(tgs_list, flush=True)
+                if len(tgs_list) <= 0:
+                    return
                 avg_tgs = sum(tgs_list) / len(tgs_list)
                 for tgs in tgs_list.copy():
                     if abs(tgs - avg_tgs) > 400:
diff --git a/internlm/utils/parallel.py b/internlm/utils/parallel.py
index f6e72cff..3399491c 100644
--- a/internlm/utils/parallel.py
+++ b/internlm/utils/parallel.py
@@ -56,6 +56,26 @@ def sync_model_param_within_tp(model):
             dist.broadcast(param, src=ranks[0], group=gpc.get_group(parallel_mode))
 
 
+def sync_model_param_within_wp(model):
+    r"""This function is adapted from colossalai's ``sync_model_param``.
+
+    We modified it to sync only the parameters that live within weight parallelism
+    but are not split by weight parallelism, making sure such parameters
+    (for example RMSNorm and LayerNorm weights) are the same across each
+    weight-parallel rank.
+
+    Args:
+        model (:class:`torch.nn.Module`): A pyTorch model on whose parameters you check the consistency.
+ """ + parallel_mode = ParallelMode.WEIGHT + if gpc.is_initialized(parallel_mode) and gpc.get_world_size(parallel_mode) > 1: + for param in model.parameters(): + if not is_weight_parallel_parameter(param): + ranks = gpc.get_ranks_in_group(parallel_mode) + dist.broadcast(param, src=ranks[0], group=gpc.get_group(parallel_mode)) + + def get_parallel_log_file_name(): if gpc.is_rank_for_log(): fn_prefix = "main_" # Indicates a rank with more output information From d3ee3eff165e59b237740df005bee031a1315a05 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Thu, 30 Nov 2023 16:08:46 +0800 Subject: [PATCH 079/153] fix(model): reset embedding and head --- internlm/model/embedding.py | 54 +++-------------------------- internlm/model/linear.py | 23 ------------ internlm/model/modeling_internlm.py | 15 ++++---- internlm/model/overlap_handler.py | 27 +++------------ internlm/train/training_internlm.py | 28 +++++++++++++-- 5 files changed, 41 insertions(+), 106 deletions(-) diff --git a/internlm/model/embedding.py b/internlm/model/embedding.py index ad6823b6..4be47d64 100644 --- a/internlm/model/embedding.py +++ b/internlm/model/embedding.py @@ -17,52 +17,6 @@ from .utils import gather_forward_split_backward, split_forward_gather_backward -# class Embedding1D(nn.Module): -# """ -# 1D Embedding. - -# Args: -# num_embeddings (int): The size of vocab. -# embedding_dim (int): The dimention of model. -# padding_idx (int): If specified, the entries at :attr:`padding_idx` do not contribute to the gradient; -# therefore, the embedding vector at :attr:`padding_idx` is not updated during training, -# i.e. it remains as a fixed "pad". None by default. -# dtype (Optional[torch.dtype]): Data type None by default. - -# """ - -# def __init__( -# self, -# num_embeddings: int, -# embedding_dim: int, -# *args, -# padding_idx: int = None, -# dtype: torch.dtype = None, -# **kwargs, -# ): -# super().__init__() - -# self.num_embeddings = num_embeddings -# self.embed_dim = embedding_dim -# embed_dim_per_partition = embedding_dim // gpc.tensor_parallel_size - -# self.padding_idx = padding_idx -# self.embed_args = args -# self.embed_kwargs = kwargs - -# self.weight = nn.Parameter(torch.empty((num_embeddings, embed_dim_per_partition), dtype=dtype)) - -# def forward(self, input_: Tensor) -> Tensor: -# output_parallel = F.embedding(input_, self.weight, self.padding_idx, *self.embed_args, **self.embed_kwargs) - -# output = gather_forward_split_backward(output_parallel, ParallelMode.TENSOR, dim=-1) - -# if gpc.config.parallel.sequence_parallel: -# output = split_forward_gather_backward(output, ParallelMode.TENSOR, dim=1) - -# return output - - class Embedding1D(nn.Module): """ 1D Embedding. 
@@ -99,10 +53,12 @@ def __init__( self.weight = nn.Parameter(torch.empty((num_embeddings, embed_dim_per_partition), dtype=dtype)) def forward(self, input_: Tensor) -> Tensor: - input_ = split_forward_gather_backward(input_, ParallelMode.SEQUENCE, dim=1) + output_parallel = F.embedding(input_, self.weight, self.padding_idx, *self.embed_args, **self.embed_kwargs) + + output = gather_forward_split_backward(output_parallel, ParallelMode.WEIGHT, dim=-1) - weight = gather_forward_split_backward(self.weight, ParallelMode.WEIGHT, dim=-1) - output = F.embedding(input_, weight, self.padding_idx, *self.embed_args, **self.embed_kwargs) + if gpc.config.parallel.sequence > 1: + output = split_forward_gather_backward(output, ParallelMode.SEQUENCE, dim=1) return output diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 0948ee9c..b92b2ee5 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -77,29 +77,6 @@ def forward(self, input, gather_dim=0): # pylint: disable=W0622 ) -class FSTPScaleColumnParallelLinear(BaseScaleColumnParallelLinear): - """ - ScaleColumnParallelLinear in flash implementation. - """ - - def forward(self, input, gather_dim=0): # pylint: disable=W0622 - # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism: - # we do an all_gather of x before doing the matmul. - # If not, then the input is already gathered. - if self.weight_scale != 1: - weight = self.weight * self.weight_scale + (1 - self.weight_scale) * self.weight.detach() - else: - weight = self.weight - return fstp_fused_dense_func( - input, - weight, - self.bias, - process_group=self.process_group, - module=self, - handler=gpc.fstp_handler, - ) - - class MegatronScaleColumnParallelLinear(BaseScaleColumnParallelLinear): """ ScaleColumnParallelLinear in megatron implementation. 
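The embedding.py hunk above switches Embedding1D from gathering the weight to
gathering the output: each rank embeds with its hidden-dim shard, the partial
outputs are concatenated along the last dim, and the result is re-split along
the sequence dim. A single-process sketch of the shapes involved, with made-up
sizes (vocab=8, hidden=4, 2 ranks) standing in for the distributed
gather/split primitives:

    import torch
    import torch.nn.functional as F

    vocab, hidden, ranks = 8, 4, 2
    weight_shards = [torch.randn(vocab, hidden // ranks) for _ in range(ranks)]
    tokens = torch.randint(0, vocab, (1, 6))  # (batch, seqlen)

    # per-rank lookup: (1, 6, hidden // ranks); in training this is an all_gather
    partials = [F.embedding(tokens, w) for w in weight_shards]
    full = torch.cat(partials, dim=-1)       # (1, 6, hidden) after the gather
    seq_shard = full.chunk(ranks, dim=1)[0]  # (1, 3, hidden) kept per rank after the split
    assert full.shape == (1, 6, hidden) and seq_shard.shape == (1, 3, hidden)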
diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index 4cb20999..1797f677 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -17,7 +17,6 @@ MegatronScaleColumnParallelLinear, RewardModelLinear, ScaleColumnParallelLinear, - FSTPScaleColumnParallelLinear, get_mlp_cls, ) from internlm.model.multi_head_attention import MHA @@ -32,6 +31,7 @@ from internlm.utils.logger import get_logger from internlm.utils.registry import MODEL_INITIALIZER + MODEL_TYPE = "INTERNLM" logger = get_logger(__file__) @@ -316,12 +316,11 @@ def __init__( if is_reward: head_cls = RewardModelLinear else: - # head_cls = ( - # ScaleColumnParallelLinear - # if self.sp_mode in ["flash-attn", "none", "intern"] - # else MegatronScaleColumnParallelLinear - # ) - head_cls = FSTPScaleColumnParallelLinear + head_cls = ( + ScaleColumnParallelLinear + if self.sp_mode in ["flash-attn", "none", "intern"] + else MegatronScaleColumnParallelLinear + ) if first: if embed_split_hidden: self.embedding = Embedding1D(num_embeddings=vocab_size, embedding_dim=hidden_size) @@ -442,8 +441,6 @@ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N else: # Training hidden_states = self.head(hidden_states, gather_dim=0) - hidden_states = gather_forward_split_backward(hidden_states, ParallelMode.SEQUENCE, dim=0) - if not self.parallel_output: hidden_states = gather_forward_split_backward(hidden_states, ParallelMode.TENSOR, dim=-1) return hidden_states diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index 3d02e5d4..086947f3 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -11,7 +11,7 @@ from internlm.core.naive_amp import NaiveAMPModel from internlm.core.scheduler import SchedulerHook from internlm.model.embedding import Embedding1D -from internlm.model.linear import FSTPLinear, FSTPScaleColumnParallelLinear +from internlm.model.linear import FSTPLinear, ScaleColumnParallelLinear from internlm.model.utils import ( all_gather_raw, all_gather_raw_bias_memory_pool, @@ -55,11 +55,8 @@ def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> Non _chunk = _chunk.model for _chunk_name, children in _chunk.named_children(): - if isinstance(children, FSTPScaleColumnParallelLinear): + if isinstance(children, ScaleColumnParallelLinear): setattr(children, "_fstp_name", "head") - setattr(children.weight, "_fstp_reduce_scatter_str", f"head.weight") - if children.bias is not None: - setattr(children.bias, "_fstp_reduce_scatter_str", f"head.bias") self.head.append(children) elif isinstance(children, Embedding1D): self.embedding.append(children) @@ -168,7 +165,7 @@ def _get_bias_from_memory_pool(self, module: nn.Module): return self.all_gather_bias_memory_pool[block_index % 2][module._fstp_name] def get_weight_all_gather(self, module): - if self.enable_memory_pool and getattr(module, "_fstp_name") != "head": + if self.enable_memory_pool: return self._get_weight_from_memory_pool(module) else: return self.weight_global_output[module] @@ -205,7 +202,7 @@ def release_reduce_scatter_memory(self, key, index): self.reduce_scatter_memory_pool[key][index].idle = True def _all_gather_module_weight(self, module): - if self.enable_memory_pool and getattr(module, "_fstp_name") != "head": + if self.enable_memory_pool: if module.bias is not None: bias_handle = all_gather_raw_bias_memory_pool( module.bias, @@ -323,16 +320,6 @@ def _post_backward_hook_for_module(module, grad_input, 
grad_output): # pylint: _clear_handle(module) _clear_weight(module) - def _pre_hook_for_head(module: nn.Module, inputs: Any): # pylint: disable=W0613 - if module not in self.weight_global_handle: - self._all_gather_module_weight(module) - - _wait_handle(module) - - def _post_hook_for_head(module, grad_input, grad_output): # pylint: disable=W0613 - _clear_handle(module) - _clear_weight(module) - # register forward hooks # 1. register post_forward_hook @embedding module to prefetch for block 0 # 2. register pre_forward_hook @out_proj module to prefetch for next block, @@ -368,12 +355,6 @@ def _post_hook_for_head(module, grad_input, grad_output): # pylint: disable=W06 module.register_full_backward_pre_hook(_pre_backward_hook_for_module) module.register_full_backward_hook(_post_backward_hook_for_module) - for head in self.head: - head.register_forward_pre_hook(_pre_hook_for_head) - head.register_full_backward_pre_hook(_pre_hook_for_head) - head.register_forward_hook(_post_hook_for_head) - head.register_full_backward_hook(_post_hook_for_head) - class FSTPOverlapSchedulerHook(SchedulerHook): """ diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index f0421cd4..9658baec 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -390,6 +390,30 @@ def load_new_batch(train_dl: DataLoader, train_iter: Iterable, train_state: Trai return batch, train_iter +# def initialize_llm_profile(profiling: bool = False, start_time: str = None): +# """Initialize and return the profiler context manager instance.""" + +# if profiling and gpc.get_local_rank(ParallelMode.DATA) == 0 and gpc.get_local_rank(ParallelMode.TENSOR) == 0: +# llm_profile = torch.profiler.profile +# logger.info(f"Do profiling in rank {gpc.get_global_rank()}!") +# else: +# llm_profile = DummyProfile + +# return llm_profile( +# activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA], +# schedule=torch.profiler.schedule(skip_first=5, wait=1, warmup=1, active=1, repeat=1), +# on_trace_ready=torch.profiler.tensorboard_trace_handler( +# f"{gpc.config.JOB_NAME}/{start_time}/traces/rank{gpc.get_global_rank()}_" +# + f"dp{gpc.get_local_rank(ParallelMode.DATA)}_" +# + f"tp{gpc.get_local_rank(ParallelMode.TENSOR)}_" +# + f"pp{gpc.get_local_rank(ParallelMode.PIPELINE)}", +# ), +# with_stack=True, +# with_modules=True, +# profile_memory=True, +# ) + + def initialize_llm_profile(profiling: bool = False, start_time: str = None): """Initialize and return the profiler context manager instance.""" @@ -405,8 +429,8 @@ def initialize_llm_profile(profiling: bool = False, start_time: str = None): on_trace_ready=torch.profiler.tensorboard_trace_handler( f"{gpc.config.JOB_NAME}/{start_time}/traces/rank{gpc.get_global_rank()}_" + f"dp{gpc.get_local_rank(ParallelMode.DATA)}_" - + f"tp{gpc.get_local_rank(ParallelMode.TENSOR)}_" - + f"pp{gpc.get_local_rank(ParallelMode.PIPELINE)}", + + f"wp{gpc.get_local_rank(ParallelMode.WEIGHT)}_" + + f"sp{gpc.get_local_rank(ParallelMode.SEQUENCE)}", ), with_stack=True, with_modules=True, From 6cd271c3bc6625ea119533911857878d2af18436 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Fri, 1 Dec 2023 10:52:01 +0800 Subject: [PATCH 080/153] fix(model): fix process group error --- internlm/model/embedding.py | 8 ++++++-- internlm/model/loss.py | 2 +- internlm/model/modeling_internlm.py | 20 ++++++++++++++++++- internlm/model/multi_head_attention.py | 9 +++++++++ .../solver/optimizer/hybrid_zero_optim2.py | 13 
+++++++++--- internlm/train/training_internlm.py | 1 + sort_log.py | 17 ++++++++++++++++ 7 files changed, 63 insertions(+), 7 deletions(-) create mode 100644 sort_log.py diff --git a/internlm/model/embedding.py b/internlm/model/embedding.py index 4be47d64..225a5f16 100644 --- a/internlm/model/embedding.py +++ b/internlm/model/embedding.py @@ -44,7 +44,7 @@ def __init__( self.num_embeddings = num_embeddings self.embed_dim = embedding_dim - embed_dim_per_partition = embedding_dim // gpc.weight_parallel_size + embed_dim_per_partition = embedding_dim // gpc.sequence_parallel_size self.padding_idx = padding_idx self.embed_args = args @@ -55,10 +55,14 @@ def __init__( def forward(self, input_: Tensor) -> Tensor: output_parallel = F.embedding(input_, self.weight, self.padding_idx, *self.embed_args, **self.embed_kwargs) - output = gather_forward_split_backward(output_parallel, ParallelMode.WEIGHT, dim=-1) + output = gather_forward_split_backward(output_parallel, ParallelMode.SEQUENCE, dim=-1) if gpc.config.parallel.sequence > 1: output = split_forward_gather_backward(output, ParallelMode.SEQUENCE, dim=1) + # print( + # f"ht debug embed: rank:{gpc.get_global_rank()} output.shape:{output.shape} output:{output}", + # flush=True, + # ) return output diff --git a/internlm/model/loss.py b/internlm/model/loss.py index ac92b4b9..a634d2c7 100644 --- a/internlm/model/loss.py +++ b/internlm/model/loss.py @@ -28,7 +28,7 @@ def __init__(self, parallel_output=True, label_smoothing=0): self.loss_fn = FlashCrossEntropyLoss( reduction="mean", inplace_backward=True, - process_group=gpc.get_group(ParallelMode.TENSOR), + process_group=gpc.get_group(ParallelMode.SEQUENCE), label_smoothing=label_smoothing, ) # The loss in this place is bound to the gather_output initialized by VocabParallelClassifier1D else: diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index 1797f677..a937a3f4 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -149,9 +149,13 @@ def __init__( for param in self.norm1.parameters(): if gpc.config.parallel.sequence_parallel is True: setattr(param, IS_SEQUENCE_PARALLEL, True) + if gpc.config.parallel.weight.size > 1: + setattr(param, IS_SEQUENCE_PARALLEL, True) for param in self.norm2.parameters(): if gpc.config.parallel.sequence_parallel is True: setattr(param, IS_SEQUENCE_PARALLEL, True) + if gpc.config.parallel.weight.size > 1: + setattr(param, IS_SEQUENCE_PARALLEL, True) self.dropout2 = nn.Dropout(drop_rate) self.use_swiglu = use_swiglu @@ -240,7 +244,14 @@ def _dropout_and_norm_ffn(_residual, _hidden_states): if self.residual_in_fp32: residual = residual.to(torch.float32) + # print( + # f"ht debug mlp rank:{gpc.get_global_rank()} input.shape:{hidden_states.shape} input:{hidden_states}", + # flush=True, + # ) hidden_states = self.mlp(hidden_states) + # print( + # f"ht debug mlp rank:{gpc.get_global_rank()} out.shape:{hidden_states.shape} out:{hidden_states}", flush=True + # ) return hidden_states + residual @@ -376,7 +387,7 @@ def __init__( self.head = head_cls( in_features=hidden_size, out_features=gpc.get_world_size(ParallelMode.TENSOR) if is_reward else vocab_size, - process_group=gpc.get_group(ParallelMode.WEIGHT), + process_group=gpc.get_group(ParallelMode.SEQUENCE), bias=False, device=device, dtype=dtype, @@ -391,6 +402,8 @@ def __init__( for param in self.norm.parameters(): if gpc.config.parallel.sequence_parallel is True: setattr(param, IS_SEQUENCE_PARALLEL, True) + if gpc.config.parallel.weight.size > 1: + 
setattr(param, IS_SEQUENCE_PARALLEL, True) self.parallel_output = parallel_output @@ -441,6 +454,11 @@ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N else: # Training hidden_states = self.head(hidden_states, gather_dim=0) + # print( + # f"ht debug head rank:{gpc.get_global_rank()} hidden_states.shape:{hidden_states.shape} hidden_states:{hidden_states}", + # flush=True, + # ) + if not self.parallel_output: hidden_states = gather_forward_split_backward(hidden_states, ParallelMode.TENSOR, dim=-1) return hidden_states diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index d06cd967..77b05c6f 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -445,9 +445,11 @@ def _packed_forward(self, x, inference_params=None, **kwargs): split x during sequence parallel, we split the batch * seqlen dimension (in case batch is small). """ + # print(f"ht debug mha rank:{gpc.get_global_rank()} wqkv.shape:{self.Wqkv.weight.shape} wqkv:{self.Wqkv.weight}") qkv = self.Wqkv(x) # total x hsz' qkv = rearrange(qkv, "t (three h d) -> t three h d", three=3, d=self.head_dim) # total x 3 x n_head x d qkv = self.rotary_emb(qkv, **kwargs) + # print(f"ht debug mha rank:{gpc.get_global_rank()} qkv.shape:{qkv.shape} qkv:{qkv}", flush=True) kwargs.pop("indexes") if inference_params is None: if gpc.config.model.dtype is torch.float32 and gpc.config.model.use_flash_attn: @@ -462,5 +464,12 @@ def _packed_forward(self, x, inference_params=None, **kwargs): raise RuntimeError("Not support this right now") context = rearrange(context, "b h d -> b (h d)") # recover the shape + # print(f"ht debug mha rank:{gpc.get_global_rank()} context.shape:{context.shape} context:{context}") + # print( + # f"ht debug mha rank:{gpc.get_global_rank()} out_proj.shape:{self.out_proj.weight.shape} out_proj:{self.out_proj.weight}" + # ) out = self.out_proj(context) + + # print(f"ht debug mha rank:{gpc.get_global_rank()} out.shape:{out.shape} out:{out}") + return out diff --git a/internlm/solver/optimizer/hybrid_zero_optim2.py b/internlm/solver/optimizer/hybrid_zero_optim2.py index e2b3995a..fd34b265 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim2.py +++ b/internlm/solver/optimizer/hybrid_zero_optim2.py @@ -131,7 +131,7 @@ def __init__( if self._overlap_sync_param: assert self._param_bcast_sync_handler is not None - if gpc.config.parallel["weight"]["size"] >= 1 and gpc.config.parallel["weight"]["overlap"] is True: + if gpc.config.parallel["weight"]["size"] > 1 and gpc.config.parallel["weight"]["overlap"] is True: self._fstp_handler = gpc.fstp_handler else: self._fstp_handler = None @@ -314,7 +314,7 @@ def reduction_sp_func(): param.grad, dtype=None, dst_rank=reduce_rank, - parallel_mode=ParallelMode.GLOBAL, + parallel_mode=ParallelMode.WEIGHT, ) handle.wait() @@ -341,8 +341,15 @@ def reduce_grad_hook_sp(*args): # pylint: disable=W0613 # if sequence_parallel is True, # the grad of norm should be all-reduce across the tp process group + # if ( + # gpc.config.parallel.sequence_parallel is True + # and hasattr(param, IS_SEQUENCE_PARALLEL) + # and getattr(param, IS_SEQUENCE_PARALLEL) is True + # ): + # accum_grad_obj.register_hook(reduce_grad_hook_sp) + if ( - gpc.config.parallel.sequence_parallel is True + gpc.config.parallel.weight.size > 1 and hasattr(param, IS_SEQUENCE_PARALLEL) and getattr(param, IS_SEQUENCE_PARALLEL) is True ): diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 
9658baec..d48d99c3 100644
--- a/internlm/train/training_internlm.py
+++ b/internlm/train/training_internlm.py
@@ -192,6 +192,7 @@ def initialize_optimizer(model: Union[nn.Module, nn.ModuleList]):
             zero_cfg=gpc.config.hybrid_zero_optimizer,
             param_bcast_sync_handler=param_bcast_sync_handler,
         )
+        logger.info("use HybridZeroOptimizer2 for new partition strategy...")
     elif not gpc.config.parallel.zero1.fsdp:
         optimizer = HybridZeroOptimizer(
             naive_optimizer,
diff --git a/sort_log.py b/sort_log.py
new file mode 100644
index 00000000..786c2282
--- /dev/null
+++ b/sort_log.py
@@ -0,0 +1,17 @@
+import re
+
+# read the log file
+with open("ht.log", "r") as file:
+    log_content = file.read()
+
+# use a regex to extract log blocks that start with "ht debug" and end at the "device=..." field
+log_blocks = re.findall(r"ht debug.*?device=[^\n]*", log_content, re.DOTALL)
+
+# sort the log blocks in ascending order by the integer after "rank:"
+sorted_log_blocks = sorted(log_blocks, key=lambda x: int(re.search(r"rank:(\d+)", x).group(1)))
+
+# write the sorted log blocks to a new file
+with open("sorted.log", "w") as file:
+    file.write("\n\n".join(sorted_log_blocks))
+
+print("Log blocks have been sorted in ascending order by the integer after 'rank:' and saved to sorted.log.")
From 0817b8cf204aef1c0f0b3ec74cbc59acbf3f93a6 Mon Sep 17 00:00:00 2001
From: huangting4201 <1538303371@qq.com>
Date: Fri, 1 Dec 2023 15:18:49 +0800
Subject: [PATCH 081/153] fix(model): fix FSTP linear Torch process group

---
 internlm/model/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/internlm/model/utils.py b/internlm/model/utils.py
index 89980c07..04fa0efe 100644
--- a/internlm/model/utils.py
+++ b/internlm/model/utils.py
@@ -657,7 +657,7 @@ def backward(ctx, grad_output, *args):
         batch_dim = batch_shape.numel()
         grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1])
 
-        world_size = gpc.get_world_size(ParallelMode.TENSOR)
+        world_size = gpc.get_world_size(ParallelMode.WEIGHT)
         if world_size > 1:
             if overlap_handler is not None:
                 total_weight = gpc.fstp_handler.get_weight_all_gather(module=module)
From 1b7d2dc455ef2438ba084e97cc0b9f4ec658c965 Mon Sep 17 00:00:00 2001
From: huangting4201 <1538303371@qq.com>
Date: Thu, 7 Dec 2023 10:25:32 +0800
Subject: [PATCH 082/153] fix(overlap_handler.py): release module post backward
 when model ckpt is

---
 internlm/core/naive_amp.py        | 2 +-
 internlm/model/overlap_handler.py | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/internlm/core/naive_amp.py b/internlm/core/naive_amp.py
index 9bead52f..fb04759b 100644
--- a/internlm/core/naive_amp.py
+++ b/internlm/core/naive_amp.py
@@ -81,7 +81,7 @@ def _convert_to_fp16(self, input_: Any):
 
     def _convert_to_fp32(self, input_: Any):
         """Converts the input to fp32 if it is a Tensor of dtype float16."""
-        if isinstance(input_, Tensor) and input_.dtype == torch.float16:
+        if isinstance(input_, Tensor) and input_.dtype in (torch.float16, torch.bfloat16):
             input_ = input_.float()
         return input_
 
diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py
index 086947f3..c81b09d0 100644
--- a/internlm/model/overlap_handler.py
+++ b/internlm/model/overlap_handler.py
@@ -353,7 +353,9 @@ def _post_backward_hook_for_module(module, grad_input, grad_output):  # pylint:
 
         for module in self.fstp_modules:
             module.register_full_backward_pre_hook(_pre_backward_hook_for_module)
-            module.register_full_backward_hook(_post_backward_hook_for_module)
+
+        for module in self.fstp_modules:
+            module.register_full_backward_hook(_post_backward_hook_for_module)
 
From fd5a144724a19b00636178edc2792bc30b6d7ef7 Mon Sep 17 00:00:00 2001
From: huangting4201 <1538303371@qq.com> Date: Mon, 11 Dec 2023 17:17:35 +0800 Subject: [PATCH 083/153] feat(model): embedding and head use sp group and refactor parameter group --- internlm/core/context/__init__.py | 4 +- internlm/core/context/parallel_context.py | 4 +- internlm/model/modeling_internlm.py | 58 +++++++++------ internlm/model/multi_head_attention.py | 12 +-- .../solver/optimizer/hybrid_zero_optim2.py | 74 +++++++++++++------ internlm/solver/optimizer/store.py | 4 +- internlm/solver/optimizer/utils.py | 45 +++++++---- internlm/train/training_internlm.py | 4 +- internlm/train/utils.py | 67 ++++++++++++++++- internlm/utils/parallel.py | 39 ++++++---- 10 files changed, 224 insertions(+), 87 deletions(-) diff --git a/internlm/core/context/__init__.py b/internlm/core/context/__init__.py index e17b4ba3..f62d6a90 100644 --- a/internlm/core/context/__init__.py +++ b/internlm/core/context/__init__.py @@ -1,7 +1,9 @@ from .parallel_context import ( IS_SEQUENCE_PARALLEL, IS_TENSOR_PARALLEL, - IS_WEIGHT_PARALLEL, + IS_REPLICA_ZERO_PARALLEL, + IS_SEQUENCE_DATA_PARALLEL, + IS_WEIGHT_ZERO_PARALLEL, Config, ParallelContext, global_context, diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py index e25b3de6..c2fc574d 100644 --- a/internlm/core/context/parallel_context.py +++ b/internlm/core/context/parallel_context.py @@ -26,7 +26,9 @@ IS_TENSOR_PARALLEL = "is_tensor_parallel" IS_SEQUENCE_PARALLEL = "is_sequence_parallel" -IS_WEIGHT_PARALLEL = "is_weight_parallel" +IS_REPLICA_ZERO_PARALLEL = "is_replica_zero_parallel" +IS_SEQUENCE_DATA_PARALLEL = "is_sequence_data_parallel" +IS_WEIGHT_ZERO_PARALLEL = "is_weight_zero_parallel" logger = get_logger(__file__) diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index a937a3f4..400ad273 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -9,7 +9,14 @@ from flash_attn.modules.mlp import ParallelFusedMLP from torch import nn -from internlm.core.context import IS_SEQUENCE_PARALLEL, IS_TENSOR_PARALLEL, IS_WEIGHT_PARALLEL, ParallelMode +from internlm.core.context import ( + IS_SEQUENCE_PARALLEL, + IS_TENSOR_PARALLEL, + IS_REPLICA_ZERO_PARALLEL, + IS_SEQUENCE_DATA_PARALLEL, + IS_WEIGHT_ZERO_PARALLEL, + ParallelMode, +) from internlm.core.context.parallel_context import global_context as gpc from internlm.initialize.initialize_tensor import normal_, scaled_init_method_normal from internlm.model.embedding import Embedding1D @@ -142,20 +149,22 @@ def __init__( dtype=dtype, ) for _, param in self.mlp.named_parameters(): - if gpc.get_world_size(ParallelMode.TENSOR) > 1: - setattr(param, IS_TENSOR_PARALLEL, True) + # if gpc.get_world_size(ParallelMode.TENSOR) > 1: + # setattr(param, IS_TENSOR_PARALLEL, True) if gpc.get_world_size(ParallelMode.WEIGHT) > 1: - setattr(param, IS_WEIGHT_PARALLEL, True) + setattr(param, IS_WEIGHT_ZERO_PARALLEL, True) for param in self.norm1.parameters(): - if gpc.config.parallel.sequence_parallel is True: - setattr(param, IS_SEQUENCE_PARALLEL, True) - if gpc.config.parallel.weight.size > 1: - setattr(param, IS_SEQUENCE_PARALLEL, True) + # if gpc.config.parallel.sequence_parallel is True: + # setattr(param, IS_SEQUENCE_PARALLEL, True) + # if gpc.config.parallel.weight.size > 1: + # setattr(param, IS_SEQUENCE_PARALLEL, True) + setattr(param, IS_REPLICA_ZERO_PARALLEL, True) for param in self.norm2.parameters(): - if gpc.config.parallel.sequence_parallel is True: - setattr(param, IS_SEQUENCE_PARALLEL, True) - if 
gpc.config.parallel.weight.size > 1: - setattr(param, IS_SEQUENCE_PARALLEL, True) + # if gpc.config.parallel.sequence_parallel is True: + # setattr(param, IS_SEQUENCE_PARALLEL, True) + # if gpc.config.parallel.weight.size > 1: + # setattr(param, IS_SEQUENCE_PARALLEL, True) + setattr(param, IS_REPLICA_ZERO_PARALLEL, True) self.dropout2 = nn.Dropout(drop_rate) self.use_swiglu = use_swiglu @@ -348,10 +357,10 @@ def __init__( ) for _, param in self.embedding.named_parameters(): normal_(std=0.0052)(param) - if gpc.get_world_size(ParallelMode.TENSOR) > 1: - setattr(param, IS_TENSOR_PARALLEL, True) - if gpc.get_world_size(ParallelMode.WEIGHT) > 1: - setattr(param, IS_WEIGHT_PARALLEL, True) + # if gpc.get_world_size(ParallelMode.TENSOR) > 1: + # setattr(param, IS_TENSOR_PARALLEL, True) + if gpc.get_world_size(ParallelMode.SEQUENCE) > 1: + setattr(param, IS_SEQUENCE_DATA_PARALLEL, True) self.embed_grad_scale = embed_grad_scale self.blocks = nn.ModuleList( [ @@ -395,15 +404,16 @@ def __init__( ) for _, param in self.head.named_parameters(): normal_(std=0.0052)(param) - if gpc.get_world_size(ParallelMode.TENSOR) > 1: - setattr(param, IS_TENSOR_PARALLEL, True) - if gpc.get_world_size(ParallelMode.WEIGHT) > 1: - setattr(param, IS_WEIGHT_PARALLEL, True) + # if gpc.get_world_size(ParallelMode.TENSOR) > 1: + # setattr(param, IS_TENSOR_PARALLEL, True) + if gpc.get_world_size(ParallelMode.SEQUENCE) > 1: + setattr(param, IS_SEQUENCE_DATA_PARALLEL, True) for param in self.norm.parameters(): - if gpc.config.parallel.sequence_parallel is True: - setattr(param, IS_SEQUENCE_PARALLEL, True) - if gpc.config.parallel.weight.size > 1: - setattr(param, IS_SEQUENCE_PARALLEL, True) + # if gpc.config.parallel.sequence_parallel is True: + # setattr(param, IS_SEQUENCE_PARALLEL, True) + # if gpc.config.parallel.weight.size > 1: + # setattr(param, IS_SEQUENCE_PARALLEL, True) + setattr(param, IS_REPLICA_ZERO_PARALLEL, True) self.parallel_output = parallel_output diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index 77b05c6f..5d9e0a40 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -10,7 +10,7 @@ import torch.nn.functional as F from einops import rearrange -from internlm.core.context import IS_WEIGHT_PARALLEL +from internlm.core.context import IS_WEIGHT_ZERO_PARALLEL try: from flash_attn.flash_attn_interface import flash_attn_unpadded_func @@ -235,14 +235,14 @@ def __init__( **factory_kwargs, ) # need to assign tp attribute so that internlm know it is tensor parallel module - if gpc.get_world_size(ParallelMode.TENSOR) > 1: - for name in ["out_proj", "Wqkv"]: - for param in getattr(self, name).parameters(): - setattr(param, IS_TENSOR_PARALLEL, True) + # if gpc.get_world_size(ParallelMode.TENSOR) > 1: + # for name in ["out_proj", "Wqkv"]: + # for param in getattr(self, name).parameters(): + # setattr(param, IS_TENSOR_PARALLEL, True) if gpc.get_world_size(ParallelMode.WEIGHT) > 1: for name in ["out_proj", "Wqkv"]: for param in getattr(self, name).parameters(): - setattr(param, IS_WEIGHT_PARALLEL, True) + setattr(param, IS_WEIGHT_ZERO_PARALLEL, True) def forward(self, x, seqlen=None, inference_params=None, **kwargs): if kwargs.get("indexes", None) is not None: diff --git a/internlm/solver/optimizer/hybrid_zero_optim2.py b/internlm/solver/optimizer/hybrid_zero_optim2.py index fd34b265..3bbf2678 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim2.py +++ b/internlm/solver/optimizer/hybrid_zero_optim2.py @@ -9,8 +9,9 @@ import 
torch.distributed as dist from torch.optim import Optimizer -from internlm.core.context import IS_SEQUENCE_PARALLEL, Config, ParallelMode +from internlm.core.context import IS_SEQUENCE_PARALLEL, IS_REPLICA_ZERO_PARALLEL, Config, ParallelMode from internlm.core.context import global_context as gpc +from internlm.core.context.parallel_context import IS_SEQUENCE_DATA_PARALLEL from internlm.monitor import send_alert_message from internlm.solver.optimizer.store import ( BucketStore, @@ -150,17 +151,24 @@ def __init__( # to find real zero mode. if zero is not used, set all param group as ParallelMode.ZERO1 # if zero is used, expert dp group will use ParallelMode.EXPERT_DATA as the real zero mode - zero_mode = ( - ParallelMode.ZERO1 - if param_group["dp_mode"] == gpc.get_world_size(ParallelMode.ZERO1) == 1 or ParallelMode.DATA - else ParallelMode.EXPERT_DATA - ) + # zero_mode = ( + # ParallelMode.ZERO1 + # if param_group["dp_mode"] == gpc.get_world_size(ParallelMode.ZERO1) == 1 or ParallelMode.DATA + # else ParallelMode.EXPERT_DATA + # ) + zero_mode = param_group["optimizer_mode"] + self._zero_local_rank.append(gpc.get_local_rank(zero_mode)) self._zero_world_size.append(gpc.get_world_size(zero_mode)) # TODO _broadcast_parallel_mode is not only used in broadcast, maybe can change its name self._broadcast_parallel_mode.append(zero_mode) - self._bucket_store.append(BucketStore(group_id, ParallelMode.WEIGHT_DATA)) - self._accum_grad_buckets.append(BucketStore(group_id, ParallelMode.WEIGHT_DATA)) + + grad_reduce_mode = ParallelMode.WEIGHT_DATA + if param_group["name"] == "embed_head": + grad_reduce_mode = ParallelMode.DATA + + self._bucket_store.append(BucketStore(group_id, grad_reduce_mode)) + self._accum_grad_buckets.append(BucketStore(group_id, grad_reduce_mode)) # assign parameters to ranks the params in the list are sorted params_per_rank, no_params_ranks = self._partition_param_list(group_id, param_group) @@ -184,15 +192,26 @@ def __init__( param.data = param.data.cpu() # flatten the reordered tensors - for rank in range(self._zero_world_size[group_id]): - # No flat fp16 buffer is allocated if the process has no parameters. - if rank not in self.param_group_no_params_ranks[group_id]: - tensor_list = self._param_store.get_fp16_params_by_rank_group(rank, group_id) - with torch.no_grad(): - flat_tensor = flatten(tensor_list) - flat_tensor = flat_tensor.data.cuda() - self._param_store.add_flat_fp16_param_by_rank_group(rank, group_id, flat_tensor) - sync_param(flat_tensor=flat_tensor, tensor_list=tensor_list) + if param_group["name"] == "embed_head": + tensor_list = self._param_store.get_fp16_params_by_rank_group(self._zero_local_rank[group_id], group_id) + with torch.no_grad(): + flat_tensor = flatten(tensor_list) + flat_tensor = flat_tensor.data.cuda() + sync_param(flat_tensor=flat_tensor, tensor_list=tensor_list) + # for rank in range(self._zero_world_size[group_id]): + self._param_store.add_flat_fp16_param_by_rank_group( + self._zero_local_rank[group_id], group_id, flat_tensor + ) + else: + for rank in range(self._zero_world_size[group_id]): + # No flat fp16 buffer is allocated if the process has no parameters. 
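The flatten/sync step above packs the fp16 parameters a rank owns into one contiguous buffer and re-points each original tensor into that buffer, so later reduce/broadcast calls operate on a single flat tensor. A minimal, self-contained sketch of the pattern using torch's internal flatten helpers (the name flatten_and_sync is illustrative, not the repo's):

    import torch
    from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

    def flatten_and_sync(tensor_list):
        # pack into one contiguous buffer, then alias each original tensor to
        # its slice of that buffer (this is what sync_param accomplishes above)
        flat = _flatten_dense_tensors(tensor_list)
        for old, new in zip(tensor_list, _unflatten_dense_tensors(flat, tensor_list)):
            old.data = new.data
        return flat

    params = [torch.randn(4, 4).half() for _ in range(3)]
    flat_fp16 = flatten_and_sync(params)
    assert params[0].data_ptr() == flat_fp16.data_ptr()  # first param aliases the buffer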
+ if rank not in self.param_group_no_params_ranks[group_id]: + tensor_list = self._param_store.get_fp16_params_by_rank_group(rank, group_id) + with torch.no_grad(): + flat_tensor = flatten(tensor_list) + flat_tensor = flat_tensor.data.cuda() + self._param_store.add_flat_fp16_param_by_rank_group(rank, group_id, flat_tensor) + sync_param(flat_tensor=flat_tensor, tensor_list=tensor_list) # create a copy of fp32 weights of the parameters for which this rank is responsible # No flat fp32 buffer is allocated if the process has no parameters. @@ -222,8 +241,6 @@ def __init__( # flag used to skip unnecessary gradient reduce operation when gradient accumulation is enabled. self.skip_grad_reduce = False - # reduction hook is only used if overlapping communication - # if it is stage 1 without overlapping, no hook will be attached self._attach_reduction_hook() @property @@ -244,6 +261,10 @@ def num_param_groups(self): def _partition_param_list(self, group_id, param_group): no_params_ranks = [] + if param_group["name"] == "embed_head": + params_per_rank = [param_group["params"] for _ in range(self._zero_world_size[group_id])] + return params_per_rank, set(no_params_ranks) + params_per_rank = [[] for _ in range(self._zero_world_size[group_id])] numel_per_rank = [0 for _ in range(self._zero_world_size[group_id])] self.params_per_rank_id_dict.append([[] for _ in range(self._zero_world_size[group_id])]) @@ -350,8 +371,8 @@ def reduce_grad_hook_sp(*args): # pylint: disable=W0613 if ( gpc.config.parallel.weight.size > 1 - and hasattr(param, IS_SEQUENCE_PARALLEL) - and getattr(param, IS_SEQUENCE_PARALLEL) is True + and hasattr(param, IS_REPLICA_ZERO_PARALLEL) + and getattr(param, IS_REPLICA_ZERO_PARALLEL) is True ): accum_grad_obj.register_hook(reduce_grad_hook_sp) @@ -382,9 +403,9 @@ def belongs_to_current_rank(self, param) -> bool: :return: True if the parameter should be updated by the current rank. Otherwise false. :rtype: bool """ - tensor_rank = self._param_store.get_param_rank(param) + tensor_ranks = self._param_store.get_param_rank(param) group_id = getattr(param, "group_id") - return tensor_rank == gpc.get_local_rank(self._broadcast_parallel_mode[group_id]) + return gpc.get_local_rank(self._broadcast_parallel_mode[group_id]) in tensor_ranks def _accum_grads_store_in_bucket(self, bucket: BucketStore, reduce_rank: Optional[int] = None) -> None: for _param in bucket.get_param(reduce_rank): @@ -654,6 +675,11 @@ def step(self, closure=None): """ assert closure is None, "closure is not supported by step()" + # import pdb + + # if gpc.get_global_rank() == 0: + # pdb.set_trace() + # if not overlapping communication (no reduction hook is attached) # we need to manually reduce these gradients if not self._overlap_sync_grad: @@ -859,6 +885,8 @@ def broadcast_params(self): handles = [] for group_id in range(self.num_param_groups): + if self.param_groups[group_id]["name"] == "embed_head": + continue for rank in range(self._zero_world_size[group_id]): # The following operations are performed only on the rank to which parameters are assigned. 
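The continue for the "embed_head" group above is the counterpart of its replicated partition: every rank in that group keeps and updates the full parameter list, so there is nothing to broadcast after step(). A sketch of the resulting broadcast loop, assuming an initialized torch.distributed process group; flat_shards[g][r] stands in for the flat fp16 tensor of group g owned by zero-rank r, and global_src maps a zero-rank to its global rank:

    import torch.distributed as dist

    def broadcast_updated_params(param_groups, flat_shards, zero_group, global_src):
        handles = []
        for g, pgroup in enumerate(param_groups):
            if pgroup["name"] == "embed_head":
                continue  # replicated group: all ranks already hold the updated params
            for r, shard in enumerate(flat_shards[g]):
                # each shard owner pushes its freshly updated fp16 shard
                handles.append(
                    dist.broadcast(shard, src=global_src(r), group=zero_group, async_op=True)
                )
        for handle in handles:
            handle.wait()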
if rank in self.param_group_no_params_ranks[group_id]: diff --git a/internlm/solver/optimizer/store.py b/internlm/solver/optimizer/store.py index f486ccec..c42f1a56 100644 --- a/internlm/solver/optimizer/store.py +++ b/internlm/solver/optimizer/store.py @@ -177,8 +177,10 @@ def set_param_to_rank(self, tensor: Tensor, rank: int) -> None: :param rank: The rank of which the process is responsible for updating the parameter :type rank: int """ + if tensor not in self._fp16_param_to_rank: + self._fp16_param_to_rank[tensor] = [] - self._fp16_param_to_rank[tensor] = rank + self._fp16_param_to_rank[tensor].append(rank) def get_param_rank(self, tensor: Tensor) -> int: """ diff --git a/internlm/solver/optimizer/utils.py b/internlm/solver/optimizer/utils.py index b2ea0391..223fddf1 100644 --- a/internlm/solver/optimizer/utils.py +++ b/internlm/solver/optimizer/utils.py @@ -14,10 +14,20 @@ from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc +from internlm.core.context.parallel_context import ( + IS_REPLICA_ZERO_PARALLEL, + IS_SEQUENCE_DATA_PARALLEL, + IS_WEIGHT_ZERO_PARALLEL, +) from internlm.core.naive_amp import NaiveAMPModel from internlm.utils.common import get_current_device, get_tensor_norm, move_norm_to_cuda from internlm.utils.logger import get_logger -from internlm.utils.parallel import is_model_parallel_parameter, is_weight_parallel_parameter +from internlm.utils.parallel import ( + is_model_parallel_parameter, + is_replica_zero_parallel_parameter, + is_sequence_data_parallel_parameter, + is_weight_zero_parallel_parameter, +) logger = get_logger(__file__) @@ -244,12 +254,14 @@ def append_grad(g, p): ): # if not used in each chunk, such as layernorm append_grad(g, p) elif ( - gpc.is_initialized(ParallelMode.WEIGHT) - and not is_weight_parallel_parameter(p) - and gpc.get_local_rank(ParallelMode.WEIGHT) == 0 - ): # if not used in each chunk, such as layernorm + is_replica_zero_parallel_parameter(p) and gpc.get_global_rank(ParallelMode.GLOBAL) == 0 + ): # if not used in each chunk, such as layernorm IS_REPLICA_ZERO_PARALLEL parameter group + append_grad(g, p) + elif gpc.is_initialized(ParallelMode.SEQUENCE) and is_sequence_data_parallel_parameter(p): + # process all ranks for IS_SEQUENCE_DATA_PARALLEL parameter group append_grad(g, p) - elif is_weight_parallel_parameter(p): + elif gpc.is_initialized(ParallelMode.WEIGHT) and is_weight_zero_parallel_parameter(p): + # process all ranks for IS_WEIGHT_ZERO_PARALLEL parameter group append_grad(g, p) elif is_model_parallel_parameter(p): append_grad(g, p) @@ -320,16 +332,23 @@ def compute_norm( total_norm = total_norm + previous_norm # Sum across all model-parallel GPUs. - if gpc.is_initialized(ParallelMode.WEIGHT): - dist.all_reduce( - total_norm, - op=dist.ReduceOp.SUM, - group=gpc.get_group(ParallelMode.WEIGHT), - ) + if hasattr(parameters[0], IS_WEIGHT_ZERO_PARALLEL) and getattr(parameters[0], IS_WEIGHT_ZERO_PARALLEL): + if gpc.is_initialized(ParallelMode.WEIGHT): + dist.all_reduce( + total_norm, + op=dist.ReduceOp.SUM, + group=gpc.get_group(ParallelMode.WEIGHT), + ) # This is because we use zero1, so we need to use this reduction. # TODO: Check zero group to be a subset of dp group. 
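The store.py change earlier in this patch turns parameter ownership into a one-to-many relation: with the replicated "embed_head" group, the same parameter is updated by every rank of its zero group, so set_param_to_rank appends and the ownership test becomes a membership check. A self-contained sketch (the dict is keyed by parameter identity):

    import torch

    param_to_ranks = {}  # param -> list of owning zero-ranks

    def set_param_to_rank(param, rank):
        param_to_ranks.setdefault(param, []).append(rank)

    def belongs_to_current_rank(param, local_rank):
        return local_rank in param_to_ranks[param]

    p = torch.nn.Parameter(torch.zeros(2))
    for r in range(4):  # replicated case: every zero-rank owns the parameter
        set_param_to_rank(p, r)
    assert belongs_to_current_rank(p, 3)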
- dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(zero_mode)) + if (hasattr(parameters[0], IS_REPLICA_ZERO_PARALLEL) and getattr(parameters[0], IS_REPLICA_ZERO_PARALLEL)) or ( + hasattr(parameters[0], IS_WEIGHT_ZERO_PARALLEL) and getattr(parameters[0], IS_WEIGHT_ZERO_PARALLEL) + ): + dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(zero_mode)) + + if hasattr(parameters[0], IS_SEQUENCE_DATA_PARALLEL) and getattr(parameters[0], IS_SEQUENCE_DATA_PARALLEL): + dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.SEQUENCE)) if torch.is_tensor(total_norm): total_norm = total_norm.item() diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index d48d99c3..8d786489 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -58,7 +58,7 @@ set_model_params_layer_name, sync_model_param, sync_model_param_within_tp, - sync_model_param_within_wp, + sync_model_replica_param_group, ) from internlm.utils.registry import MODEL_INITIALIZER from internlm.utils.timeout import llm_timeout @@ -107,7 +107,7 @@ def initialize_model(): # the same across tensor parallelism. sync_model_param_within_tp(model) - sync_model_param_within_wp(model) + sync_model_replica_param_group(model) # Change random state mode to ParallelMode.DATA after model is built, guaranteeing the random # state in the same dp group are all the same. diff --git a/internlm/train/utils.py b/internlm/train/utils.py index 9096a2a4..0e866eb5 100644 --- a/internlm/train/utils.py +++ b/internlm/train/utils.py @@ -2,7 +2,7 @@ import torch -from internlm.core.context.parallel_context import ParallelMode +from internlm.core.context.parallel_context import IS_SEQUENCE_DATA_PARALLEL, ParallelMode from internlm.core.context.parallel_context import global_context as gpc from internlm.model.utils import is_gate_param, is_moe_param, is_norm_param @@ -81,6 +81,69 @@ def split_params_into_different_groups_for_optimizer(param_groups: Tuple[Dict]) return tuple(param_groups) +def split_params_into_different_groups_for_optimizer_with_new_partition_strategy( + param_groups: Tuple[Dict], +) -> Tuple[Dict]: + """Split parameters into different groups for optimizer + + Args: + param_groups (Tuple[Dict]): The list of parameter groups to split + Input Example: + >>> ( + >>> {'name': 'default', 'params': [tensor], 'weight_decay' :xxx}, + >>> ) + + Returns: + Tuple[Dict]: list of params groups for optimizer + Output Example: + >>> ( + >>> {'name': 'default','params': [tensor],'weight_decay' :xxx}, + >>> {'name': 'embed_head', 'params': [tensor],'weight_decay' :xxx}, + >>> ) + """ + + if isinstance(param_groups, tuple): + param_groups = list(param_groups) # Tuple cannot be modified + elif isinstance(param_groups, dict): + param_groups = [param_groups] + elif not isinstance(param_groups, list): + raise ValueError(f"Unknown param group type of {type(param_groups)}") + + # print(f"ht debug params_groups before split total len:{len(param_groups[0]['params'])}", flush=True) + + # create new groups for IS_SEQUENCE_DATA_PARALLEL parameter group + new_groups = {} + new_groups["embed_head"] = {"name": "embed_head", "params": [], "optimizer_mode": ParallelMode.DATA} + + for pgroup in param_groups: + # copy attribute from origin group, we assume the input param_groups only + # have one group, so the attribute will not be copyed multiple times. 
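A compact, self-contained sketch of the splitting rule this new function implements: parameters tagged IS_SEQUENCE_DATA_PARALLEL (the embedding and head) move to a new "embed_head" group whose optimizer state is managed over the data-parallel group, while everything else stays in the default group driven by ZeRO-1. The attribute name matches this series; the mode values here are plain string stand-ins for the ParallelMode enums:

    IS_SEQUENCE_DATA_PARALLEL = "is_sequence_data_parallel"

    def split_param_groups(pgroup):
        # copy shared attributes (weight_decay etc.) into the new group
        embed_head = {k: v for k, v in pgroup.items() if k not in ("name", "params")}
        embed_head.update(name="embed_head", params=[], optimizer_mode="data")
        default = dict(pgroup, params=[], optimizer_mode="zero1")
        for p in pgroup["params"]:
            sp_data = getattr(p, IS_SEQUENCE_DATA_PARALLEL, False)
            (embed_head if sp_data else default)["params"].append(p)
        return default, embed_head

With this split in place, the optimizer reads optimizer_mode per group instead of deriving a single zero mode for the whole model.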
+ for ori_key in pgroup.keys(): + if ori_key not in ("name", "params"): + for _, group in new_groups.items(): + group[ori_key] = pgroup[ori_key] + # assign param + origin_params = [] + for param in pgroup["params"]: + if hasattr(param, IS_SEQUENCE_DATA_PARALLEL) and getattr(param, IS_SEQUENCE_DATA_PARALLEL) is True: + new_groups["embed_head"]["params"].append(param) + else: + origin_params.append(param) + + # default param group, which is the first group in the param groups + pgroup["params"] = origin_params + pgroup["optimizer_mode"] = ParallelMode.ZERO1 + + # param groups may contain empty groups, such as fp32 + param_groups.extend(new_groups.values()) + + # print(f"ht debug params_groups after split default len:{len(param_groups[0]['params'])}", flush=True) + # print(f"ht debug params_groups after split embed_head len:{len(param_groups[1]['params'])}", flush=True) + + return tuple(param_groups) + + def create_param_groups(model, weight_decay): parameters = {"params": list(model.parameters()), "name": "default", "weight_decay": weight_decay} - return split_params_into_different_groups_for_optimizer(parameters) + # return split_params_into_different_groups_for_optimizer(parameters) + return split_params_into_different_groups_for_optimizer_with_new_partition_strategy(parameters) diff --git a/internlm/utils/parallel.py b/internlm/utils/parallel.py index 3399491c..2b421b07 100644 --- a/internlm/utils/parallel.py +++ b/internlm/utils/parallel.py @@ -4,7 +4,13 @@ import torch.distributed as dist from torch import nn -from internlm.core.context import IS_TENSOR_PARALLEL, IS_WEIGHT_PARALLEL, ParallelMode +from internlm.core.context import ( + IS_TENSOR_PARALLEL, + IS_REPLICA_ZERO_PARALLEL, + IS_SEQUENCE_DATA_PARALLEL, + IS_WEIGHT_ZERO_PARALLEL, + ParallelMode, +) from internlm.core.context import global_context as gpc from internlm.core.naive_amp import NaiveAMPModel @@ -13,8 +19,16 @@ def is_model_parallel_parameter(p): return hasattr(p, IS_TENSOR_PARALLEL) and getattr(p, IS_TENSOR_PARALLEL) -def is_weight_parallel_parameter(p): - return hasattr(p, IS_WEIGHT_PARALLEL) and getattr(p, IS_WEIGHT_PARALLEL) +def is_replica_zero_parallel_parameter(p): + return hasattr(p, IS_REPLICA_ZERO_PARALLEL) and getattr(p, IS_REPLICA_ZERO_PARALLEL) + + +def is_sequence_data_parallel_parameter(p): + return hasattr(p, IS_SEQUENCE_DATA_PARALLEL) and getattr(p, IS_SEQUENCE_DATA_PARALLEL) + + +def is_weight_zero_parallel_parameter(p): + return hasattr(p, IS_WEIGHT_ZERO_PARALLEL) and getattr(p, IS_WEIGHT_ZERO_PARALLEL) def sync_model_param(model): @@ -56,24 +70,21 @@ def sync_model_param_within_tp(model): dist.broadcast(param, src=ranks[0], group=gpc.get_group(parallel_mode)) -def sync_model_param_within_wp(model): +def sync_model_replica_param_group(model): r"""This function is changed from colossalai, which is ``sync_model_param``. - We modified this function to make sure it only sync parameters within tensor parallelism - but they are not splitted by tensor parallelism. - This function is used to make sure parameters that are not splitted by tensor parallelism - are the same across each tensor parallelism. + We modified this function to make sure it only sync IS_REPLICA_ZERO_PARALLEL parameters in world size. + This function is used to make sure parameters that are not splitted are the same across each rank. For example, parameters like RMSNorm, LayerNorm... Args: model (:class:`torch.nn.Module`): A pyTorch model on whose parameters you check the consistency. 
""" - parallel_mode = ParallelMode.WEIGHT - if gpc.is_initialized(parallel_mode) and gpc.get_world_size(parallel_mode) > 1: - for param in model.parameters(): - if not is_weight_parallel_parameter(param): - ranks = gpc.get_ranks_in_group(parallel_mode) - dist.broadcast(param, src=ranks[0], group=gpc.get_group(parallel_mode)) + + for param in model.parameters(): + if is_replica_zero_parallel_parameter(param): + ranks = gpc.get_ranks_in_group(ParallelMode.GLOBAL) + dist.broadcast(param, src=ranks[0], group=gpc.get_group(ParallelMode.GLOBAL)) def get_parallel_log_file_name(): From ac72710a68c05d2c1f23af44f1d9147dedab034e Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Tue, 12 Dec 2023 11:30:08 +0800 Subject: [PATCH 084/153] feat(model): modify grad norm compute func --- .../solver/optimizer/hybrid_zero_optim2.py | 50 +++++++++---------- internlm/solver/optimizer/utils.py | 27 ++++++---- internlm/train/utils.py | 6 ++- internlm/utils/parallel.py | 12 ++++- 4 files changed, 56 insertions(+), 39 deletions(-) diff --git a/internlm/solver/optimizer/hybrid_zero_optim2.py b/internlm/solver/optimizer/hybrid_zero_optim2.py index 3bbf2678..fbfa20cd 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim2.py +++ b/internlm/solver/optimizer/hybrid_zero_optim2.py @@ -192,26 +192,26 @@ def __init__( param.data = param.data.cpu() # flatten the reordered tensors - if param_group["name"] == "embed_head": - tensor_list = self._param_store.get_fp16_params_by_rank_group(self._zero_local_rank[group_id], group_id) - with torch.no_grad(): - flat_tensor = flatten(tensor_list) - flat_tensor = flat_tensor.data.cuda() - sync_param(flat_tensor=flat_tensor, tensor_list=tensor_list) - # for rank in range(self._zero_world_size[group_id]): - self._param_store.add_flat_fp16_param_by_rank_group( - self._zero_local_rank[group_id], group_id, flat_tensor - ) - else: - for rank in range(self._zero_world_size[group_id]): - # No flat fp16 buffer is allocated if the process has no parameters. - if rank not in self.param_group_no_params_ranks[group_id]: - tensor_list = self._param_store.get_fp16_params_by_rank_group(rank, group_id) - with torch.no_grad(): - flat_tensor = flatten(tensor_list) - flat_tensor = flat_tensor.data.cuda() - self._param_store.add_flat_fp16_param_by_rank_group(rank, group_id, flat_tensor) - sync_param(flat_tensor=flat_tensor, tensor_list=tensor_list) + # if param_group["name"] == "embed_head": + # tensor_list = self._param_store.get_fp16_params_by_rank_group(self._zero_local_rank[group_id], group_id) + # with torch.no_grad(): + # flat_tensor = flatten(tensor_list) + # flat_tensor = flat_tensor.data.cuda() + # sync_param(flat_tensor=flat_tensor, tensor_list=tensor_list) + # # for rank in range(self._zero_world_size[group_id]): + # self._param_store.add_flat_fp16_param_by_rank_group( + # self._zero_local_rank[group_id], group_id, flat_tensor + # ) + # else: + for rank in range(self._zero_world_size[group_id]): + # No flat fp16 buffer is allocated if the process has no parameters. 
+ if rank not in self.param_group_no_params_ranks[group_id]: + tensor_list = self._param_store.get_fp16_params_by_rank_group(rank, group_id) + with torch.no_grad(): + flat_tensor = flatten(tensor_list) + flat_tensor = flat_tensor.data.cuda() + self._param_store.add_flat_fp16_param_by_rank_group(rank, group_id, flat_tensor) + sync_param(flat_tensor=flat_tensor, tensor_list=tensor_list) # create a copy of fp32 weights of the parameters for which this rank is responsible # No flat fp32 buffer is allocated if the process has no parameters. @@ -261,9 +261,9 @@ def num_param_groups(self): def _partition_param_list(self, group_id, param_group): no_params_ranks = [] - if param_group["name"] == "embed_head": - params_per_rank = [param_group["params"] for _ in range(self._zero_world_size[group_id])] - return params_per_rank, set(no_params_ranks) + # if param_group["name"] == "embed_head": + # params_per_rank = [param_group["params"] for _ in range(self._zero_world_size[group_id])] + # return params_per_rank, set(no_params_ranks) params_per_rank = [[] for _ in range(self._zero_world_size[group_id])] numel_per_rank = [0 for _ in range(self._zero_world_size[group_id])] @@ -885,8 +885,8 @@ def broadcast_params(self): handles = [] for group_id in range(self.num_param_groups): - if self.param_groups[group_id]["name"] == "embed_head": - continue + # if self.param_groups[group_id]["name"] == "embed_head": + # continue for rank in range(self._zero_world_size[group_id]): # The following operations are performed only on the rank to which parameters are assigned. if rank in self.param_group_no_params_ranks[group_id]: diff --git a/internlm/solver/optimizer/utils.py b/internlm/solver/optimizer/utils.py index 223fddf1..7e760b85 100644 --- a/internlm/solver/optimizer/utils.py +++ b/internlm/solver/optimizer/utils.py @@ -254,13 +254,13 @@ def append_grad(g, p): ): # if not used in each chunk, such as layernorm append_grad(g, p) elif ( - is_replica_zero_parallel_parameter(p) and gpc.get_global_rank(ParallelMode.GLOBAL) == 0 + is_replica_zero_parallel_parameter(p) and gpc.get_local_rank(ParallelMode.WEIGHT) == 0 ): # if not used in each chunk, such as layernorm IS_REPLICA_ZERO_PARALLEL parameter group append_grad(g, p) - elif gpc.is_initialized(ParallelMode.SEQUENCE) and is_sequence_data_parallel_parameter(p): + elif is_sequence_data_parallel_parameter(p): # process all ranks for IS_SEQUENCE_DATA_PARALLEL parameter group append_grad(g, p) - elif gpc.is_initialized(ParallelMode.WEIGHT) and is_weight_zero_parallel_parameter(p): + elif is_weight_zero_parallel_parameter(p): # process all ranks for IS_WEIGHT_ZERO_PARALLEL parameter group append_grad(g, p) elif is_model_parallel_parameter(p): @@ -332,23 +332,28 @@ def compute_norm( total_norm = total_norm + previous_norm # Sum across all model-parallel GPUs. - if hasattr(parameters[0], IS_WEIGHT_ZERO_PARALLEL) and getattr(parameters[0], IS_WEIGHT_ZERO_PARALLEL): + if hasattr(parameters[0], IS_SEQUENCE_DATA_PARALLEL) and getattr(parameters[0], IS_SEQUENCE_DATA_PARALLEL): + dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.SEQUENCE)) + else: if gpc.is_initialized(ParallelMode.WEIGHT): dist.all_reduce( total_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.WEIGHT), ) + if gpc.is_initialized(ParallelMode.PIPELINE): + dist.all_reduce( + total_norm, + op=dist.ReduceOp.SUM, + group=gpc.get_group(ParallelMode.PIPELINE), + ) # This is because we use zero1, so we need to use this reduction. 
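The reduction order above is the crux of the grad-norm fix: each rank computes a partial sum of squared gradient norms over its local shards, that sum is all-reduced over every group along which the parameters are split or replicated (the sequence group for embed/head parameters, otherwise the weight and pipeline groups followed by the zero group), and only then is the root taken. A sketch of the aggregation, assuming an initialized process group and caller-supplied group handles:

    import torch
    import torch.distributed as dist

    def aggregate_grad_norm(partial_sq_sum, groups, norm_type=2.0):
        # partial_sq_sum: this rank's sum of |g|**norm_type over its local shards
        total = torch.tensor(float(partial_sq_sum), device="cuda")
        for group in groups:  # e.g. [weight_group, pipeline_group, zero_group]
            dist.all_reduce(total, op=dist.ReduceOp.SUM, group=group)
        return total.item() ** (1.0 / norm_type)

Because the groups are orthogonal, the sequential reductions compose into one global sum over all shards of the parameters being normed.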
# TODO: Check zero group to be a subset of dp group. - if (hasattr(parameters[0], IS_REPLICA_ZERO_PARALLEL) and getattr(parameters[0], IS_REPLICA_ZERO_PARALLEL)) or ( - hasattr(parameters[0], IS_WEIGHT_ZERO_PARALLEL) and getattr(parameters[0], IS_WEIGHT_ZERO_PARALLEL) - ): - dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(zero_mode)) - - if hasattr(parameters[0], IS_SEQUENCE_DATA_PARALLEL) and getattr(parameters[0], IS_SEQUENCE_DATA_PARALLEL): - dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.SEQUENCE)) + # if (hasattr(parameters[0], IS_REPLICA_ZERO_PARALLEL) and getattr(parameters[0], IS_REPLICA_ZERO_PARALLEL)) or ( + # hasattr(parameters[0], IS_WEIGHT_ZERO_PARALLEL) and getattr(parameters[0], IS_WEIGHT_ZERO_PARALLEL) + # ): + dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(zero_mode)) if torch.is_tensor(total_norm): total_norm = total_norm.item() diff --git a/internlm/train/utils.py b/internlm/train/utils.py index 0e866eb5..382c46d5 100644 --- a/internlm/train/utils.py +++ b/internlm/train/utils.py @@ -2,7 +2,7 @@ import torch -from internlm.core.context.parallel_context import IS_SEQUENCE_DATA_PARALLEL, ParallelMode +from internlm.core.context.parallel_context import IS_REPLICA_ZERO_PARALLEL, IS_SEQUENCE_DATA_PARALLEL, ParallelMode from internlm.core.context.parallel_context import global_context as gpc from internlm.model.utils import is_gate_param, is_moe_param, is_norm_param @@ -114,6 +114,7 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy # create new groups for IS_SEQUENCE_DATA_PARALLEL parameter group new_groups = {} new_groups["embed_head"] = {"name": "embed_head", "params": [], "optimizer_mode": ParallelMode.DATA} + # new_groups["layer_norm"] = {"name": "layer_norm", "params": [], "optimizer_mode": ParallelMode.ZERO1} for pgroup in param_groups: # copy attribute from origin group, we assume the input param_groups only @@ -127,6 +128,8 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy for param in pgroup["params"]: if hasattr(param, IS_SEQUENCE_DATA_PARALLEL) and getattr(param, IS_SEQUENCE_DATA_PARALLEL) is True: new_groups["embed_head"]["params"].append(param) + # elif hasattr(param, IS_REPLICA_ZERO_PARALLEL) and getattr(param, IS_REPLICA_ZERO_PARALLEL) is True: + # new_groups["layer_norm"]["params"].append(param) else: origin_params.append(param) @@ -139,6 +142,7 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy # print(f"ht debug params_groups after split default len:{len(param_groups[0]['params'])}", flush=True) # print(f"ht debug params_groups after split embed_head len:{len(param_groups[1]['params'])}", flush=True) + # print(f"ht debug params_groups after split layer_norm len:{len(param_groups[2]['params'])}", flush=True) return tuple(param_groups) diff --git a/internlm/utils/parallel.py b/internlm/utils/parallel.py index 2b421b07..966332a1 100644 --- a/internlm/utils/parallel.py +++ b/internlm/utils/parallel.py @@ -24,11 +24,19 @@ def is_replica_zero_parallel_parameter(p): def is_sequence_data_parallel_parameter(p): - return hasattr(p, IS_SEQUENCE_DATA_PARALLEL) and getattr(p, IS_SEQUENCE_DATA_PARALLEL) + return ( + gpc.is_initialized(ParallelMode.SEQUENCE) + and hasattr(p, IS_SEQUENCE_DATA_PARALLEL) + and getattr(p, IS_SEQUENCE_DATA_PARALLEL) + ) def is_weight_zero_parallel_parameter(p): - return hasattr(p, IS_WEIGHT_ZERO_PARALLEL) and getattr(p, IS_WEIGHT_ZERO_PARALLEL) + return ( + 
gpc.is_initialized(ParallelMode.WEIGHT) + and hasattr(p, IS_WEIGHT_ZERO_PARALLEL) + and getattr(p, IS_WEIGHT_ZERO_PARALLEL) + ) def sync_model_param(model): From 76be8c26534f1cf18e3670f76d3f7053d3037fd8 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Thu, 14 Dec 2023 14:53:40 +0800 Subject: [PATCH 085/153] fix(model/utils.py): fix fstp linear reduce scatter sum->avg --- internlm/model/utils.py | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 04fa0efe..a4fe3378 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -162,19 +162,23 @@ def linear_bias_wgrad_torch(my_input, grad_output, has_d_bias): return grad_weight, grad_bias -def reduce_scatter_raw(input_: Tensor, process_group: ProcessGroup, async_op: bool = False): +def reduce_scatter_raw( + input_: Tensor, process_group: ProcessGroup, op=torch.distributed.ReduceOp.SUM, async_op: bool = False +): world_size = torch.distributed.get_world_size(process_group) assert input_.shape[0] % world_size == 0 output = torch.empty( input_.shape[0] // world_size, *input_.shape[1:], dtype=input_.dtype, device=input_.device ).contiguous() handle = torch.distributed.reduce_scatter_tensor( - output, input_.contiguous(), group=process_group, async_op=async_op + output, input_.contiguous(), op=op, group=process_group, async_op=async_op ) return output, handle -def reduce_scatter_raw_memory_pool(input_: Tensor, process_group: ProcessGroup, async_op: bool = False): +def reduce_scatter_raw_memory_pool( + input_: Tensor, process_group: ProcessGroup, op=torch.distributed.ReduceOp.SUM, async_op: bool = False +): world_size = torch.distributed.get_world_size(process_group) assert input_.shape[0] % world_size == 0 if gpc.fstp_handler.enable_memory_pool: @@ -185,7 +189,7 @@ def reduce_scatter_raw_memory_pool(input_: Tensor, process_group: ProcessGroup, input_.shape[0] // world_size, *input_.shape[1:], dtype=input_.dtype, device=input_.device ).contiguous() handle = torch.distributed.reduce_scatter_tensor( - output, input_.contiguous(), group=process_group, async_op=async_op + output, input_.contiguous(), op=op, group=process_group, async_op=async_op ) return output, handle @@ -575,7 +579,7 @@ def backward(ctx, grad_output, *args): if world_size > 1: if overlap_handler is not None: grad_weight_async, handle_grad_weight = reduce_scatter_raw_memory_pool( - grad_weight, process_group, async_op=True + grad_weight, process_group, op=torch.distributed.ReduceOp.AVG, async_op=True ) assert hasattr(weight, "_fstp_reduce_scatter_str") overlap_handler.reduce_scatter_handlers[weight._fstp_reduce_scatter_str] = ( @@ -592,7 +596,7 @@ def backward(ctx, grad_output, *args): ) if grad_bias is not None: grad_bias_async, handle_grad_bias = reduce_scatter_raw_memory_pool( - grad_bias, process_group, async_op=True + grad_bias, process_group, op=torch.distributed.ReduceOp.AVG, async_op=True ) assert hasattr(bias, "_fstp_reduce_scatter_str") overlap_handler.reduce_scatter_handlers[bias._fstp_reduce_scatter_str] = ( @@ -608,9 +612,13 @@ def backward(ctx, grad_output, *args): device=grad_bias.device, ) else: - grad_weight, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) + grad_weight, handle_grad_weight = reduce_scatter_raw( + grad_weight, process_group, op=torch.distributed.ReduceOp.AVG, async_op=True + ) if grad_bias is not None: - grad_bias, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, 
async_op=True) + grad_bias, handle_grad_bias = reduce_scatter_raw( + grad_bias, process_group, op=torch.distributed.ReduceOp.AVG, async_op=True + ) else: grad_weight = None grad_bias = grad_output if ctx.needs_input_grad[2] else None @@ -676,7 +684,7 @@ def backward(ctx, grad_output, *args): if world_size > 1: if overlap_handler is not None: grad_weight_async, handle_grad_weight = reduce_scatter_raw_memory_pool( - grad_weight, process_group, async_op=True + grad_weight, process_group, op=torch.distributed.ReduceOp.AVG, async_op=True ) assert hasattr(weight, "_fstp_reduce_scatter_str") overlap_handler.reduce_scatter_handlers[weight._fstp_reduce_scatter_str] = ( @@ -693,7 +701,7 @@ def backward(ctx, grad_output, *args): ) if grad_bias is not None: grad_bias_async, handle_grad_bias = reduce_scatter_raw_memory_pool( - grad_bias, process_group, async_op=True + grad_bias, process_group, op=torch.distributed.ReduceOp.AVG, async_op=True ) assert hasattr(bias, "_fstp_reduce_scatter_str") overlap_handler.reduce_scatter_handlers[bias._fstp_reduce_scatter_str] = ( @@ -709,9 +717,13 @@ def backward(ctx, grad_output, *args): device=grad_bias.device, ) else: - grad_weight, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True) + grad_weight, handle_grad_weight = reduce_scatter_raw( + grad_weight, process_group, op=torch.distributed.ReduceOp.AVG, async_op=True + ) if grad_bias is not None: - grad_bias, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True) + grad_bias, handle_grad_bias = reduce_scatter_raw( + grad_bias, process_group, op=torch.distributed.ReduceOp.AVG, async_op=True + ) else: grad_weight = None grad_bias = grad_output if ctx.needs_input_grad[2] else None From d30aecddbc0401e37eacd3f5346ba6456275948f Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Tue, 19 Dec 2023 16:36:10 +0800 Subject: [PATCH 086/153] feat(core/context): support pp for initializing isp/msp/fsp process group --- configs/7B_sft.py | 20 +- internlm/core/context/__init__.py | 2 - internlm/core/context/parallel_context.py | 31 +- .../core/context/process_group_initializer.py | 479 ++++++++++-------- internlm/initialize/launch.py | 35 +- train.py | 28 + 6 files changed, 348 insertions(+), 247 deletions(-) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 3c491660..822bcb52 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -152,21 +152,25 @@ 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. tensor parallel (dict): 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. + 2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'], + defaults to 'mtp', means the pure megatron tensor parallel without sequence parallel. + msp: megatron tensor parallel with sequence parallel, sequence parallel size = tensor parallel size. + fsp: tensor parallel by flash-attn with sequence parallel, sequence parallel size = tensor parallel size. + isp: customed intern sequence parallel without tensor parallel, can be used with weight parallel. pipeline parallel (dict): 1. size: int, the size of pipeline parallel. 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, defaults to False. +weight parallel (dict): + 1. size: int, the size of weight parallel. + 2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False. + 3. memory_pool: bool, enable/disable memory pool, defaults to False. """ parallel = dict( zero1=dict(size=2, fsdp=False), - tensor=dict(size=1, sp="intern", intern_overlap=False, memory_pool=False), - pipeline=dict(size=1, interleaved_overlap=True), - weight=dict(size=8, overlap=True, memory_pool=True), - sequence=4, + tensor=dict(size=4, mode="mtp"), + pipeline=dict(size=2, interleaved_overlap=True), + weight=dict(size=1, overlap=True, memory_pool=True), ) cudnn_deterministic = False diff --git a/internlm/core/context/__init__.py b/internlm/core/context/__init__.py index f62d6a90..5382837e 100644 --- a/internlm/core/context/__init__.py +++ b/internlm/core/context/__init__.py @@ -10,7 +10,6 @@ ) from .process_group_initializer import ( Initializer_Data, - Initializer_Model, Initializer_Nettest, Initializer_Pipeline, Initializer_Tensor, @@ -44,7 +43,6 @@ "Initializer_Nettest", "Initializer_Zero3_dp", "ProcessGroupInitializer", - "Initializer_Model", "seed", "set_mode", "add_seed", diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py index c2fc574d..538d3947 100644 --- a/internlm/core/context/parallel_context.py +++ b/internlm/core/context/parallel_context.py @@ -476,18 +476,24 @@ def init_parallel_groups(self): parallel_config = self.config.get("parallel", None) if parallel_config is not None: self._set_parallel_size_from_config(parallel_config, "weight", "weight_parallel_size") - self._set_parallel_size_from_config(parallel_config, "sequence", "sequence_parallel_size") - self._set_parallel_size_from_config(parallel_config, "pipeline", "pipeline_parallel_size") self._set_parallel_size_from_config(parallel_config, "tensor", "tensor_parallel_size") + self._set_parallel_size_from_config(parallel_config, "pipeline", "pipeline_parallel_size") self._set_parallel_size_from_config(parallel_config, "zero1", "zero1_parallel_size") # the user should not set the data parallel size manually # instead, it should be calculated based on other parallel config - assert self.tensor_parallel_size == 1 - assert self.pipeline_parallel_size == 1 assert self.zero1_parallel_size >= 1 - self.data_parallel_size = self.world_size // self.sequence_parallel_size - self.weight_data_parallel_size = self.world_size // self.weight_parallel_size + self.sequence_parallel_size = self.tensor_parallel_size + self.data_parallel_size = self.world_size // self.pipeline_parallel_size // self.sequence_parallel_size + self.weight_data_parallel_size = self.world_size // self.pipeline_parallel_size // self.weight_parallel_size + if parallel_config["tensor"]["mode"] != "isp": + assert ( + self.zero1_parallel_size <= self.data_parallel_size + ), f"zero1_size:{self.zero1_parallel_size} should be less than dp_size:{self.data_parallel_size}" + else: + assert ( + self.zero1_parallel_size <= self.weight_data_parallel_size + ), f"zero1_size:{self.zero1_parallel_size} should be less than wdp_size:{self.weight_data_parallel_size}" # the recommended nettest_parallel_size is 32 GPUs self.nettest_parallel_size = 32 @@ -508,6 +514,7 @@ def init_parallel_groups(self): rank, world_size, self.weight_parallel_size, + self.weight_data_parallel_size, self.sequence_parallel_size, self.data_parallel_size, 
self.pipeline_parallel_size, @@ -520,12 +527,16 @@ def init_parallel_groups(self): # run initialization of different process groups initializers = [] initializers.append(pgroup_initializer.Initializer_Weight(*initializer_args)) - initializers.append(pgroup_initializer.Initializer_Sequence(*initializer_args)) + if parallel_config["tensor"]["mode"] == "isp": + initializers.append(pgroup_initializer.Initializer_Weight_Data(*initializer_args)) initializers.append(pgroup_initializer.Initializer_Data(*initializer_args)) - initializers.append(pgroup_initializer.Initializer_Weight_Data(*initializer_args)) - initializers.append(pgroup_initializer.Initializer_Model(*initializer_args)) + # if self.weight_parallel_size <= 1: + # initializers.append(pgroup_initializer.Initializer_Model(*initializer_args)) initializers.append(pgroup_initializer.Initializer_Tensor(*initializer_args)) - initializers.append(pgroup_initializer.Initializer_Zero1(*initializer_args)) + if parallel_config["tensor"]["mode"] != "isp": + initializers.append(pgroup_initializer.Initializer_Zero1(*initializer_args)) + else: + initializers.append(pgroup_initializer.Initializer_Zero1_ISP(*initializer_args)) if isinstance(self.config.parallel.zero1, dict) and self.config.parallel.zero1.get("fsdp", False): initializers.append(pgroup_initializer.Initializer_Zero3_dp(*initializer_args)) initializers.append(pgroup_initializer.Initializer_Nettest(*initializer_args)) diff --git a/internlm/core/context/process_group_initializer.py b/internlm/core/context/process_group_initializer.py index ee81ac58..5e59df22 100644 --- a/internlm/core/context/process_group_initializer.py +++ b/internlm/core/context/process_group_initializer.py @@ -67,10 +67,14 @@ class ProcessGroupInitializer(ABC): Args: rank (int): The rank of current process. world_size (int): Size of whole communication world. + weight_parallel_size (int): Size of model weight parallel. + weight_data_parallel_size (int): Size of data parallel for common weight. + sequence_parallel_size (int): Size of data sequence parallel. data_parallel_size (int): Size of data parallel. pipeline_parallel_size (int): Size of pipeline parallel. tensor_parallel_size (int): Size of tensor parallel. zero1_parallel_size (int): Size of zero1 parallel. + nettest_parallel_size (int): Size of net testing parallel. expert_parallel_size (int): Size of expert parallel. """ @@ -79,6 +83,7 @@ def __init__( rank: int, world_size: int, weight_parallel_size: int, + weight_data_parallel_size: int, sequence_parallel_size: int, data_parallel_size: int, pipeline_parallel_size: int, @@ -90,6 +95,7 @@ def __init__( self.rank = rank self.world_size = world_size self.weight_parallel_size = weight_parallel_size + self.weight_data_parallel_size = weight_data_parallel_size self.sequence_parallel_size = sequence_parallel_size self.data_parallel_size = data_parallel_size self.pipeline_parallel_size = pipeline_parallel_size @@ -97,6 +103,8 @@ def __init__( self.zero1_parallel_size = zero1_parallel_size self.nettest_parallel_size = nettest_parallel_size self.expert_parallel_size = expert_parallel_size + + assert sequence_parallel_size == tensor_parallel_size super().__init__() @abstractmethod @@ -104,41 +112,50 @@ def init_dist_group(self, use_cpu: bool = False): pass -# class Initializer_Data(ProcessGroupInitializer): -# """A ProcessGroupInitializer for data parallelism. 
+# class Initializer_Model(ProcessGroupInitializer): +# """A ProcessGroupInitializer for model parallelism (model parallel group contains pipeline and tensor parallel +# groups). # Args: # rank (int): The rank of current process. # world_size (int): Size of whole communication world. +# weight_parallel_size (int): Size of model weight parallel. +# weight_data_parallel_size (int): Size of data parallel for common weight. +# sequence_parallel_size (int): Size of data sequence parallel. # data_parallel_size (int): Size of data parallel. # pipeline_parallel_size (int): Size of pipeline parallel. # tensor_parallel_size (int): Size of tensor parallel. # zero1_parallel_size (int): Size of zero1 parallel. +# nettest_parallel_size (int): Size of net testing parallel. # expert_parallel_size (int): Size of expert parallel. # """ # def __init__(self, *args, **kwargs): # super().__init__(*args, **kwargs) -# self.rank_num_per_dp_group = self.world_size // self.data_parallel_size -# assert self.world_size % self.data_parallel_size == 0 +# # only for msp or fsp +# assert self.weight_parallel_size == 1 +# self.rank_num_per_group = self.tensor_parallel_size * self.pipeline_parallel_size +# self.num_group = self.world_size // self.rank_num_per_group + +# assert self.world_size % self.rank_num_per_group == 0 # def init_dist_group(self, use_cpu: bool = False): -# """Initialize data parallel groups, and assign local_ranks and groups to each gpu. +# """Initialize model parallel groups, and assign local_ranks and groups to each gpu. # Returns: # Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode): -# A Data parallelism's information tuple. +# A Model parallelism's information tuple. # """ # local_rank = None # ranks_in_group = None # process_group = None # cpu_group = None # group_world_size = None -# mode = ParallelMode.DATA +# mode = ParallelMode.MODEL -# for i in range(self.rank_num_per_dp_group): -# ranks = [i + j * self.rank_num_per_dp_group for j in range(self.data_parallel_size)] +# for i in range(self.num_group): +# ranks = [i * self.rank_num_per_group + j for j in range(self.rank_num_per_group)] # group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT) # if use_cpu: # group_cpu = ( @@ -159,138 +176,92 @@ def init_dist_group(self, use_cpu: bool = False): # return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode -class Initializer_Model(ProcessGroupInitializer): - """A ProcessGroupInitializer for model parallelism (model parallel group contains pipeline and tensor parallel - groups). +class Initializer_Pipeline(ProcessGroupInitializer): + """A ProcessGroupInitializer for pipeline parallelism. Args: rank (int): The rank of current process. world_size (int): Size of whole communication world. + weight_parallel_size (int): Size of model weight parallel. + weight_data_parallel_size (int): Size of data parallel for common weight. + sequence_parallel_size (int): Size of data sequence parallel. data_parallel_size (int): Size of data parallel. pipeline_parallel_size (int): Size of pipeline parallel. tensor_parallel_size (int): Size of tensor parallel. zero1_parallel_size (int): Size of zero1 parallel. + nettest_parallel_size (int): Size of net testing parallel. expert_parallel_size (int): Size of expert parallel. 
""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.rank_num_per_group = self.tensor_parallel_size * self.pipeline_parallel_size - self.num_group = self.world_size // self.rank_num_per_group + self.num_pp_group = self.world_size // self.pipeline_parallel_size - assert self.world_size % self.rank_num_per_group == 0 + assert self.world_size % self.pipeline_parallel_size == 0 def init_dist_group(self, use_cpu: bool = False): - """Initialize model parallel groups, and assign local_ranks and groups to each gpu. + """Initialize pipeline parallel groups, and assign local_ranks and groups to each gpu. Returns: - Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode): - A Model parallelism's information tuple. + List[Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode)]: + A Pipeline parallelism's information in list of tuples. + + n=16 tp/sp=4 pp=2 dp=2 wp=8 + wp grops: [0-7] [8-15] + data groups: [0,4] [1,5] [2,6] [3,7] + [8,12] [9,13] [10,14] [11,15] + pp groups: [0,8] [1,9] [2,10] [3,11] [4,12] [5,13] [6,14] [7,15] + + n=16 tp/sp=4 pp=2 dp=2 wp=2 + wp grops: [0-1] [2-3] [4-5] [6-7] [8-9] [10-11] [12-13] [14-15] + data groups: [0,4] [1,5] [2,6] [3,7] + [8,12] [9,13] [10,14] [11,15] + pp groups: [0,8] [1,9] [2,10] [3,11] [4,12] [5,13] [6,14] [7,15] """ local_rank = None ranks_in_group = None process_group = None cpu_group = None group_world_size = None - mode = ParallelMode.MODEL + mode = ParallelMode.PIPELINE - for i in range(self.num_group): - ranks = [i * self.rank_num_per_group + j for j in range(self.rank_num_per_group)] - group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT) + for i in range(self.num_pp_group): + ranks = [i + j * self.num_pp_group for j in range(self.pipeline_parallel_size)] + pipe_group_size = len(ranks) + pipe_group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT) if use_cpu: group_cpu = ( dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT) if dist.get_backend() != "gloo" - else group + else pipe_group ) else: group_cpu = None if self.rank in ranks: local_rank = ranks.index(self.rank) - group_world_size = len(ranks) - process_group = group + group_world_size = pipe_group_size + process_group = pipe_group cpu_group = group_cpu ranks_in_group = ranks return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode -class Initializer_Pipeline(ProcessGroupInitializer): - """A ProcessGroupInitializer for pipeline parallelism. - - Args: - rank (int): The rank of current process - world_size (int): Size of whole communication world - data_parallel_size (int): Size of data parallel - pipeline_parallel_size (int): Size of pipeline parallel - tensor_parallel_size (int): Size of tensor parallel - zero1_parallel_size (int): Size of zero1 parallel. - expert_parallel_size (int): Size of expert parallel. - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.rank_num_per_dp_group = self.world_size // self.data_parallel_size - self.pipeline_stage_size = self.rank_num_per_dp_group // self.pipeline_parallel_size - - assert self.world_size % self.data_parallel_size == 0 - assert self.rank_num_per_dp_group % self.pipeline_parallel_size == 0 - - def init_dist_group(self, use_cpu: bool = False): - """Initialize pipeline parallel groups, and assign local_ranks and groups to each gpu. - - Returns: - List[Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode)]: - A Pipeline parallelism's information in list of tuples. 
- """ - local_rank = None - ranks_in_group = None - process_group = None - cpu_group = None - group_world_size = None - mode = ParallelMode.PIPELINE - - for i in range(self.data_parallel_size): - for j in range(self.pipeline_stage_size): - ranks = list( - range( - i * self.rank_num_per_dp_group + j, - (i + 1) * self.rank_num_per_dp_group, - self.pipeline_stage_size, - ) - ) - pipe_group_size = len(ranks) - pipe_group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT) - if use_cpu: - group_cpu = ( - dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT) - if dist.get_backend() != "gloo" - else pipe_group - ) - else: - group_cpu = None - - if self.rank in ranks: - local_rank = ranks.index(self.rank) - group_world_size = pipe_group_size - process_group = pipe_group - cpu_group = group_cpu - ranks_in_group = ranks - - return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode - - class Initializer_Tensor(ProcessGroupInitializer): """A ProcessGroupInitializer for tensor parallelism. Args: rank (int): The rank of current process. world_size (int): Size of whole communication world. + weight_parallel_size (int): Size of model weight parallel. + weight_data_parallel_size (int): Size of data parallel for common weight. + sequence_parallel_size (int): Size of data sequence parallel. data_parallel_size (int): Size of data parallel. pipeline_parallel_size (int): Size of pipeline parallel. tensor_parallel_size (int): Size of tensor parallel. zero1_parallel_size (int): Size of zero1 parallel. + nettest_parallel_size (int): Size of net testing parallel. expert_parallel_size (int): Size of expert parallel. """ @@ -343,21 +314,106 @@ class Initializer_Zero1(ProcessGroupInitializer): rank (int): The rank of current process. world_size (int): Size of whole communication world. weight_parallel_size (int): Size of model weight parallel. + weight_data_parallel_size (int): Size of data parallel for common weight. sequence_parallel_size (int): Size of data sequence parallel. data_parallel_size (int): Size of data parallel. pipeline_parallel_size (int): Size of pipeline parallel. tensor_parallel_size (int): Size of tensor parallel. - zero1_parallel_size (int): Size of zero-1 parallel. + zero1_parallel_size (int): Size of zero1 parallel. + nettest_parallel_size (int): Size of net testing parallel. + expert_parallel_size (int): Size of expert parallel. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.tensor_zero1_size = self.tensor_parallel_size * self.zero1_parallel_size + self.ranks_num_per_pp = self.world_size // self.pipeline_parallel_size + self.num_tensor_zero1_parallel_group = self.ranks_num_per_pp // self.tensor_zero1_size + + assert self.world_size % (self.tensor_parallel_size * self.zero1_parallel_size) == 0 + assert self.world_size % self.pipeline_parallel_size == 0 + + def init_dist_group(self, use_cpu: bool = False): + """Initialize zero1 parallel groups, and assign local_ranks and groups to each gpu. + + Returns: + Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode): + A zero1 parallelism's information tuple. 
+ + n=16 tp/sp=4 pp=2 dp=2 zero1=2 + tp/sp grops: [0-3] [4-7] [8-11] [12-15] + data groups: [0,4] [1,5] [2,6] [3,7] + [8,12] [9,13] [10,14] [11,15] + pp groups: [0,8] [1,9] [2,10] [3,11] [4,12] [5,13] [6,14] [7,15] + zero1 groups: [0,4] [1,5] [2,6] [3,7] + [8,12] [9,13] [10,14] [11,15] + + n=16 tp/sp=2 pp=2 dp=4 zero1=2 + tp/sp grops: [0-1] [2-3] [4-5] [6-7] [8-9] [10-11] [12-13] [14-15] + data groups: [0,2,4,6] [1,3,5,7] + [8,10,12,14] [9,11,13,15] + pp groups: [0,8] [1,9] [2,10] [3,11] [4,12] [5,13] [6,14] [7,15] + zero1 groups: [0,2] [1,3] [4,6] [5,7] + [8,10] [9,11] [12,14] [13,15] + """ + local_rank = None + ranks_in_group = None + process_group = None + cpu_group = None + group_world_size = None + mode = ParallelMode.ZERO1 + + for i in range(self.pipeline_parallel_size): + for j in range(self.num_tensor_zero1_parallel_group): + for k in range(self.tensor_parallel_size): + ranks = [ + i * self.ranks_num_per_pp + j * self.tensor_zero1_size + k + m * self.tensor_parallel_size + for m in range(self.zero1_parallel_size) + ] + group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT) + if use_cpu: + group_cpu = ( + dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT) + if dist.get_backend() != "gloo" + else group + ) + else: + group_cpu = None + + if self.rank in ranks: + local_rank = ranks.index(self.rank) + group_world_size = len(ranks) + process_group = group + cpu_group = group_cpu + ranks_in_group = ranks + + return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode + + +class Initializer_Zero1_ISP(ProcessGroupInitializer): + """A ProcessGroupInitializer for zero-1 parallelism. + + Args: + rank (int): The rank of current process. + world_size (int): Size of whole communication world. + weight_parallel_size (int): Size of model weight parallel. + weight_data_parallel_size (int): Size of data parallel for common weight. + sequence_parallel_size (int): Size of data sequence parallel. + data_parallel_size (int): Size of data parallel. + pipeline_parallel_size (int): Size of pipeline parallel. + tensor_parallel_size (int): Size of tensor parallel. + zero1_parallel_size (int): Size of zero1 parallel. + nettest_parallel_size (int): Size of net testing parallel. expert_parallel_size (int): Size of expert parallel. """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.num_zero1_parallel_group = self.world_size // self.zero1_parallel_size self.weight_zero1_size = self.weight_parallel_size * self.zero1_parallel_size - self.num_weight_zero1_parallel_group = self.world_size // self.weight_zero1_size + self.ranks_num_per_pp = self.world_size // self.pipeline_parallel_size + self.num_weight_zero1_parallel_group = self.ranks_num_per_pp // self.weight_zero1_size - assert self.world_size % self.zero1_parallel_size == 0 + assert self.world_size % (self.pipeline_parallel_size * self.zero1_parallel_size) == 0 assert self.world_size % self.weight_zero1_size == 0 def init_dist_group(self, use_cpu: bool = False): @@ -371,6 +427,23 @@ def init_dist_group(self, use_cpu: bool = False): wp grops: [0-7] [8-15] [16-23] [24-31] zo1 groups: [0,8] [1,9] [2,10] [3,11] [4,12] [5,13] [6,14] [7,15] [16,24] [17,25] [18,26] [19,27] [20,28] [21,29] [22,30] [23,31] + + n=16 tp/sp=4 pp=2 dp=2 wp=8 wdp=1 zero1=1 + wp grops: [0-7] [8-15] + data groups: [0,4] [1,5] [2,6] [3,7] + [8,12] [9,13] [10,14] [11,15] + wdp groups: [...] 
+ + n=16 tp/sp=4 pp=2 dp=2 wp=2 wdp=4 zero1=2 + wp grops: [0-1] [2-3] [4-5] [6-7] [8-9] [10-11] [12-13] [14-15] + data groups: [0,4] [1,5] [2,6] [3,7] + [8,12] [9,13] [10,14] [11,15] + pp groups: [0,8] [1,9] [2,10] [3,11] [4,12] [5,13] [6,14] [7,15] + wdp groups: [0,2,4,6] [1,3,5,7] + [8,10,12,14] [9,11,13,15] + zero1 groups: [0,2] [1,3] [4,6] [5,7] + [8,10] [9,11] [12,14] [13,15] + zero1=4: [0,2,4,6] [1,3,5,7] [8,10,12,14] [9,11,13,15] """ local_rank = None ranks_in_group = None @@ -379,28 +452,29 @@ def init_dist_group(self, use_cpu: bool = False): group_world_size = None mode = ParallelMode.ZERO1 - for i in range(self.num_weight_zero1_parallel_group): - for j in range(self.weight_parallel_size): - ranks = [ - i * self.weight_zero1_size + j + k * self.weight_parallel_size - for k in range(self.zero1_parallel_size) - ] - group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT) - if use_cpu: - group_cpu = ( - dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT) - if dist.get_backend() != "gloo" - else group - ) - else: - group_cpu = None - - if self.rank in ranks: - local_rank = ranks.index(self.rank) - group_world_size = len(ranks) - process_group = group - cpu_group = group_cpu - ranks_in_group = ranks + for i in range(self.pipeline_parallel_size): + for j in range(self.num_weight_zero1_parallel_group): + for k in range(self.weight_parallel_size): + ranks = [ + i * self.ranks_num_per_pp + j * self.weight_zero1_size + k + m * self.weight_parallel_size + for m in range(self.zero1_parallel_size) + ] + group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT) + if use_cpu: + group_cpu = ( + dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT) + if dist.get_backend() != "gloo" + else group + ) + else: + group_cpu = None + + if self.rank in ranks: + local_rank = ranks.index(self.rank) + group_world_size = len(ranks) + process_group = group + cpu_group = group_cpu + ranks_in_group = ranks return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode @@ -688,11 +762,13 @@ class Initializer_Weight(ProcessGroupInitializer): rank (int): The rank of current process. world_size (int): Size of whole communication world. weight_parallel_size (int): Size of model weight parallel. + weight_data_parallel_size (int): Size of data parallel for common weight. sequence_parallel_size (int): Size of data sequence parallel. data_parallel_size (int): Size of data parallel. pipeline_parallel_size (int): Size of pipeline parallel. tensor_parallel_size (int): Size of tensor parallel. zero1_parallel_size (int): Size of zero1 parallel. + nettest_parallel_size (int): Size of net testing parallel. expert_parallel_size (int): Size of expert parallel. """ @@ -738,63 +814,6 @@ def init_dist_group(self, use_cpu: bool = False): return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode -class Initializer_Sequence(ProcessGroupInitializer): - """A ProcessGroupInitializer for data sequence parallelism. - - Args: - rank (int): The rank of current process. - world_size (int): Size of whole communication world. - weight_parallel_size (int): Size of model weight parallel. - sequence_parallel_size (int): Size of data sequence parallel. - data_parallel_size (int): Size of data parallel. - pipeline_parallel_size (int): Size of pipeline parallel. - tensor_parallel_size (int): Size of tensor parallel. - zero1_parallel_size (int): Size of zero1 parallel. - expert_parallel_size (int): Size of expert parallel. 
- """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.num_sequence_parallel_group = self.world_size // self.sequence_parallel_size - - assert self.world_size % self.sequence_parallel_size == 0 - - def init_dist_group(self, use_cpu: bool = False): - """Initialize data sequence parallel groups, and assign local_ranks and groups to each gpu. - - Returns: - Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode): - A Sequence parallelism's information tuple. - """ - local_rank = None - ranks_in_group = None - process_group = None - cpu_group = None - group_world_size = None - mode = ParallelMode.SEQUENCE - - for i in range(self.num_sequence_parallel_group): - ranks = [i * self.sequence_parallel_size + j for j in range(self.sequence_parallel_size)] - group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT) - if use_cpu: - group_cpu = ( - dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT) - if dist.get_backend() != "gloo" - else group - ) - else: - group_cpu = None - - if self.rank in ranks: - local_rank = ranks.index(self.rank) - group_world_size = len(ranks) - process_group = group - cpu_group = group_cpu - ranks_in_group = ranks - - return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode - - class Initializer_Data(ProcessGroupInitializer): """A ProcessGroupInitializer for data parallelism. @@ -802,20 +821,24 @@ class Initializer_Data(ProcessGroupInitializer): rank (int): The rank of current process. world_size (int): Size of whole communication world. weight_parallel_size (int): Size of model weight parallel. + weight_data_parallel_size (int): Size of data parallel for common weight. sequence_parallel_size (int): Size of data sequence parallel. data_parallel_size (int): Size of data parallel. pipeline_parallel_size (int): Size of pipeline parallel. tensor_parallel_size (int): Size of tensor parallel. zero1_parallel_size (int): Size of zero1 parallel. + nettest_parallel_size (int): Size of net testing parallel. expert_parallel_size (int): Size of expert parallel. """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.num_dp_group = self.sequence_parallel_size + self.num_dp_group = self.pipeline_parallel_size * self.sequence_parallel_size + self.ranks_num_per_pp = self.world_size // self.pipeline_parallel_size assert self.world_size % self.data_parallel_size == 0 assert self.world_size % self.sequence_parallel_size == 0 + assert self.world_size % self.pipeline_parallel_size == 0 def init_dist_group(self, use_cpu: bool = False): """Initialize data parallel groups, and assign local_ranks and groups to each gpu. @@ -824,9 +847,10 @@ def init_dist_group(self, use_cpu: bool = False): Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode): A Data parallelism's information tuple. 
-        n=32 wp=8 sp=4 zo1=2
-        wp grops: [0-7] [8-15] [16-23] [24-31]
-        data groups: [0,4,8,12,16,20,24,28] [1,5,9,13,17,21,25,29] [2,6,10,14,18,22,26,30] [3,7,11,15,19,23,27,31]
+        n=16 tp/sp=4 pp=2 dp=2 wp=8
+        wp groups: [0-7] [8-15]
+        data groups: [0,4] [1,5] [2,6] [3,7]
+                     [8,12] [9,13] [10,14] [11,15]
         """
         local_rank = None
         ranks_in_group = None
@@ -835,24 +859,28 @@ def init_dist_group(self, use_cpu: bool = False):
         group_world_size = None
         mode = ParallelMode.DATA

-        for i in range(self.num_dp_group):
-            ranks = [i + j * self.sequence_parallel_size for j in range(self.data_parallel_size)]
-            group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT)
-            if use_cpu:
-                group_cpu = (
-                    dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT)
-                    if dist.get_backend() != "gloo"
-                    else group
-                )
-            else:
-                group_cpu = None
+        for i in range(self.pipeline_parallel_size):
+            for j in range(self.sequence_parallel_size):
+                ranks = [
+                    i * self.ranks_num_per_pp + j + k * self.sequence_parallel_size
+                    for k in range(self.data_parallel_size)
+                ]
+                group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT)
+                if use_cpu:
+                    group_cpu = (
+                        dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT)
+                        if dist.get_backend() != "gloo"
+                        else group
+                    )
+                else:
+                    group_cpu = None

-            if self.rank in ranks:
-                local_rank = ranks.index(self.rank)
-                group_world_size = len(ranks)
-                process_group = group
-                cpu_group = group_cpu
-                ranks_in_group = ranks
+                if self.rank in ranks:
+                    local_rank = ranks.index(self.rank)
+                    group_world_size = len(ranks)
+                    process_group = group
+                    cpu_group = group_cpu
+                    ranks_in_group = ranks

         return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode

@@ -864,20 +892,23 @@ class Initializer_Weight_Data(ProcessGroupInitializer):
         rank (int): The rank of current process.
         world_size (int): Size of whole communication world.
         weight_parallel_size (int): Size of model weight parallel.
+        weight_data_parallel_size (int): Size of data parallel for common weight.
         sequence_parallel_size (int): Size of data sequence parallel.
         data_parallel_size (int): Size of data parallel.
         pipeline_parallel_size (int): Size of pipeline parallel.
         tensor_parallel_size (int): Size of tensor parallel.
         zero1_parallel_size (int): Size of zero1 parallel.
+        nettest_parallel_size (int): Size of net testing parallel.
         expert_parallel_size (int): Size of expert parallel.
     """

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.num_weight_dp_group = self.weight_parallel_size
-        self.weight_data_parallel_size = self.world_size // self.num_weight_dp_group
+        self.num_wdp_group_per_pp = self.world_size // self.pipeline_parallel_size // self.weight_data_parallel_size
+        self.ranks_num_per_pp = self.world_size // self.pipeline_parallel_size

-        assert self.world_size % self.weight_parallel_size == 0
+        assert self.world_size % self.pipeline_parallel_size == 0
+        assert self.world_size % (self.pipeline_parallel_size * self.weight_data_parallel_size) == 0

     def init_dist_group(self, use_cpu: bool = False):
         """Initialize weight's data parallel groups, and assign local_ranks and groups to each gpu.

         Returns:
             Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
                 A WEIGHT_DATA parallelism's information tuple.
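+
+        As a sketch, each wdp group below is enumerated by the loops that
+        follow, where i indexes the pp stage and j the offset inside it:
+
+            ranks = [
+                i * ranks_num_per_pp + j + k * weight_parallel_size
+                for k in range(weight_data_parallel_size)
+            ]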
-        n=32 wp=8 sp=4 zo1=2
+        n=32 wp=8 sp=4 zo1=2 with no pp
         wp grops: [0-7] [8-15] [16-23] [24-31]
         weight data groups: [0,8,16,24] [1,9,17,25] [2,10,18,26] [3,11,19,27]
                             [4,12,20,28] [5,13,21,29] [6,14,22,30] [7,15,23,31]
+
+        n=16 tp/sp=4 pp=2 dp=2 wp=8 wdp=1
+        wp groups: [0-7] [8-15]
+        data groups: [0,4] [1,5] [2,6] [3,7]
+                     [8,12] [9,13] [10,14] [11,15]
+        wdp groups: [...]
+
+        n=16 tp/sp=4 pp=2 dp=2 wp=2 wdp=4
+        wp groups: [0-1] [2-3] [4-5] [6-7] [8-9] [10-11] [12-13] [14-15]
+        data groups: [0,4] [1,5] [2,6] [3,7]
+                     [8,12] [9,13] [10,14] [11,15]
+        pp groups: [0,8] [1,9] [2,10] [3,11] [4,12] [5,13] [6,14] [7,15]
+        wdp groups: [0,2,4,6] [1,3,5,7]
+                    [8,10,12,14] [9,11,13,15]
         """
         local_rank = None
         ranks_in_group = None
@@ -898,23 +943,27 @@ def init_dist_group(self, use_cpu: bool = False):
         group_world_size = None
         mode = ParallelMode.WEIGHT_DATA

-        for i in range(self.num_weight_dp_group):
-            ranks = [i + j * self.weight_parallel_size for j in range(self.weight_data_parallel_size)]
-            group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT)
-            if use_cpu:
-                group_cpu = (
-                    dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT)
-                    if dist.get_backend() != "gloo"
-                    else group
-                )
-            else:
-                group_cpu = None
+        for i in range(self.pipeline_parallel_size):
+            for j in range(self.num_wdp_group_per_pp):
+                ranks = [
+                    i * self.ranks_num_per_pp + j + k * self.weight_parallel_size
+                    for k in range(self.weight_data_parallel_size)
+                ]
+                group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT)
+                if use_cpu:
+                    group_cpu = (
+                        dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT)
+                        if dist.get_backend() != "gloo"
+                        else group
+                    )
+                else:
+                    group_cpu = None

-            if self.rank in ranks:
-                local_rank = ranks.index(self.rank)
-                group_world_size = len(ranks)
-                process_group = group
-                cpu_group = group_cpu
-                ranks_in_group = ranks
+                if self.rank in ranks:
+                    local_rank = ranks.index(self.rank)
+                    group_world_size = len(ranks)
+                    process_group = group
+                    cpu_group = group_cpu
+                    ranks_in_group = ranks

         return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode

diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py
index 208af18f..af4c9698 100644
--- a/internlm/initialize/launch.py
+++ b/internlm/initialize/launch.py
@@ -79,6 +79,9 @@ def args_sanity_check():
     if "tensor" not in gpc.config.parallel:
         gpc.config.parallel._add_item("tensor", 1)

+    if "weight" not in gpc.config.parallel:
+        gpc.config.parallel._add_item("weight", dict(size=1, overlap=False, memory_pool=False))
+
     if isinstance(gpc.config.parallel.pipeline, int):
         pp = gpc.config.parallel.pipeline
     else:
@@ -307,22 +310,30 @@ def args_sanity_check():
         gpc.config.parallel.sequence_parallel is True and gpc.config.model.use_flash_attn is False
     ), "sequence parallel does not support use_flash_attn=False"

+    # set default value for tensor parallel
     if isinstance(gpc.config.parallel["tensor"], int):
-        gpc.config.parallel["tensor"] = dict(size=gpc.config.parallel["tensor"], sp="none", intern_overlap=False)
-    if gpc.config.parallel["tensor"].get("sp", None) is None:
-        gpc.config.parallel["tensor"]["sp"] = "none"
-    if gpc.config.parallel["tensor"].get("intern_overlap", None) is None:
-        gpc.config.parallel["tensor"]["intern_overlap"] = False
-    assert gpc.config.parallel["tensor"].get("sp", None) in [
-        "none",
-        "megatron",
-        "flash-attn",
-        "intern",
-    ], "invalid sp mode, only ['none', 'megatron', 'flash-attn', 'intern'] is supported"
+        gpc.config.parallel["tensor"] = dict(size=gpc.config.parallel["tensor"], mode="mtp")
+    if 
gpc.config.parallel["tensor"].get("mode", None) is None: + gpc.config.parallel["tensor"]["mode"] = "mtp" + assert gpc.config.parallel["tensor"].get("mode", None) in [ + "mtp", + "msp", + "fsp", + "isp", + ], "invalid tensor parallel mode, only ['mtp', 'msp', 'fsp', 'isp'] is supported" + # adapt to old version's sequence parallel config - if gpc.config.parallel["tensor"].get("sp", None) in ["megatron", "flash-attn", "intern"]: + if gpc.config.parallel["tensor"].get("mode", None) in ["msp", "fsp", "isp"]: gpc.config.parallel.sequence_parallel = True + # set default value for weight parallel + if gpc.config.parallel["weight"].get("overlap", None) is None: + gpc.config.parallel["weight"]["overlap"] = False + if gpc.config.parallel["weight"].get("memory_pool", None) is None: + gpc.config.parallel["weight"]["memory_pool"] = False + if gpc.config.parallel["tensor"]["mode"] != "isp": + assert gpc.config.parallel["weight"]["size"] <= 1, "weight parallel is only supported with isp" + # currently only interleaved pipeline scheduler with overlap can guarantee loss accuracy if hasattr(gpc.config.model, "num_chunks") and gpc.config.model.num_chunks > 1: assert ( diff --git a/train.py b/train.py index 996d7465..b64d3011 100644 --- a/train.py +++ b/train.py @@ -1,6 +1,7 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- +from pickle import FALSE import socket import time import traceback @@ -341,6 +342,33 @@ def main(args): initialize_distributed_env(config=args.config, launcher=args.launcher, master_port=args.port, seed=args.seed) assert hasattr(gpc, "config") and gpc.config is not None + print( + f"ht debug rank:{gpc.get_global_rank()} ranks_in_tp_group:{gpc.get_ranks_in_group(ParallelMode.TENSOR)}", + flush=True, + ) + print( + f"ht debug rank:{gpc.get_global_rank()} ranks_in_wp_group:{gpc.get_ranks_in_group(ParallelMode.WEIGHT)}", + flush=True, + ) + print( + f"ht debug rank:{gpc.get_global_rank()} ranks_in_dp_group:{gpc.get_ranks_in_group(ParallelMode.DATA)}", + flush=True, + ) + print( + f"ht debug rank:{gpc.get_global_rank()} ranks_in_pp_group:{gpc.get_ranks_in_group(ParallelMode.PIPELINE)}", + flush=True, + ) + # print( + # f"ht debug rank:{gpc.get_global_rank()} ranks_in_wdp_group:{gpc.get_ranks_in_group(ParallelMode.WEIGHT_DATA)}", + # flush=True, + # ) + print( + f"ht debug rank:{gpc.get_global_rank()} ranks_in_zero1_group:{gpc.get_ranks_in_group(ParallelMode.ZERO1)}", + flush=True, + ) + + assert False + # initialize monitor manager context with initialize_monitor_manager( job_name=gpc.config.JOB_NAME, alert_address=gpc.config.monitor.alert.feishu_alert_address From e9cd5210a18c8aa66e24d1baea7121313cc3b25d Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Wed, 20 Dec 2023 18:56:20 +0800 Subject: [PATCH 087/153] feat(model): refactor model and optimizer for msp/fsp/isp --- internlm/core/context/__init__.py | 11 +- internlm/core/context/parallel_context.py | 31 +- internlm/core/naive_amp.py | 2 +- internlm/core/scheduler/pipeline_scheduler.py | 3 +- internlm/data/batch_sampler.py | 2 +- internlm/initialize/initialize_trainer.py | 4 +- internlm/model/embedding.py | 8 +- internlm/model/linear.py | 16 +- internlm/model/loss.py | 2 +- internlm/model/modeling_internlm.py | 75 +- internlm/model/modeling_moe.py | 2 +- internlm/model/multi_head_attention.py | 21 +- internlm/solver/optimizer/__init__.py | 1 - .../solver/optimizer/hybrid_zero_optim.py | 91 +- .../solver/optimizer/hybrid_zero_optim2.py | 1018 ----------------- internlm/solver/optimizer/utils.py | 64 +- 
 internlm/train/training_internlm.py | 83 +-
 internlm/train/utils.py | 7 +-
 internlm/utils/evaluation.py | 6 +-
 internlm/utils/parallel.py | 78 +-
 tests/test_training/test_loss.py | 6 +-
 train.py | 29 +-
 22 files changed, 264 insertions(+), 1296 deletions(-)
 delete mode 100644 internlm/solver/optimizer/hybrid_zero_optim2.py

diff --git a/internlm/core/context/__init__.py b/internlm/core/context/__init__.py
index 5382837e..13da8f58 100644
--- a/internlm/core/context/__init__.py
+++ b/internlm/core/context/__init__.py
@@ -1,8 +1,7 @@
 from .parallel_context import (
-    IS_SEQUENCE_PARALLEL,
-    IS_TENSOR_PARALLEL,
+    IS_TENSOR_ZERO_PARALLEL,
+    IS_TENSOR_DATA_PARALLEL,
     IS_REPLICA_ZERO_PARALLEL,
-    IS_SEQUENCE_DATA_PARALLEL,
     IS_WEIGHT_ZERO_PARALLEL,
     Config,
     ParallelContext,
@@ -31,8 +30,10 @@
 __all__ = [
     "Config",
-    "IS_TENSOR_PARALLEL",
-    "IS_SEQUENCE_PARALLEL",
+    "IS_TENSOR_ZERO_PARALLEL",
+    "IS_TENSOR_DATA_PARALLEL",
+    "IS_REPLICA_ZERO_PARALLEL",
+    "IS_WEIGHT_ZERO_PARALLEL",
     "global_context",
     "ParallelContext",
     "ParallelMode",

diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py
index 538d3947..53416761 100644
--- a/internlm/core/context/parallel_context.py
+++ b/internlm/core/context/parallel_context.py
@@ -24,10 +24,12 @@
 from .process_group_initializer import ParallelMode
 from .random import add_seed, get_seeds, set_mode

-IS_TENSOR_PARALLEL = "is_tensor_parallel"
-IS_SEQUENCE_PARALLEL = "is_sequence_parallel"
+
 IS_REPLICA_ZERO_PARALLEL = "is_replica_zero_parallel"
-IS_SEQUENCE_DATA_PARALLEL = "is_sequence_data_parallel"
+# for isp, with optimizer split in dp group
+IS_TENSOR_DATA_PARALLEL = "is_tensor_data_parallel"
+# for mtp/msp/fsp, with optimizer split in zero1 group
+IS_TENSOR_ZERO_PARALLEL = "is_tensor_zero_parallel"
 IS_WEIGHT_ZERO_PARALLEL = "is_weight_zero_parallel"

 logger = get_logger(__file__)
@@ -249,30 +251,11 @@ def get_prev_global_rank(self, parallel_mode: ParallelMode):

         return ranks_in_group[(local_rank - 1) % world_size]

-    def is_using_dp(self):
-        """Returns a boolean value indicating whether the current device is initilized with
-        ParallelMode.DATA and its world_size is greater than 1.
+    def is_using_parallel_mode(self, parallel_mode):
+        """Returns a boolean value indicating whether the current device is initialized with
+        the given parallel_mode and its world_size is greater than 1.
         """
-        return self.is_initialized(ParallelMode.DATA) and self.get_world_size(ParallelMode.DATA) > 1
-
-    def is_using_tp(self):
-        """Returns a boolean value indicating whether the current device is initilized with
-        ParallelMode.TENSOR and its world_size is greater than 1.
-        """
-        return self.is_initialized(ParallelMode.TENSOR) and self.get_world_size(ParallelMode.TENSOR) > 1
-
-    def is_using_pp(self):
-        """Returns a boolean value indicating whether the current device is initilized with
-        ParallelMode.PIPELINE and its world_size is greater than 1.
-        """
-        return self.is_initialized(ParallelMode.PIPELINE) and self.get_world_size(ParallelMode.PIPELINE) > 1
-
-    def is_using_sequence(self):
-        """Returns a boolean value indicating whether the current device is initilized with
-        ParallelMode.SEQUENCE and its world_size is greater than 1.
- """ - return False - # return gpc.is_initialized(ParallelMode.SEQUENCE) and gpc.get_world_size(ParallelMode.SEQUENCE) > 1 + return self.is_initialized(parallel_mode) and self.get_world_size(parallel_mode) > 1 def is_first_rank(self, parallel_mode: ParallelMode): """Returns a boolean value indicating whether the current device is the first one diff --git a/internlm/core/naive_amp.py b/internlm/core/naive_amp.py index fb04759b..ffd413b0 100644 --- a/internlm/core/naive_amp.py +++ b/internlm/core/naive_amp.py @@ -51,7 +51,7 @@ def __init__( self._sync_buf = sync_buffer self.dtype = dtype - if gpc.is_initialized(parallel_mode) and gpc.get_world_size(parallel_mode) > 1: + if gpc.is_using_parallel_mode(parallel_mode): self._process_group = gpc.get_group(parallel_mode) self._world_size = gpc.get_world_size(parallel_mode) else: diff --git a/internlm/core/scheduler/pipeline_scheduler.py b/internlm/core/scheduler/pipeline_scheduler.py index efc9187a..622c91f6 100644 --- a/internlm/core/scheduler/pipeline_scheduler.py +++ b/internlm/core/scheduler/pipeline_scheduler.py @@ -135,8 +135,7 @@ def __init__( self.scatter_gather_tensors = ( scatter_gather_tensors - and gpc.is_initialized(ParallelMode.TENSOR) - and gpc.get_world_size(ParallelMode.TENSOR) > 1 + and gpc.is_using_parallel_mode(ParallelMode.TENSOR) ) if gpc.config.parallel.sequence_parallel: diff --git a/internlm/data/batch_sampler.py b/internlm/data/batch_sampler.py index 16fd6fce..a94a7210 100644 --- a/internlm/data/batch_sampler.py +++ b/internlm/data/batch_sampler.py @@ -141,7 +141,7 @@ def get_dpsampler_dataloader( """ _kwargs = kwargs.copy() - if add_sampler and gpc.is_initialized(ParallelMode.DATA) and gpc.get_world_size(ParallelMode.DATA) > 1: + if add_sampler and gpc.is_using_parallel_mode(ParallelMode.DATA): sampler = DataParallelSampler(dataset, shuffle=shuffle, drop_last=drop_last) else: sampler = None diff --git a/internlm/initialize/initialize_trainer.py b/internlm/initialize/initialize_trainer.py index beb4a40f..7893dd54 100644 --- a/internlm/initialize/initialize_trainer.py +++ b/internlm/initialize/initialize_trainer.py @@ -68,7 +68,7 @@ def initialize_trainer( assert isinstance(optimizer, BaseOptimizer), "optimizer must be instance of BaseOptimizer" # gradient handler, only support PipelineSharedModuleGradientHandler now - if gpc.is_using_pp(): + if gpc.is_using_parallel_mode(ParallelMode.PIPELINE): gpc.config.gradient_handler = [dict(type="PipelineSharedModuleGradientHandler")] gradient_handler_cfg = gpc.config.get("gradient_handler", []) gradient_handlers = [] @@ -84,7 +84,7 @@ def initialize_trainer( data_fn = None else: data_fn = unpack_data - if gpc.is_using_pp(): + if gpc.is_using_parallel_mode(ParallelMode.PIPELINE): gpc.config.NUM_MICRO_BATCHES = gpc.config.data.micro_num tensor_shape = get_tensor_shape() use_interleaved = ( diff --git a/internlm/model/embedding.py b/internlm/model/embedding.py index 225a5f16..11c71b2c 100644 --- a/internlm/model/embedding.py +++ b/internlm/model/embedding.py @@ -44,7 +44,7 @@ def __init__( self.num_embeddings = num_embeddings self.embed_dim = embedding_dim - embed_dim_per_partition = embedding_dim // gpc.sequence_parallel_size + embed_dim_per_partition = embedding_dim // gpc.tensor_parallel_size self.padding_idx = padding_idx self.embed_args = args @@ -55,10 +55,10 @@ def __init__( def forward(self, input_: Tensor) -> Tensor: output_parallel = F.embedding(input_, self.weight, self.padding_idx, *self.embed_args, **self.embed_kwargs) - output = 
gather_forward_split_backward(output_parallel, ParallelMode.SEQUENCE, dim=-1) + output = gather_forward_split_backward(output_parallel, ParallelMode.TENSOR, dim=-1) - if gpc.config.parallel.sequence > 1: - output = split_forward_gather_backward(output, ParallelMode.SEQUENCE, dim=1) + if gpc.config.parallel.sequence_parallel: + output = split_forward_gather_backward(output, ParallelMode.TENSOR, dim=1) # print( # f"ht debug embed: rank:{gpc.get_global_rank()} output.shape:{output.shape} output:{output}", # flush=True, diff --git a/internlm/model/linear.py b/internlm/model/linear.py index b92b2ee5..fc5175d9 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -403,28 +403,28 @@ def __init__( ) -def get_mlp_cls(sp_mode: str): - if sp_mode in ["none", "flash-attn"]: +def get_mlp_cls(tp_mode: str): + if tp_mode in ["mtp", "fsp"]: mlp_cls = FeedForward - elif sp_mode == "megatron": + elif tp_mode == "msp": mlp_cls = MegatronFeedForward else: mlp_cls = FSTPFeedForward return mlp_cls -def get_linear_cls(sp_mode: str, parallel_mode: str): +def get_linear_cls(tp_mode: str, parallel_mode: str): if parallel_mode == "column": - if sp_mode in ["none", "flash-attn"]: + if tp_mode in ["mtp", "fsp"]: cls = ColumnParallelLinearTorch - elif sp_mode == "megatron": + elif tp_mode == "msp": cls = MegatronColumnParallelLinearTorch else: cls = FSTPLinear elif parallel_mode == "row": - if sp_mode in ["none", "flash-attn"]: + if tp_mode in ["mtp", "fsp"]: cls = RowParallelLinearTorch - elif sp_mode == "megatron": + elif tp_mode == "msp": cls = MegatronRowParallelLinearTorch else: cls = FSTPLinear diff --git a/internlm/model/loss.py b/internlm/model/loss.py index a634d2c7..ac92b4b9 100644 --- a/internlm/model/loss.py +++ b/internlm/model/loss.py @@ -28,7 +28,7 @@ def __init__(self, parallel_output=True, label_smoothing=0): self.loss_fn = FlashCrossEntropyLoss( reduction="mean", inplace_backward=True, - process_group=gpc.get_group(ParallelMode.SEQUENCE), + process_group=gpc.get_group(ParallelMode.TENSOR), label_smoothing=label_smoothing, ) # The loss in this place is bound to the gather_output initialized by VocabParallelClassifier1D else: diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index 400ad273..032fef91 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -9,14 +9,7 @@ from flash_attn.modules.mlp import ParallelFusedMLP from torch import nn -from internlm.core.context import ( - IS_SEQUENCE_PARALLEL, - IS_TENSOR_PARALLEL, - IS_REPLICA_ZERO_PARALLEL, - IS_SEQUENCE_DATA_PARALLEL, - IS_WEIGHT_ZERO_PARALLEL, - ParallelMode, -) +from internlm.core.context import ParallelMode from internlm.core.context.parallel_context import global_context as gpc from internlm.initialize.initialize_tensor import normal_, scaled_init_method_normal from internlm.model.embedding import Embedding1D @@ -85,7 +78,7 @@ def __init__( use_scaled_init: bool = True, use_swiglu: bool = True, use_flash_attn: bool = True, - sp_mode: str = "none", + tp_mode: str = "mtp", ): super().__init__() self.checkpoint = checkpoint @@ -95,11 +88,13 @@ def __init__( self.use_flash_attn = use_flash_attn head_dim = hidden_size // num_attention_heads + self.tp_mode = tp_mode + parallel_mode = ParallelMode.WEIGHT if self.tp_mode == "isp" else ParallelMode.TENSOR self.mixer = MHA( embed_dim=hidden_size, num_heads=num_attention_heads, - process_group=gpc.get_group(ParallelMode.WEIGHT), - sequence_process_group=gpc.get_group(ParallelMode.SEQUENCE), + 
process_group=gpc.get_group(parallel_mode), + sequence_process_group=gpc.get_group(ParallelMode.TENSOR), dropout=attn_drop_rate, max_position_embeddings=max_position_embeddings, softmax_scale=1 / math.sqrt(head_dim), @@ -111,7 +106,7 @@ def __init__( use_flash_attn=use_flash_attn, device=device, dtype=dtype, - sp_mode=sp_mode, + tp_mode=self.tp_mode, ) self.dropout1 = nn.Dropout(drop_rate) @@ -123,12 +118,12 @@ def __init__( self.norm2 = nn.LayerNorm(hidden_size, eps=layer_norm_epsilon) if use_swiglu: - mlp_cls = get_mlp_cls(sp_mode) + mlp_cls = get_mlp_cls(self.tp_mode) self.mlp = mlp_cls( hidden_size, int(hidden_size * mlp_ratio), out_features=hidden_size, - process_group=gpc.get_group(ParallelMode.WEIGHT), + process_group=gpc.get_group(parallel_mode), bias=False, device=device, dtype=dtype, @@ -139,7 +134,7 @@ def __init__( int(hidden_size * mlp_ratio), out_features=hidden_size, activation="gelu_approx", - process_group=gpc.get_group(ParallelMode.TENSOR), + process_group=gpc.get_group(parallel_mode), bias1=False, bias2=False, sequence_parallel=gpc.config.parallel.sequence_parallel, @@ -148,23 +143,6 @@ def __init__( device=device, dtype=dtype, ) - for _, param in self.mlp.named_parameters(): - # if gpc.get_world_size(ParallelMode.TENSOR) > 1: - # setattr(param, IS_TENSOR_PARALLEL, True) - if gpc.get_world_size(ParallelMode.WEIGHT) > 1: - setattr(param, IS_WEIGHT_ZERO_PARALLEL, True) - for param in self.norm1.parameters(): - # if gpc.config.parallel.sequence_parallel is True: - # setattr(param, IS_SEQUENCE_PARALLEL, True) - # if gpc.config.parallel.weight.size > 1: - # setattr(param, IS_SEQUENCE_PARALLEL, True) - setattr(param, IS_REPLICA_ZERO_PARALLEL, True) - for param in self.norm2.parameters(): - # if gpc.config.parallel.sequence_parallel is True: - # setattr(param, IS_SEQUENCE_PARALLEL, True) - # if gpc.config.parallel.weight.size > 1: - # setattr(param, IS_SEQUENCE_PARALLEL, True) - setattr(param, IS_REPLICA_ZERO_PARALLEL, True) self.dropout2 = nn.Dropout(drop_rate) self.use_swiglu = use_swiglu @@ -327,18 +305,14 @@ def __init__( super().__init__() checkpoint_layer_num = int(num_layers * checkpoint) - self.sp_mode = gpc.config.parallel["tensor"]["sp"] - if self.sp_mode == "none": - gpc.config.parallel.sequence_parallel = False - else: - gpc.config.parallel.sequence_parallel = True + self.tp_mode = gpc.config.parallel.tensor.mode if is_reward: head_cls = RewardModelLinear else: head_cls = ( ScaleColumnParallelLinear - if self.sp_mode in ["flash-attn", "none", "intern"] + if self.tp_mode in ["mtp", "fsp", "isp"] else MegatronScaleColumnParallelLinear ) if first: @@ -357,11 +331,8 @@ def __init__( ) for _, param in self.embedding.named_parameters(): normal_(std=0.0052)(param) - # if gpc.get_world_size(ParallelMode.TENSOR) > 1: - # setattr(param, IS_TENSOR_PARALLEL, True) - if gpc.get_world_size(ParallelMode.SEQUENCE) > 1: - setattr(param, IS_SEQUENCE_DATA_PARALLEL, True) self.embed_grad_scale = embed_grad_scale + self.blocks = nn.ModuleList( [ PackedFlashBaseLayer1D( @@ -383,7 +354,7 @@ def __init__( use_scaled_init=use_scaled_init, use_swiglu=use_swiglu, use_flash_attn=use_flash_attn, - sp_mode=self.sp_mode, + tp_mode=self.tp_mode, ) for lid in range(num_layers) ] @@ -396,7 +367,7 @@ def __init__( self.head = head_cls( in_features=hidden_size, out_features=gpc.get_world_size(ParallelMode.TENSOR) if is_reward else vocab_size, - process_group=gpc.get_group(ParallelMode.SEQUENCE), + process_group=gpc.get_group(ParallelMode.TENSOR), bias=False, device=device, dtype=dtype, @@ -404,16 
+375,6 @@ def __init__( ) for _, param in self.head.named_parameters(): normal_(std=0.0052)(param) - # if gpc.get_world_size(ParallelMode.TENSOR) > 1: - # setattr(param, IS_TENSOR_PARALLEL, True) - if gpc.get_world_size(ParallelMode.SEQUENCE) > 1: - setattr(param, IS_SEQUENCE_DATA_PARALLEL, True) - for param in self.norm.parameters(): - # if gpc.config.parallel.sequence_parallel is True: - # setattr(param, IS_SEQUENCE_PARALLEL, True) - # if gpc.config.parallel.weight.size > 1: - # setattr(param, IS_SEQUENCE_PARALLEL, True) - setattr(param, IS_REPLICA_ZERO_PARALLEL, True) self.parallel_output = parallel_output @@ -438,11 +399,9 @@ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N assert len(indexes) == 1 # The indexes are used to indicate the actual position IDs of each token in the packed input. indexes = indexes[0] - # if the sequence parallel mode is 'intern', the indexes should also be split in sequence dimension. - if gpc.config.parallel.sequence_parallel and self.sp_mode == "intern": + # if the sequence parallel mode is 'isp', the indexes should also be split in sequence dimension. + if gpc.config.parallel.sequence_parallel and self.tp_mode == "isp": indexes = split_forward_gather_backward(indexes, ParallelMode.TENSOR, dim=0) - if gpc.is_initialized(ParallelMode.SEQUENCE) and gpc.get_world_size(ParallelMode.SEQUENCE) > 1: - indexes = split_forward_gather_backward(indexes, ParallelMode.SEQUENCE, dim=0) max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() if cu_seqlens is not None else None diff --git a/internlm/model/modeling_moe.py b/internlm/model/modeling_moe.py index 43489bc4..9d9f3238 100644 --- a/internlm/model/modeling_moe.py +++ b/internlm/model/modeling_moe.py @@ -9,7 +9,7 @@ from flash_attn.modules.mlp import ParallelFusedMLP from torch import nn -from internlm.core.context import IS_TENSOR_PARALLEL, ParallelMode +from internlm.core.context import ParallelMode from internlm.core.context.parallel_context import global_context as gpc from internlm.core.naive_amp import set_fp32_attr_to_module from internlm.initialize.initialize_tensor import normal_, scaled_init_method_normal diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index 5d9e0a40..fb0309a5 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -10,8 +10,6 @@ import torch.nn.functional as F from einops import rearrange -from internlm.core.context import IS_WEIGHT_ZERO_PARALLEL - try: from flash_attn.flash_attn_interface import flash_attn_unpadded_func except ImportError: @@ -37,7 +35,7 @@ from torch import Tensor, nn from torch.nn import Module -from internlm.core.context import IS_TENSOR_PARALLEL, ParallelMode +from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc from internlm.model.embedding import DynamicNTKScalingRotaryEmbedding, RotaryEmbedding from internlm.model.linear import get_linear_cls @@ -174,7 +172,7 @@ def __init__( use_flash_attn: bool = True, device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, - sp_mode: str = "none", + tp_mode: str = "mtp", ) -> None: factory_kwargs = {"device": device, "dtype": dtype} super().__init__() @@ -202,7 +200,7 @@ def __init__( self.rotary_emb = RotaryEmbedding(self.rotary_emb_dim, scale_base=rotary_emb_scale_base, device=device) # notice here should change bias=True - Wqkv_cls = get_linear_cls(sp_mode, "column") + Wqkv_cls = get_linear_cls(tp_mode, "column") self.Wqkv = 
Wqkv_cls( embed_dim, 3 * embed_dim, @@ -218,14 +216,14 @@ def __init__( self.inner_cross_attn = inner_cross_attn_cls( causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout ) - if sp_mode == "intern": + if tp_mode == "isp": self.inner_attn = DistributedAttention(self.inner_attn, sequence_process_group=sequence_process_group) self.inner_cross_attn = DistributedAttention( self.inner_cross_attn, sequence_process_group=sequence_process_group ) # output projection always have the bias (for now) - out_proj_cls = get_linear_cls(sp_mode, "row") + out_proj_cls = get_linear_cls(tp_mode, "row") self.out_proj = out_proj_cls( embed_dim, embed_dim, @@ -234,15 +232,6 @@ def __init__( sequence_parallel=gpc.config.parallel.sequence_parallel, **factory_kwargs, ) - # need to assign tp attribute so that internlm know it is tensor parallel module - # if gpc.get_world_size(ParallelMode.TENSOR) > 1: - # for name in ["out_proj", "Wqkv"]: - # for param in getattr(self, name).parameters(): - # setattr(param, IS_TENSOR_PARALLEL, True) - if gpc.get_world_size(ParallelMode.WEIGHT) > 1: - for name in ["out_proj", "Wqkv"]: - for param in getattr(self, name).parameters(): - setattr(param, IS_WEIGHT_ZERO_PARALLEL, True) def forward(self, x, seqlen=None, inference_params=None, **kwargs): if kwargs.get("indexes", None) is not None: diff --git a/internlm/solver/optimizer/__init__.py b/internlm/solver/optimizer/__init__.py index 309f2295..7c6a1c64 100644 --- a/internlm/solver/optimizer/__init__.py +++ b/internlm/solver/optimizer/__init__.py @@ -3,6 +3,5 @@ from .fsdp_optimizer import FSDPadaptOptimizer from .hybrid_zero_optim import HybridZeroOptimizer, reload_zero_fp32_buff -from .hybrid_zero_optim2 import HybridZeroOptimizer2 __all__ = ["FSDPadaptOptimizer", "HybridZeroOptimizer", "reload_zero_fp32_buff"] diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index 3092a625..681dfc9c 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -9,8 +9,13 @@ import torch.distributed as dist from torch.optim import Optimizer -from internlm.core.context import IS_SEQUENCE_PARALLEL, Config, ParallelMode +from internlm.core.context import IS_REPLICA_ZERO_PARALLEL, Config, ParallelMode from internlm.core.context import global_context as gpc +from internlm.core.context.parallel_context import ( + IS_TENSOR_DATA_PARALLEL, + IS_WEIGHT_ZERO_PARALLEL, + IS_TENSOR_ZERO_PARALLEL, +) from internlm.monitor import send_alert_message from internlm.solver.optimizer.store import ( BucketStore, @@ -71,6 +76,7 @@ def __init__( clip_grad_norm = zero_cfg.clip_grad_norm self._overlap_sync_grad = zero_cfg.overlap_sync_grad self._overlap_sync_param = zero_cfg.overlap_sync_param + self.use_isp = gpc.config.parallel.tensor.mode == "isp" super().__init__(optim=optimizer) @@ -82,7 +88,8 @@ def __init__( # ParameterStore will manage the tensor buffers used for zero # it will not manage the tensors used by mixed precision training self._param_store = ParameterStore(ParallelMode.ZERO1) - self._grad_store = GradientStore(ParallelMode.DATA) + parallel_mode = ParallelMode.WEIGHT_DATA if self.use_isp else ParallelMode.DATA + self._grad_store = GradientStore(parallel_mode) self._bucket_store: List[BucketStore] = [] self._accum_grad_buckets: List[BucketStore] = [] self._bucket_in_progress = [] @@ -120,8 +127,10 @@ def __init__( self.rank_unique_id = ( f"gpus-{gpc.get_world_size(ParallelMode.GLOBAL)}_" - + 
f"pp-{gpc.get_local_rank(ParallelMode.PIPELINE)}_" + + f"wp-{gpc.get_local_rank(ParallelMode.WEIGHT)}_" + f"tp-{gpc.get_local_rank(ParallelMode.TENSOR)}_" + + f"dp-{gpc.get_local_rank(ParallelMode.DATA)}_" + + f"pp-{gpc.get_local_rank(ParallelMode.PIPELINE)}_" + f"zo-{gpc.get_local_rank(ParallelMode.ZERO1)}.pt" ) self.params_per_rank_id_dict = [] @@ -129,7 +138,7 @@ def __init__( if self._overlap_sync_param: assert self._param_bcast_sync_handler is not None - if gpc.config.parallel["tensor"]["sp"] == "intern" and gpc.config.parallel["tensor"]["intern_overlap"] is True: + if gpc.config.parallel["weight"]["size"] > 1 and gpc.config.parallel["weight"]["overlap"] is True: self._fstp_handler = gpc.fstp_handler else: self._fstp_handler = None @@ -148,17 +157,25 @@ def __init__( # to find real zero mode. if zero is not used, set all param group as ParallelMode.ZERO1 # if zero is used, expert dp group will use ParallelMode.EXPERT_DATA as the real zero mode - zero_mode = ( - ParallelMode.ZERO1 - if param_group["dp_mode"] == gpc.get_world_size(ParallelMode.ZERO1) == 1 or ParallelMode.DATA - else ParallelMode.EXPERT_DATA - ) + # zero_mode = ( + # ParallelMode.ZERO1 + # if param_group["dp_mode"] == gpc.get_world_size(ParallelMode.ZERO1) == 1 or ParallelMode.DATA + # else ParallelMode.EXPERT_DATA + # ) + zero_mode = param_group["optimizer_mode"] + self._zero_local_rank.append(gpc.get_local_rank(zero_mode)) self._zero_world_size.append(gpc.get_world_size(zero_mode)) # TODO _broadcast_parallel_mode is not only used in broadcast, maybe can change its name self._broadcast_parallel_mode.append(zero_mode) - self._bucket_store.append(BucketStore(group_id, param_group["dp_mode"])) - self._accum_grad_buckets.append(BucketStore(group_id, param_group["dp_mode"])) + + if param_group["name"] != "embed_head" and self.use_isp: + grad_reduce_mode = ParallelMode.WEIGHT_DATA + else: + grad_reduce_mode = ParallelMode.DATA + + self._bucket_store.append(BucketStore(group_id, grad_reduce_mode)) + self._accum_grad_buckets.append(BucketStore(group_id, grad_reduce_mode)) # assign parameters to ranks the params in the list are sorted params_per_rank, no_params_ranks = self._partition_param_list(group_id, param_group) @@ -220,8 +237,6 @@ def __init__( # flag used to skip unnecessary gradient reduce operation when gradient accumulation is enabled. self.skip_grad_reduce = False - # reduction hook is only used if overlapping communication - # if it is stage 1 without overlapping, no hook will be attached self._attach_reduction_hook() @property @@ -307,12 +322,12 @@ def _define_and_attach(param, reduce_rank=None): reduce_rank=reduce_rank, ) - def reduction_sp_func(): + def reduction_layernorm_func(): handle = reduce_tensor( param.grad, dtype=None, dst_rank=reduce_rank, - parallel_mode=ParallelMode.TENSOR, + parallel_mode=ParallelMode.WEIGHT if self.use_isp else ParallelMode.TENSOR, ) handle.wait() @@ -328,23 +343,24 @@ def accum_grad_hook(*args): # pylint: disable=W0613 reduce_scatter_checker() # define hook for sequence_parallel - def reduce_grad_hook_sp(*args): # pylint: disable=W0613 + def extra_layernorm_reduce_grad_hook(*args): # pylint: disable=W0613 if self.skip_grad_reduce is False: - reduction_sp_func() + reduction_layernorm_func() # get the AccumulateGrad object of the param itself # If these objects are not kept, reduction hooks may not be attached successfully. 
accum_grad_obj = get_grad_accumulate_object(param) self._grad_store.add_accumulate_grad_object(accum_grad_obj) - # if sequence_parallel is True, - # the grad of norm should be all-reduce across the tp process group + # the grad of layernorm should be all-reduce across the global process group + # here is the first stage all-reduce in tp/wp process group + # the second stage all-reduce will be processed in reduce_grad_hook if ( - gpc.config.parallel.sequence_parallel is True - and hasattr(param, IS_SEQUENCE_PARALLEL) - and getattr(param, IS_SEQUENCE_PARALLEL) is True + gpc.config.parallel.weight.size > 1 + and hasattr(param, IS_REPLICA_ZERO_PARALLEL) + and getattr(param, IS_REPLICA_ZERO_PARALLEL) is True ): - accum_grad_obj.register_hook(reduce_grad_hook_sp) + accum_grad_obj.register_hook(extra_layernorm_reduce_grad_hook) # we should not only register for parameters which have _fstp_reduce_scatter_str attr. # we must keep up with reduce_grad_hook. @@ -373,9 +389,9 @@ def belongs_to_current_rank(self, param) -> bool: :return: True if the parameter should be updated by the current rank. Otherwise false. :rtype: bool """ - tensor_rank = self._param_store.get_param_rank(param) + tensor_ranks = self._param_store.get_param_rank(param) group_id = getattr(param, "group_id") - return tensor_rank == gpc.get_local_rank(self._broadcast_parallel_mode[group_id]) + return gpc.get_local_rank(self._broadcast_parallel_mode[group_id]) in tensor_ranks def _accum_grads_store_in_bucket(self, bucket: BucketStore, reduce_rank: Optional[int] = None) -> None: for _param in bucket.get_param(reduce_rank): @@ -592,11 +608,25 @@ def _compute_norm_with_stage( ): # compute norm for gradients that have been reduced params, grads = self._param_store.get_reduced_param_for_compute_norm(group_id=group_id, last_bucket=last_bucket) + params_is_padding = False if len(params) == 0: + params_is_padding = True dtype = self.param_groups[group_id]["dtype"] grads = [self.padding_grad.to(dtype)] params = [self.padding_tensor.to(dtype)] + if group_id == 0: + for param in params: + if self.use_isp: + setattr(param, IS_WEIGHT_ZERO_PARALLEL, True) + else: + setattr(param, IS_TENSOR_ZERO_PARALLEL, True) + elif group_id == 1: + for param in params: + setattr(param, IS_TENSOR_DATA_PARALLEL, True) + else: + raise NotImplementedError("group_id > 1 is not yet implemented.") + norm = 0 if self._clip_grad_norm > 0: # this norm is before scaling, it will be very large @@ -608,6 +638,17 @@ def _compute_norm_with_stage( zero_mode=self._broadcast_parallel_mode[group_id], ) + if params_is_padding: + for param in params: + if hasattr(param, IS_REPLICA_ZERO_PARALLEL): + delattr(param, IS_REPLICA_ZERO_PARALLEL) + if hasattr(param, IS_TENSOR_DATA_PARALLEL): + delattr(param, IS_TENSOR_DATA_PARALLEL) + if hasattr(param, IS_TENSOR_ZERO_PARALLEL): + delattr(param, IS_TENSOR_ZERO_PARALLEL) + if hasattr(param, IS_WEIGHT_ZERO_PARALLEL): + delattr(param, IS_WEIGHT_ZERO_PARALLEL) + return norm def _compute_param_norm_stage( diff --git a/internlm/solver/optimizer/hybrid_zero_optim2.py b/internlm/solver/optimizer/hybrid_zero_optim2.py deleted file mode 100644 index fbfa20cd..00000000 --- a/internlm/solver/optimizer/hybrid_zero_optim2.py +++ /dev/null @@ -1,1018 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -import math -from functools import partial -from typing import List, Optional - -import torch -import torch.distributed as dist -from torch.optim import Optimizer - -from internlm.core.context import IS_SEQUENCE_PARALLEL, IS_REPLICA_ZERO_PARALLEL, 
Config, ParallelMode -from internlm.core.context import global_context as gpc -from internlm.core.context.parallel_context import IS_SEQUENCE_DATA_PARALLEL -from internlm.monitor import send_alert_message -from internlm.solver.optimizer.store import ( - BucketStore, - GradientStore, - ParameterStore, - TensorBucket, -) -from internlm.solver.optimizer.utils import ( - DynamicGradScaler, - ParamBcastSyncHandler, - flatten, - get_grad_accumulate_object, - has_inf_or_nan, - reduce_tensor, - release_param_grad, - split_half_float_double, - sync_param, -) -from internlm.utils.common import get_current_device -from internlm.utils.logger import get_logger -from internlm.utils.megatron_timers import megatron_timer as timer -from internlm.utils.timeout import llm_timeout - -from .base_optimizer import BaseOptimizer -from .utils import compute_layer_norm, compute_norm, compute_param_norm - -inf = math.inf -logger = get_logger(__file__) - - -class HybridZeroOptimizer2(BaseOptimizer): - """ - Hybrid Zero Optimizer. - """ - - def __init__( - self, - optimizer: Optimizer, - cpu_offload=False, - grad_scal_cfg: Config = None, - zero_cfg: Config = None, - param_bcast_sync_handler: ParamBcastSyncHandler = None, - ): - # DynamicGradScaler related args - if gpc.config.model.dtype is torch.float32: - initial_scale = 1 - else: - initial_scale = grad_scal_cfg.fp16.initial_scale - min_scale = grad_scal_cfg.fp16.min_scale - growth_interval = grad_scal_cfg.fp16.growth_interval - growth_factor = grad_scal_cfg.growth_factor - backoff_factor = grad_scal_cfg.backoff_factor - hysteresis = grad_scal_cfg.hysteresis - max_scale = grad_scal_cfg.max_scale - - # Zero related args - reduce_bucket_size = zero_cfg.reduce_bucket_size - clip_grad_norm = zero_cfg.clip_grad_norm - self._overlap_sync_grad = zero_cfg.overlap_sync_grad - self._overlap_sync_param = zero_cfg.overlap_sync_param - - super().__init__(optim=optimizer) - - self._cpu_offload = cpu_offload - self._zero_local_rank = [] - self._zero_world_size = [] - self._broadcast_parallel_mode = [] - - # ParameterStore will manage the tensor buffers used for zero - # it will not manage the tensors used by mixed precision training - self._param_store = ParameterStore(ParallelMode.ZERO1) - self._grad_store = GradientStore(ParallelMode.WEIGHT_DATA) - self._bucket_store: List[BucketStore] = [] - self._accum_grad_buckets: List[BucketStore] = [] - self._bucket_in_progress = [] - - # fp16 and fp32 params for mixed precision training - self._fp16_param_groups = dict() - self._fp32_flat_param_groups_of_current_rank = dict() - - # communication params - # self._overlap_communication = overlap_communication - self._reduce_bucket_size = reduce_bucket_size - - self._comm_bcast_stream = torch.cuda.Stream() - - # gradient scaler - self.grad_scaler = DynamicGradScaler( - initial_scale=initial_scale, - min_scale=min_scale, - growth_factor=growth_factor, - backoff_factor=backoff_factor, - growth_interval=growth_interval, - hysteresis=hysteresis, - max_scale=max_scale, - ) - self._found_overflow = torch.cuda.FloatTensor([0], device=get_current_device()) - - # gradient clipping - self._clip_grad_norm = clip_grad_norm - - # need to record the rank in which parameter groups are not assigned parameters. 
- self.param_group_has_params = [] - self.param_group_no_params_ranks = [] - self.padding_grad = torch.zeros([32], dtype=gpc.config.model.dtype, device=get_current_device()) - self.padding_tensor = torch.zeros([32], dtype=gpc.config.model.dtype, device=get_current_device()) - - self.rank_unique_id = ( - f"gpus-{gpc.get_world_size(ParallelMode.GLOBAL)}_" - + f"wp-{gpc.get_local_rank(ParallelMode.WEIGHT)}_" - + f"sp-{gpc.get_local_rank(ParallelMode.SEQUENCE)}_" - + f"dp-{gpc.get_local_rank(ParallelMode.DATA)}_" - + f"wdp-{gpc.get_local_rank(ParallelMode.WEIGHT_DATA)}_" - + f"zo-{gpc.get_local_rank(ParallelMode.ZERO1)}.pt" - ) - self.params_per_rank_id_dict = [] - self._param_bcast_sync_handler = param_bcast_sync_handler - if self._overlap_sync_param: - assert self._param_bcast_sync_handler is not None - - if gpc.config.parallel["weight"]["size"] > 1 and gpc.config.parallel["weight"]["overlap"] is True: - self._fstp_handler = gpc.fstp_handler - else: - self._fstp_handler = None - - # iterate over the param group in the optimizer - # partition these param groups for data parallel training - # and add buffers to parameter store for future access - for group_id, param_group in enumerate(self.optim.param_groups): - group_params = param_group["params"] - - # set the dtype for each param group - param_group["dtype"] = group_params[0].dtype if len(group_params) != 0 else None - - # add the fp16 params to fp16_param_groups for bookkeeping - self._fp16_param_groups[group_id] = group_params - - # to find real zero mode. if zero is not used, set all param group as ParallelMode.ZERO1 - # if zero is used, expert dp group will use ParallelMode.EXPERT_DATA as the real zero mode - # zero_mode = ( - # ParallelMode.ZERO1 - # if param_group["dp_mode"] == gpc.get_world_size(ParallelMode.ZERO1) == 1 or ParallelMode.DATA - # else ParallelMode.EXPERT_DATA - # ) - zero_mode = param_group["optimizer_mode"] - - self._zero_local_rank.append(gpc.get_local_rank(zero_mode)) - self._zero_world_size.append(gpc.get_world_size(zero_mode)) - # TODO _broadcast_parallel_mode is not only used in broadcast, maybe can change its name - self._broadcast_parallel_mode.append(zero_mode) - - grad_reduce_mode = ParallelMode.WEIGHT_DATA - if param_group["name"] == "embed_head": - grad_reduce_mode = ParallelMode.DATA - - self._bucket_store.append(BucketStore(group_id, grad_reduce_mode)) - self._accum_grad_buckets.append(BucketStore(group_id, grad_reduce_mode)) - - # assign parameters to ranks the params in the list are sorted - params_per_rank, no_params_ranks = self._partition_param_list(group_id, param_group) - self.param_group_no_params_ranks.append(no_params_ranks) - self.param_group_has_params.append(self._zero_local_rank[group_id] not in no_params_ranks) - - # store the mapping between param to rank each param should belong to only one rank. - # we can skip the moe param and do not keep them in _param_store to save memory - # (means we need to deal with moe param in a different way), but it will increase - # complexity and reduce code readablity. - for rank, params in enumerate(params_per_rank): - # check whether any rank is not assigned params. 
- if len(params) != 0: - self._param_store.add_fp16_param_list_by_rank_group(rank, group_id, params) - for param in params: - setattr(param, "group_id", group_id) - self._param_store.set_param_to_rank(param, rank) - - # move to cpu to make room to create the flat tensor - for param in group_params: - param.data = param.data.cpu() - - # flatten the reordered tensors - # if param_group["name"] == "embed_head": - # tensor_list = self._param_store.get_fp16_params_by_rank_group(self._zero_local_rank[group_id], group_id) - # with torch.no_grad(): - # flat_tensor = flatten(tensor_list) - # flat_tensor = flat_tensor.data.cuda() - # sync_param(flat_tensor=flat_tensor, tensor_list=tensor_list) - # # for rank in range(self._zero_world_size[group_id]): - # self._param_store.add_flat_fp16_param_by_rank_group( - # self._zero_local_rank[group_id], group_id, flat_tensor - # ) - # else: - for rank in range(self._zero_world_size[group_id]): - # No flat fp16 buffer is allocated if the process has no parameters. - if rank not in self.param_group_no_params_ranks[group_id]: - tensor_list = self._param_store.get_fp16_params_by_rank_group(rank, group_id) - with torch.no_grad(): - flat_tensor = flatten(tensor_list) - flat_tensor = flat_tensor.data.cuda() - self._param_store.add_flat_fp16_param_by_rank_group(rank, group_id, flat_tensor) - sync_param(flat_tensor=flat_tensor, tensor_list=tensor_list) - - # create a copy of fp32 weights of the parameters for which this rank is responsible - # No flat fp32 buffer is allocated if the process has no parameters. - if self.param_group_has_params[group_id]: - fp16_flat_current_rank = self._param_store.get_flat_fp16_param_by_rank_group( - self._zero_local_rank[group_id], group_id - ) - fp32_flat_current_rank = fp16_flat_current_rank.float() - device = "cpu" if self._cpu_offload else get_current_device() - fp32_flat_current_rank = fp32_flat_current_rank.to(device) - fp32_flat_current_rank.requires_grad = True - self._fp32_flat_param_groups_of_current_rank[group_id] = fp32_flat_current_rank - - # need to replace the params in the `params` field in the optimizer - # so that when the optimizer calls step(), it only updates the tensors - # managed by this data parallel rank - param_group["params"] = [fp32_flat_current_rank] - - # set reduction state - for param in self._fp16_param_groups[group_id]: - self._param_store.set_param_reduction_state(param, False) - - assert len(self._fp16_param_groups) != 0 - - # If a rank is not assigned any arguments, 'has_params' is False. - self.has_params = sum(self.param_group_has_params) != 0 - # flag used to skip unnecessary gradient reduce operation when gradient accumulation is enabled. 
- self.skip_grad_reduce = False - - self._attach_reduction_hook() - - @property - def zero_local_rank(self): - return self._zero_local_rank - - @property - def zero_world_size(self): - return self._zero_world_size - - @property - def loss_scale(self): - return self.grad_scaler.scale - - @property - def num_param_groups(self): - return len(self._fp16_param_groups) - - def _partition_param_list(self, group_id, param_group): - no_params_ranks = [] - # if param_group["name"] == "embed_head": - # params_per_rank = [param_group["params"] for _ in range(self._zero_world_size[group_id])] - # return params_per_rank, set(no_params_ranks) - - params_per_rank = [[] for _ in range(self._zero_world_size[group_id])] - numel_per_rank = [0 for _ in range(self._zero_world_size[group_id])] - self.params_per_rank_id_dict.append([[] for _ in range(self._zero_world_size[group_id])]) - param_list = param_group["params"] - - sorted_params = sorted(param_list, key=lambda x: x.numel(), reverse=True) - for i, param in enumerate(sorted_params): - global_id = str(i) - for j in range(len(param.size())): - global_id = "_".join([global_id, str(param.size()[j])]) - if self._overlap_sync_param: - rank_to_go = self._param_bcast_sync_handler.get_rank_by_param(param) - else: - rank_to_go = numel_per_rank.index(min(numel_per_rank)) - params_per_rank[rank_to_go].append(param) - self.params_per_rank_id_dict[-1][rank_to_go].append(global_id) - numel_per_rank[rank_to_go] += param.numel() - - # check whether any rank is not assigned to parameters. - for rank, params in enumerate(params_per_rank): - if len(params) == 0: - no_params_ranks.append(rank) - - if gpc.is_rank_for_log(): - logger.info( # pylint: disable=W1203 - f"Number of elements on ranks: {numel_per_rank}, rank:{gpc.get_global_rank()}" - ) - - return params_per_rank, set(no_params_ranks) - - def _is_moe_group(self, param_group): - return "moe" in param_group.keys() and param_group["moe"] - - def _is_norm_group(self, param_group): - return "norm" in param_group.keys() and param_group["norm"] - - def _is_gate_group(self, param_group): - return "gate" in param_group.keys() and param_group["gate"] - - # TODO check expert dp is correct when enable moe and overlap both - def _attach_reduction_hook(self): - # we iterate over the fp16 params - # on each param, we register a hook to its AccumulateGrad object - for group_id in range(self.num_param_groups): - param_group = self._fp16_param_groups[group_id] - for param in param_group: - # we should not reduce the param in moe - if not param.requires_grad: - continue - - reduce_rank = None - - def _define_and_attach(param, reduce_rank=None): - reduction_func = partial( - self._store_and_try_reduce_grads_by_bucket, - param=param, - reduce_rank=reduce_rank, - ) - - reduce_scatter_checker = partial( - self._wait_reduce_scatter_and_accumulate_grads, - param=param, - reduce_rank=reduce_rank, - ) - - def reduction_sp_func(): - handle = reduce_tensor( - param.grad, - dtype=None, - dst_rank=reduce_rank, - parallel_mode=ParallelMode.WEIGHT, - ) - handle.wait() - - # define hook - # NOT IMPORTANT BUT GOOD TO KNOW: - # args here is not grad, but allow_unreacable and accumulate_grad - def reduce_grad_hook(*args): # pylint: disable=W0613 - if self.skip_grad_reduce is False: - reduction_func() - - # define hook for real gradient accumulation. 
- def accum_grad_hook(*args): # pylint: disable=W0613 - reduce_scatter_checker() - - # define hook for sequence_parallel - def reduce_grad_hook_sp(*args): # pylint: disable=W0613 - if self.skip_grad_reduce is False: - reduction_sp_func() - - # get the AccumulateGrad object of the param itself - # If these objects are not kept, reduction hooks may not be attached successfully. - accum_grad_obj = get_grad_accumulate_object(param) - self._grad_store.add_accumulate_grad_object(accum_grad_obj) - - # if sequence_parallel is True, - # the grad of norm should be all-reduce across the tp process group - # if ( - # gpc.config.parallel.sequence_parallel is True - # and hasattr(param, IS_SEQUENCE_PARALLEL) - # and getattr(param, IS_SEQUENCE_PARALLEL) is True - # ): - # accum_grad_obj.register_hook(reduce_grad_hook_sp) - - if ( - gpc.config.parallel.weight.size > 1 - and hasattr(param, IS_REPLICA_ZERO_PARALLEL) - and getattr(param, IS_REPLICA_ZERO_PARALLEL) is True - ): - accum_grad_obj.register_hook(reduce_grad_hook_sp) - - # we should not only register for parameters which have _fstp_reduce_scatter_str attr. - # we must keep up with reduce_grad_hook. - if self._fstp_handler is not None: - accum_grad_obj.register_hook(accum_grad_hook) - - if self._overlap_sync_grad: - accum_grad_obj.register_hook(reduce_grad_hook) - - _define_and_attach(param, reduce_rank) - - def accumulate_left_grads_after_backward(self): - if self._fstp_handler is None: - return - - for group_id in range(self.num_param_groups): - self._accum_grads_store_in_bucket(self._accum_grad_buckets[group_id]) - - def belongs_to_current_rank(self, param) -> bool: - """ - Check whether a parameter is supposed to be updated by the process of the current rank - - :param tensor: A :class:`torch.Tensor` object - :type tensor: torch.Tensor - - :return: True if the parameter should be updated by the current rank. Otherwise false. - :rtype: bool - """ - tensor_ranks = self._param_store.get_param_rank(param) - group_id = getattr(param, "group_id") - return gpc.get_local_rank(self._broadcast_parallel_mode[group_id]) in tensor_ranks - - def _accum_grads_store_in_bucket(self, bucket: BucketStore, reduce_rank: Optional[int] = None) -> None: - for _param in bucket.get_param(reduce_rank): - if not hasattr(_param, "_fstp_reduce_scatter_str"): - continue - - # wait and accumulate gardient. - _key = getattr(_param, "_fstp_reduce_scatter_str") - _comm_handle, _grad = self._fstp_handler.reduce_scatter_handlers[_key] - _comm_handle.wait() - _param.grad.add_(_grad) - - # release cuda memory. - if self._fstp_handler.enable_memory_pool: - self._fstp_handler.release_reduce_scatter_memory(key=tuple(_grad.size()), index=_grad.index) - _grad = None - self._fstp_handler.reduce_scatter_handlers[_key] = None - - bucket.reset_by_rank(reduce_rank) - - def _wait_reduce_scatter_and_accumulate_grads(self, param, reduce_rank: Optional[int] = None): - param_size = param.numel() - - group_id = getattr(param, "group_id") - current_bucket = self._accum_grad_buckets[group_id] - - # check if the bucket is full - # if full, will reduce the grads already in the bucket - # after reduction, the bucket will be empty - if current_bucket.num_elements_in_bucket(reduce_rank) + param_size > self._reduce_bucket_size: - self._accum_grads_store_in_bucket(current_bucket, reduce_rank) - - # otherwise, add the parameter into bucket. 
- current_bucket.add_num_elements_in_bucket(param_size, reduce_rank) - current_bucket.add_param(param, reduce_rank) - - def _store_and_try_reduce_grads_by_bucket(self, param, reduce_rank=None): - param_size = param.numel() - - # check if the bucket is full - # if full, will reduce the grads already in the bucket - # after reduction, the bucket will be empty - group_id = getattr(param, "group_id") - current_bucket = self._bucket_store[group_id] - - if current_bucket.num_elements_in_bucket(reduce_rank) + param_size > self._reduce_bucket_size: - self._reduce_grads_stored_in_bucket(current_bucket, reduce_rank, last_bucket=False) - - # the param must not be reduced to ensure correctness - is_param_reduced = self._param_store.is_param_reduced(param) - if is_param_reduced: - msg = ( - f"Parameter of size ({param.size()}) has already been reduced, " - + "duplicate reduction will lead to arithmetic incorrectness" - ) - raise RuntimeError(msg) - - # the param must have grad for reduction - assert param.grad is not None, f"Parameter of size ({param.size()}) has None grad, cannot be reduced" - - current_bucket.add_num_elements_in_bucket(param_size, reduce_rank) - current_bucket.add_grad(param.grad, reduce_rank) - current_bucket.add_param(param, reduce_rank) - - def _reduce_grads_stored_in_bucket(self, current_bucket, reduce_rank=None, last_bucket=False): - # reduce grads - self._reduce_grads_by_rank( - reduce_rank=reduce_rank, - grads=current_bucket.get_grad(reduce_rank=reduce_rank), - bucket_size=current_bucket.num_elements_in_bucket(reduce_rank), - group_id=current_bucket.get_param_group_id(), - dp_parallel_mode=current_bucket.get_dp_parallel_mode(), - ) - - params_in_bucket = current_bucket.get_param(reduce_rank=reduce_rank) - - for param in params_in_bucket: - # the is_param_reduced flag should be False showing that - # this param is not reduced before calling self._reduce_grads_by_rank - is_param_reduced = self._param_store.is_param_reduced(param) - - if is_param_reduced: - msg = ( - f"Parameter of size ({param.size()}) has been reduced, " - + "duplicate reduction will lead to arithmetic incorrectness" - ) - raise RuntimeError(msg) - - # update the flag - self._param_store.set_param_reduction_state(param, True) - - if self.belongs_to_current_rank(param): - self._param_store.add_reduced_param_for_compute_norm(param, last_bucket) - else: - self._param_store.add_previous_reduced_param(param) - - current_bucket.reset_by_rank(reduce_rank) - - def _reduce_grads_by_rank(self, reduce_rank, grads, bucket_size, group_id, dp_parallel_mode): - grad_buckets_by_dtype = split_half_float_double(grads) - next_bucket_list = [] - # add parameters into bucket for reduction - for tensor_list in grad_buckets_by_dtype: - param_bucket = TensorBucket(size=bucket_size) - for tensor in tensor_list: - param_bucket.add_to_bucket(tensor, allow_oversize=True) - if not param_bucket.is_empty(): - self._reduce_and_copy( - bucket=param_bucket, reduce_rank=reduce_rank, group_id=group_id, dp_parallel_mode=dp_parallel_mode - ) - next_bucket_list.append(param_bucket) - - # wait for the completion of previouce bucket list reduction, and do unflatten_and_copy() - # here we can also overlap the communication with some memcpy operation caused by bucket.flatten() - for bucket in self._bucket_in_progress: - bucket.commu_handle.wait() - bucket.unflatten_and_copy() - bucket.empty() - self._bucket_in_progress = [] - self._param_store.clear_grads_of_previous_reduced_params() - - # after the completion of bucket list reduction, add new buckets 
into _bucket_in_progress - self._bucket_in_progress = next_bucket_list.copy() - - def _reduce_and_copy(self, bucket: TensorBucket, reduce_rank, group_id, dp_parallel_mode): - # flatten the tensors and do allreduce - bucket.flatten() - bucket.commu_handle = reduce_tensor( - tensor=bucket.get_flat_tensor(), - dtype=None, - dst_rank=reduce_rank, - parallel_mode=dp_parallel_mode, - ) - - # update the reduced tensor - if reduce_rank is None or reduce_rank == self._zero_local_rank[group_id]: - bucket.set_unflatten_and_copy_flag(flag=True) - - def _has_inf_or_nan(self, tensor): - try: - tensor_mean = float(tensor.mean()) - except RuntimeError as instance: - # We want to check if inst is actually an overflow exception. - # RuntimeError could come from a different error. - # If so, we still want the exception to propagate. - if "value cannot be converted" not in instance.args[0]: - raise - return True - else: - if tensor_mean == float("inf") or tensor_mean == -float("inf"): - return True - return False - - def _sync_grad(self): - # update param already reduced flag - reduction_states = self._param_store.get_param_reduction_states() - for tensor, _ in reduction_states.items(): - reduction_states[tensor] = False - self._param_store.reset_reduced_data_for_compute_norm() - - # accumulate gradient - avg_gradients = self._grad_store._averaged_gradients - for group_id in range(self.num_param_groups): - # the following operations are performed only on the rank to which parameters are assigned. - if self._zero_local_rank[group_id] not in self.param_group_no_params_ranks[group_id]: - param_group = self._param_store.get_fp16_params_by_rank_group(self._zero_local_rank[group_id], group_id) - - if group_id not in avg_gradients: - avg_gradients[group_id] = [] - - param_idx = 0 - for param in param_group: - if param.grad is not None: - if len(avg_gradients[group_id]) == param_idx: - avg_gradients[group_id].append(param.grad) - else: - avg_gradients[group_id][param_idx].add_(param.grad) - param_idx += 1 - - # the gradients needed are stored in the avg_gradients buffer - # thus, can clear this - self.zero_grad() - - def zero_grad(self, set_to_none=True): - """ - Set parameter gradients to zero. If set_to_none = True, gradient - will be set to None to save memory. - - :param set_to_none: Whether set the gradient to None. Default value is True. - :type set_to_none: bool - """ - for _, param_group in self._fp16_param_groups.items(): - for param in param_group: - if set_to_none: - param.grad = None - elif param.grad is not None: - param.grad.detach() - param.grad.zero_() - else: - pass - - def backward(self, loss, retain_graph=False): - loss = self.loss_scale * loss - loss.backward(retain_graph=retain_graph) - - # Gradients may not be fully synchronized here. 
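For reference, the bucket pipeline implemented by _reduce_grads_by_rank and _reduce_and_copy above boils down to three steps: flatten same-dtype grads into one contiguous buffer, launch a single asynchronous collective, and unflatten-copy once the handle completes. A minimal sketch of that pattern using the (semi-private) torch._utils flatten helpers, with illustrative names rather than the optimizer's actual classes:

import torch
import torch.distributed as dist
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

def reduce_bucket(grads, group=None):
    # one big contiguous buffer so the collective is a single call
    flat = _flatten_dense_tensors(grads)
    handle = dist.all_reduce(flat, op=dist.ReduceOp.SUM, group=group, async_op=True)
    return flat, handle

def finish_bucket(grads, flat, handle) -> None:
    handle.wait()  # communication done; safe to read `flat`
    for g, synced in zip(grads, _unflatten_dense_tensors(flat, grads)):
        g.copy_(synced)  # write the reduced values back into each grad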
-
-    def _compute_norm_with_stage(
-        self,
-        group_id: int = 0,
-        last_bucket: bool = False,
-        last_stage: bool = False,
-        previous_norm=None,
-    ):
-        # compute norm for gradients that have been reduced
-        params, grads = self._param_store.get_reduced_param_for_compute_norm(group_id=group_id, last_bucket=last_bucket)
-        if len(params) == 0:
-            dtype = self.param_groups[group_id]["dtype"]
-            grads = [self.padding_grad.to(dtype)]
-            params = [self.padding_tensor.to(dtype)]
-
-        norm = 0
-        if self._clip_grad_norm > 0:
-            # this norm is before scaling, it will be very large
-            norm = compute_norm(
-                gradients=grads,
-                parameters=params,
-                last_stage=last_stage,
-                previous_norm=previous_norm,
-                zero_mode=self._broadcast_parallel_mode[group_id],
-            )
-
-        return norm
-
-    def _compute_param_norm_stage(
-        self, group_id: int = 0, last_bucket: bool = False, last_stage: bool = False, previous_param_norms=None
-    ):
-        # compute norm for gradients that have been reduced
-        params, grads = self._param_store.get_reduced_param_for_compute_norm(group_id=group_id, last_bucket=last_bucket)
-
-        total_param_norms = {}
-        if len(params) == 0:
-            dtype = self.param_groups[group_id]["dtype"]
-            grads = [self.padding_grad.to(dtype)]
-            params = [self.padding_tensor.to(dtype)]
-
-        if self._clip_grad_norm > 0:
-            total_param_norms = compute_param_norm(
-                grads,
-                params,
-                last_stage=last_stage,
-                previous_param_norms=previous_param_norms,
-                zero_mode=self._broadcast_parallel_mode[group_id],
-                is_moe_group=self._is_moe_group(self.optim.param_groups[group_id]),
-            )
-        return total_param_norms
-
-    @llm_timeout(func_name="optim_step")
-    def step(self, closure=None):
-        """Performs a single optimization step.
-
-        Args:
-            closure (Callable, optional): A closure that reevaluates the model
-                and returns the loss.
-        Returns:
-            Union[bool, float]: Whether the gradients were successfully updated, and the gradient norms.
- """ - assert closure is None, "closure is not supported by step()" - - # import pdb - - # if gpc.get_global_rank() == 0: - # pdb.set_trace() - - # if not overlapping communication (no reduction hook is attached) - # we need to manually reduce these gradients - if not self._overlap_sync_grad: - for group_id in range(len(self._fp16_param_groups)): - for param in self._fp16_param_groups[group_id]: - # we should not reduce the param in moe - if param.grad is not None: - self._store_and_try_reduce_grads_by_bucket(param) - - # we need to reduce the gradients left in the communication bucket - for group_id in range(self.num_param_groups): - self._reduce_grads_stored_in_bucket(self._bucket_store[group_id], reduce_rank=None, last_bucket=True) - - # compute norm for gradients in the before bucket - groups_norms = [] - groups_param_norms = [] - for group_id in range(self.num_param_groups): - groups_norms.append(self._compute_norm_with_stage(group_id=group_id)) - if gpc.config.get("grad_norm_profiling", False): - groups_param_norms.append(self._compute_param_norm_stage(group_id=group_id)) - - # clear reduced grads - # grads in the last bucket is reduced - for bucket in self._bucket_in_progress: - bucket.commu_handle.wait() - bucket.unflatten_and_copy() - bucket.empty() - self._bucket_in_progress = [] - self._param_store.clear_grads_of_previous_reduced_params() - # compute norm for gradients in the last bucket - total_norms = {} - total_param_norms = {} - total_layer_norms = {} - for group_id in range(self.num_param_groups): - group_name = self.param_groups[group_id]["name"] if "name" in self.param_groups[group_id] else "default" - group_name = f"{group_id}_{group_name}" - total_norms[group_name] = self._compute_norm_with_stage( - group_id=group_id, - last_bucket=True, - last_stage=True, - previous_norm=groups_norms[group_id], - ) - if gpc.config.get("grad_norm_profiling", False): - param_norms = self._compute_param_norm_stage( - group_id=group_id, - last_bucket=True, - last_stage=True, - previous_param_norms=groups_param_norms[group_id], - ) - total_layer_norms[group_name], total_param_norms[group_name] = compute_layer_norm( - param_norms=param_norms, loss_scale=self.loss_scale.item() - ) - - # Need to allreduce(avg) the norms across different ranks because moe params will not be synced - # during allreduce - if self._is_moe_group(self.optim.param_groups[group_id]): - # model and zero have been reduced!!! 
-                pg = gpc.get_group(ParallelMode.EXPERT)
-                scaled_norm = total_norms[group_name] * 1.0 / float(gpc.get_world_size(ParallelMode.EXPERT))
-                scaled_norm_tensor = torch.tensor(scaled_norm, device=get_current_device(), dtype=torch.float)
-                dist.all_reduce(scaled_norm_tensor, group=pg)
-                total_norms[group_name] = scaled_norm_tensor.item()
-        timer("sync_grad").start()
-        self._sync_grad()
-        timer("sync_grad").stop()
-
-        state, global_norms = self._step(closure=closure, norms=total_norms)
-        if gpc.config.get("grad_norm_profiling", False):
-            global_norms["layer_norms"] = total_layer_norms
-            global_norms["param_norms"] = total_param_norms
-
-        return state, global_norms
-
-    def _step(self, closure=None, norms=None):
-        assert closure is None, "closure is not supported by step()"
-
-        # check for overflow
-        found_inf = False
-        found_nan = False
-        # if there are INF values in grads, the compute_norm func would also return -1
-        # thus, we try to avoid calling _check_overflow here
-        # found_inf = self._check_overflow()
-        # Because you may encounter inf when computing norm
-
-        if -1 in norms.values():
-            found_inf = True
-
-        if -2 in norms.values():
-            found_nan = True
-
-        loss_scale = float(self.loss_scale.item())  # backup
-        if gpc.config.model.dtype is not torch.float32:
-            self.grad_scaler.update(found_inf)
-
-        # update loss scale if overflow occurs
-        if found_inf:
-            if gpc.is_rank_for_log():
-                logger.warning("Overflow occurs, please check it.")
-                send_alert_message(
-                    address=gpc.config.monitor.alert.feishu_alert_address,
-                    message="Overflow occurs, please check it.",
-                )
-            self._grad_store._averaged_gradients = dict()
-            self.zero_grad()
-            return False, norms
-
-        if found_nan:
-            if gpc.is_rank_for_log():
-                logger.warning("Nan grad norm occurs, please check it.")
-                send_alert_message(
-                    address=gpc.config.monitor.alert.feishu_alert_address,
-                    message="Nan grad norm occurs, please check it.",
-                )
-            self._grad_store._averaged_gradients = dict()
-            self.zero_grad()
-            return False, norms
-        # copy the grad of fp16 param to fp32 param
-        single_grad_partition_groups = []
-        for group_id in range(self.num_param_groups):
-            # compute norm
-            # The following operations are performed only on the rank to which parameters are assigned.
-            if not self.param_group_has_params[group_id]:
-                continue
-
-            # create flat gradient for the flat fp32 params
-            gradients = self._grad_store.get_averaged_gradients_by_group(group_id)
-            with torch.no_grad():
-                flat_fp16_avg_grads = flatten(gradients)
-            self._grad_store.reset_average_gradients_by_group(group_id)
-            gradients = None  # release cuda memory
-
-            dtype = self._fp32_flat_param_groups_of_current_rank[group_id].dtype
-            flat_fp32_avg_grads = flat_fp16_avg_grads.to(dtype)
-            flat_fp16_avg_grads = None  # release cuda memory
-
-            param_shape = self._fp32_flat_param_groups_of_current_rank[group_id].shape
-            assert (
-                param_shape == flat_fp32_avg_grads.shape
-            ), f"fp32 param and grad have different shape {param_shape} vs {flat_fp32_avg_grads.shape}"
-
-            # Parameters shared within a TP group, such as norm and moe gate, have precision inconsistency in gradients.
-            # Therefore, it is recommended to synchronize gradients within the TP group to eliminate accumulated errors.
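The branch that follows applies exactly that synchronization, and its core is a single averaging collective over the tensor-parallel group. A hedged sketch of the idea (assumes dist.ReduceOp.AVG is available, i.e. a NCCL backend on a reasonably recent torch):

import torch.distributed as dist

def sync_replicated_grads(flat_grads, tp_group) -> None:
    # Norm/gate weights are replicated across TP ranks, so their gradients
    # should match; averaging removes accumulated floating-point drift.
    if dist.get_world_size(tp_group) > 1:
        dist.all_reduce(flat_grads, op=dist.ReduceOp.AVG, group=tp_group)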
-            is_tp_sync_groups = (
-                self._is_norm_group(self.optim.param_groups[group_id]),
-                self._is_gate_group(self.optim.param_groups[group_id]),
-            )
-            if any(is_tp_sync_groups):
-                dist.all_reduce(
-                    flat_fp32_avg_grads,
-                    op=dist.ReduceOp.AVG,
-                    group=gpc.get_group(ParallelMode.TENSOR),
-                )
-
-            single_grad_partition_groups.append(flat_fp32_avg_grads)
-            device = self._fp32_flat_param_groups_of_current_rank[group_id].device
-            self._fp32_flat_param_groups_of_current_rank[group_id].grad = flat_fp32_avg_grads.to(device)
-        # unscale and clip grads
-        # get the global norm
-        global_norm_groups = {}
-        if self._clip_grad_norm > 0:
-            for group_name, norm in norms.items():
-                global_norm_groups[group_name] = norm**0.5
-
-        # the following operations are performed only on the rank to which parameters are assigned.
-        if gpc.config.model.dtype is not torch.float32:
-            if len(single_grad_partition_groups) != 0 and self._clip_grad_norm > 0:
-                self._unscale_and_clip_grads(
-                    single_grad_partition_groups,
-                    list(global_norm_groups.values()),
-                    loss_scale,
-                )
-
-        # update the parameters
-        timer("step").start()
-
-        # For those ranks that are not assigned parameters, we just wait for other ranks
-        # to send their updated parameters.
-        if self.has_params:
-            self.optim.step()
-            # release the fp32 grad
-            release_param_grad(self._fp32_flat_param_groups_of_current_rank.values())
-            # update the fp16 partition updated by the current rank
-            for group_id in range(len(self._fp16_param_groups)):
-                if self.param_group_has_params[group_id]:
-                    fp16_param = self._param_store.get_flat_fp16_param_by_rank_group(
-                        rank=self._zero_local_rank[group_id], group_id=group_id
-                    )
-                    fp32_param = self._fp32_flat_param_groups_of_current_rank[group_id]
-                    fp16_param.data.copy_(fp32_param)
-        torch.cuda.synchronize()
-        with torch.cuda.stream(self._comm_bcast_stream):
-            self.broadcast_params()
-
-        timer("step").stop()
-
-        # updating gradients is not needed here, because the sync_params function was used during
-        # initialization, so synchronization is already maintained
-        for group_name, global_norm in global_norm_groups.items():
-            global_norm_groups[group_name] = global_norm / loss_scale
-        return True, global_norm_groups
-
-    def broadcast_params(self):
-        handles = []
-
-        for group_id in range(self.num_param_groups):
-            # if self.param_groups[group_id]["name"] == "embed_head":
-            #     continue
-            for rank in range(self._zero_world_size[group_id]):
-                # The following operations are performed only on the rank to which parameters are assigned.
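The loop that follows is the ZeRO post-step synchronization: each rank owns one flat fp16 partition and broadcasts it to the rest of its group, with all broadcasts launched asynchronously before a single wait pass. Reduced to its essentials (a sketch with illustrative names; dist.get_process_group_ranks needs torch >= 2.0):

import torch.distributed as dist

def broadcast_owned_partitions(flat_partitions, zero_group) -> None:
    # flat_partitions[i] is the contiguous fp16 shard updated by group-local rank i
    global_ranks = dist.get_process_group_ranks(zero_group)
    handles = [
        dist.broadcast(shard, src=global_ranks[i], group=zero_group, async_op=True)
        for i, shard in enumerate(flat_partitions)
    ]
    for handle in handles:  # overlap all broadcasts, then wait once
        handle.wait()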
- if rank in self.param_group_no_params_ranks[group_id]: - continue - fp16_param = self._param_store.get_flat_fp16_param_by_rank_group(rank=rank, group_id=group_id) - # grank = gpc.get_ranks_in_group(group_type)[rank] # need to convert to the global rank - # assert grank == rank, f"{grank} == {rank}" - g_rank = gpc.get_ranks_in_group(self._broadcast_parallel_mode[group_id])[rank] - handle = dist.broadcast( - fp16_param, - src=g_rank, - group=gpc.get_group(self._broadcast_parallel_mode[group_id]), - async_op=True, - ) - - if self._overlap_sync_param: - self._param_bcast_sync_handler.add_bcast_handle(rank, handle) - else: - handles.append(handle) - - for handle in handles: - handle.wait() - - torch.cuda.synchronize() - - ################## - # FP16 Utilities # - ################## - - def _check_overflow(self): - # clear previous overflow record - self._found_overflow.fill_(0.0) - - # check for overflow - for group_id in range(len(self._fp16_param_groups)): - # The following operations are performed only on the rank to which parameters are assigned. - if self._zero_local_rank[group_id] not in self.param_group_no_params_ranks[group_id]: - for avg_grad in self._grad_store.get_averaged_gradients_by_group(group_id): - if avg_grad is not None and has_inf_or_nan(avg_grad): - self._found_overflow.fill_(1.0) - break - dist.all_reduce( - self._found_overflow, - op=dist.ReduceOp.MAX, - group=gpc.get_group(ParallelMode.GLOBAL), - ) - - return self._found_overflow.item() > 0 - - def _unscale_and_clip_grads(self, grad_groups_flat, total_norm_groups, loss_scale): - # compute combined scale factor for this group - combined_scale_groups = [] - - if self._clip_grad_norm > 0.0: - # norm is in fact norm*scale - for group_id, total_norm in enumerate(total_norm_groups): - combined_scale_groups.append(loss_scale) - clip = ((total_norm / loss_scale) + 1e-6) / self._clip_grad_norm - if clip > 1.0: - combined_scale_groups[group_id] = clip * loss_scale - - for group_id, grad in enumerate(grad_groups_flat): - grad.data.mul_(1.0 / combined_scale_groups[group_id]) - - def clip_grad_norm(self, model, max_norm): - # will conduct in the step() - pass - - def state_dict(self): - states = {} - grad_scaler = self.grad_scaler.state_dict() - states["grad_scaler"] = grad_scaler - optim_states = self.optim.state_dict() - states["base_optim_states"] = optim_states - - flat_fp32_weights = {} - for group_id, param in self._fp32_flat_param_groups_of_current_rank.items(): - if self._zero_local_rank[group_id] not in self.param_group_no_params_ranks[group_id]: - assert param.grad is None - flat_fp32_weights[group_id] = param - states["flat_fp32_weights"] = flat_fp32_weights - states["zero_devide_optim_plan"] = self.params_per_rank_id_dict - - return states - - def load_state_dict(self, states): - # TODO: Need to take into account the change in the number of DP. - assert "grad_scaler" in states, "Not found grad_scaler state!" - grad_scaler = states["grad_scaler"] - self.grad_scaler.load_state_dict(grad_scaler) - optim_states = states["base_optim_states"] - self.optim.load_state_dict(optim_states) - - # load fp32 model weight. 
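The restore below mirrors the save path: copy the checkpointed fp32 flats into the live master weights, then refresh the fp16 training copies from them. Schematically (a sketch with illustrative names, not the optimizer's actual interface):

def restore_master_weights(saved_fp32: dict, live_fp32: dict, live_fp16: dict) -> None:
    for group_id, saved in saved_fp32.items():
        assert live_fp32[group_id].shape == saved.shape, "flat param shape changed between runs"
        live_fp32[group_id].data.copy_(saved.data)           # restore fp32 master weights
        live_fp16[group_id].data.copy_(live_fp32[group_id])  # re-derive the fp16 copy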
- flat_fp32_weights = states["flat_fp32_weights"] - assert set(flat_fp32_weights.keys()) == set(self._fp32_flat_param_groups_of_current_rank) - for group_id, param in flat_fp32_weights.items(): - if self._zero_local_rank[group_id] not in self.param_group_no_params_ranks[group_id]: - self_param = self._fp32_flat_param_groups_of_current_rank[group_id] - assert ( - self_param.shape == param.shape - ), f"The loaded parameter shape is inconsistent, {self_param.shape} != {param.shape}" - self_param.data.copy_(param.data) - - # Load the fp16 model weights. - for group_id in range(len(self._fp16_param_groups)): - if self._zero_local_rank[group_id] not in self.param_group_no_params_ranks[group_id]: - fp16_param = self._param_store.get_flat_fp16_param_by_rank_group( - rank=self._zero_local_rank[group_id], group_id=group_id - ) - fp32_param = self._fp32_flat_param_groups_of_current_rank[group_id] - fp16_param.data.copy_(fp32_param) - - if "zero_devide_optim_plan" in states: - self.params_per_rank_id_dict = states["zero_devide_optim_plan"] - - -def reload_zero_fp32_buff(optimizer): - # If we use AMP optimizer, we need to update its fp32 buffer as newly loaded weights value. - # Or we must ensure that loading model weights must be done before zero is initialized. - if isinstance(optimizer, HybridZeroOptimizer): - for group_id, param_group in enumerate(optimizer.optim.param_groups): - if optimizer.param_group_has_params[group_id]: - # flatten fp16 params have already been updated by 'load_model_checkpoint' - fp16_flat_current_rank = optimizer._param_store.get_flat_fp16_param_by_rank_group( - optimizer._zero_local_rank[group_id], group_id - ) - # param_group["params"] is fp32 flatten optimizer states of this zero rank. - param_group["params"][0].data.copy_(fp16_flat_current_rank.float()) diff --git a/internlm/solver/optimizer/utils.py b/internlm/solver/optimizer/utils.py index 7e760b85..42a9949f 100644 --- a/internlm/solver/optimizer/utils.py +++ b/internlm/solver/optimizer/utils.py @@ -14,18 +14,13 @@ from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc -from internlm.core.context.parallel_context import ( - IS_REPLICA_ZERO_PARALLEL, - IS_SEQUENCE_DATA_PARALLEL, - IS_WEIGHT_ZERO_PARALLEL, -) from internlm.core.naive_amp import NaiveAMPModel from internlm.utils.common import get_current_device, get_tensor_norm, move_norm_to_cuda from internlm.utils.logger import get_logger from internlm.utils.parallel import ( - is_model_parallel_parameter, is_replica_zero_parallel_parameter, - is_sequence_data_parallel_parameter, + is_tensor_data_parallel_parameter, + is_tensor_zero_parallel_parameter, is_weight_zero_parallel_parameter, ) @@ -219,7 +214,7 @@ def calc_lp(grads, norm_type): return norm -def reduce_grads(gradients, parameters, fine_grained=False): +def reduce_grads(gradients, parameters, weight_parallel_mode, fine_grained=False): parallel_grads = [] if fine_grained: parallel_grads = {} @@ -248,24 +243,19 @@ def append_grad(g, p): ): continue elif ( - gpc.is_initialized(ParallelMode.TENSOR) - and not is_model_parallel_parameter(p) - and gpc.get_local_rank(ParallelMode.TENSOR) == 0 - ): # if not used in each chunk, such as layernorm - append_grad(g, p) - elif ( - is_replica_zero_parallel_parameter(p) and gpc.get_local_rank(ParallelMode.WEIGHT) == 0 + is_replica_zero_parallel_parameter(p) and gpc.get_local_rank(weight_parallel_mode) == 0 ): # if not used in each chunk, such as layernorm IS_REPLICA_ZERO_PARALLEL parameter group append_grad(g, p) - elif 
is_sequence_data_parallel_parameter(p): - # process all ranks for IS_SEQUENCE_DATA_PARALLEL parameter group + elif is_tensor_data_parallel_parameter(p): + # process all ranks for IS_TENSOR_DATA_PARALLEL parameter group + append_grad(g, p) + elif is_tensor_zero_parallel_parameter(p): + # process all ranks for IS_TENSOR_ZERO_PARALLEL parameter group append_grad(g, p) elif is_weight_zero_parallel_parameter(p): # process all ranks for IS_WEIGHT_ZERO_PARALLEL parameter group append_grad(g, p) - elif is_model_parallel_parameter(p): - append_grad(g, p) - elif gpc.get_local_rank(ParallelMode.TENSOR) != 0: + elif gpc.get_local_rank(weight_parallel_mode) != 0: continue else: raise RuntimeError("Should not arrive here") @@ -286,6 +276,7 @@ def compute_norm( Total norm of the parameters, need total_norm**(1/norm) before using. """ + weight_parallel_mode = ParallelMode.WEIGHT if gpc.config.parallel.tensor.mode == "isp" else ParallelMode.TENSOR enable_cuda_kernels = gradients[0].device.type == "cuda" # Norm parameters. norm_type = float(norm_type) @@ -310,7 +301,7 @@ def compute_norm( ) total_norm = total_norm_cuda[0].item() else: - tensor_parallel_grads = reduce_grads(gradients, parameters) + tensor_parallel_grads = reduce_grads(gradients, parameters, weight_parallel_mode) if norm_type == 2.0 and enable_cuda_kernels: tensor_parallel_norm = calc_l2_norm(tensor_parallel_grads) ** norm_type @@ -331,17 +322,29 @@ def compute_norm( if previous_norm is not None: total_norm = total_norm + previous_norm - # Sum across all model-parallel GPUs. - if hasattr(parameters[0], IS_SEQUENCE_DATA_PARALLEL) and getattr(parameters[0], IS_SEQUENCE_DATA_PARALLEL): - dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.SEQUENCE)) + """ + Sum across all model-parallel GPUs. + 1. For the IS_REPLICA_ZERO_PARALLEL parameter group, gradients from rank 0 in the tp/wp process group and + gradients along the pp+zero dimensions from all ranks should be aggregated. + 2. For the IS_TENSOR_DATA_PARALLEL parameter group, gradients along the tp+pp+zero(dp) dimensions from all ranks should be aggregated. + 3. For the IS_TENSOR_ZERO_PARALLEL parameter group, gradients along the tp+pp+zero dimensions from all ranks should be aggregated. + 4. For the IS_WEIGHT_ZERO_PARALLEL parameter group, gradients along the wp+pp+zero dimensions from all ranks should be aggregated. + """ + if is_tensor_data_parallel_parameter(parameters[0]): + if gpc.is_using_parallel_mode(ParallelMode.TENSOR): + dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.TENSOR)) + elif is_tensor_zero_parallel_parameter(parameters[0]): + if gpc.is_using_parallel_mode(ParallelMode.TENSOR): + dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.TENSOR)) else: - if gpc.is_initialized(ParallelMode.WEIGHT): + if gpc.is_using_parallel_mode(weight_parallel_mode): dist.all_reduce( total_norm, op=dist.ReduceOp.SUM, - group=gpc.get_group(ParallelMode.WEIGHT), + group=gpc.get_group(weight_parallel_mode), ) - if gpc.is_initialized(ParallelMode.PIPELINE): + + if gpc.is_using_parallel_mode(ParallelMode.PIPELINE): dist.all_reduce( total_norm, op=dist.ReduceOp.SUM, @@ -349,11 +352,8 @@ def compute_norm( ) # This is because we use zero1, so we need to use this reduction. - # TODO: Check zero group to be a subset of dp group. 
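Taken together, the staged all-reduces above compute one global squared norm: a local sum of squares, followed by SUM reductions over each parallel dimension that partitions the gradients, with the square root applied by the caller. A compact sketch of that composition (the groups are placeholders for the tp/wp, pipeline, and zero groups):

import torch
import torch.distributed as dist

def aggregate_grad_sq_norm(shard_grads, groups) -> torch.Tensor:
    # local contribution of this rank's gradient shards (squared 2-norm)
    sq = sum(g.float().pow(2).sum() for g in shard_grads)
    # fold in every parallel dimension that splits the gradients
    for group in groups:
        if group is not None and dist.get_world_size(group) > 1:
            dist.all_reduce(sq, op=dist.ReduceOp.SUM, group=group)
    return sq  # caller takes sq ** 0.5 for the 2-norm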
- # if (hasattr(parameters[0], IS_REPLICA_ZERO_PARALLEL) and getattr(parameters[0], IS_REPLICA_ZERO_PARALLEL)) or ( - # hasattr(parameters[0], IS_WEIGHT_ZERO_PARALLEL) and getattr(parameters[0], IS_WEIGHT_ZERO_PARALLEL) - # ): - dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(zero_mode)) + if gpc.is_using_parallel_mode(zero_mode): + dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(zero_mode)) if torch.is_tensor(total_norm): total_norm = total_norm.item() diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 8d786489..587c0035 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -20,7 +20,6 @@ from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy from torch.utils.data import ConcatDataset, DataLoader -from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc from internlm.core.context.random import set_mode from internlm.core.naive_amp import NaiveAMPModel @@ -40,6 +39,9 @@ FeedForward, RewardModelLinear, ScaleColumnParallelLinear, + BaseScaleColumnParallelLinear, + ColumnParallelLinear, + RowParallelLinear, ) from internlm.model.multi_head_attention import MHA from internlm.model.overlap_handler import FSTPOverlapHandler @@ -48,7 +50,7 @@ from internlm.monitor.monitor import monitor_manager as mm from internlm.solver.beta2_scheduler import Beta2Scheduler from internlm.solver.lr_scheduler import FineTuneCosineAnnealingWarmupLR -from internlm.solver.optimizer import FSDPadaptOptimizer, HybridZeroOptimizer, HybridZeroOptimizer2 +from internlm.solver.optimizer import FSDPadaptOptimizer, HybridZeroOptimizer from internlm.solver.optimizer.utils import ParamBcastSyncHandler from internlm.train.utils import create_param_groups from internlm.utils.common import DummyProfile @@ -57,16 +59,72 @@ from internlm.utils.parallel import ( set_model_params_layer_name, sync_model_param, - sync_model_param_within_tp, sync_model_replica_param_group, ) from internlm.utils.registry import MODEL_INITIALIZER from internlm.utils.timeout import llm_timeout +from internlm.core.context import ( + IS_TENSOR_ZERO_PARALLEL, + IS_REPLICA_ZERO_PARALLEL, + IS_TENSOR_DATA_PARALLEL, + IS_WEIGHT_ZERO_PARALLEL, + ParallelMode, +) +from internlm.utils.parallel import ( + is_replica_zero_parallel_parameter, + is_tensor_data_parallel_parameter, + is_tensor_zero_parallel_parameter, + is_weight_zero_parallel_parameter, +) RMSNorm = try_import_RMSNorm() logger = get_logger(__file__) +def set_attr_for_param_groups(model: Union[nn.Module, nn.ModuleList]): + def _check_module(module): + # layer_norm + if isinstance(module, (RMSNorm, nn.LayerNorm)): + for param in module.parameters(): + setattr(param, IS_REPLICA_ZERO_PARALLEL, True) + + # embedding and head + if isinstance(module, (Embedding1D, ParallelGPT2Embeddings)) or isinstance( + module, BaseScaleColumnParallelLinear + ): + for param in module.parameters(): + if gpc.is_initialized(ParallelMode.TENSOR) and gpc.config.parallel.tensor.mode == "isp": + setattr(param, IS_TENSOR_DATA_PARALLEL, True) + elif gpc.is_initialized(ParallelMode.TENSOR) and gpc.config.parallel.tensor.mode != "isp": + setattr(param, IS_TENSOR_ZERO_PARALLEL, True) + + # for linear module + if isinstance(module, (ColumnParallelLinear, RowParallelLinear)): + for param in module.parameters(): + if gpc.is_initialized(ParallelMode.TENSOR) and gpc.config.parallel.tensor.mode != "isp": + setattr(param, IS_TENSOR_ZERO_PARALLEL, True) + elif 
gpc.is_initialized(ParallelMode.WEIGHT) and gpc.config.parallel.tensor.mode == "isp": + setattr(param, IS_WEIGHT_ZERO_PARALLEL, True) + + if not isinstance(model, nn.ModuleList): + model = [model] + + for _chunk in model: + if isinstance(_chunk, NaiveAMPModel): + _chunk = _chunk.model + + for name, module in _chunk.named_modules(): + _check_module(module) + + for name, param in _chunk.named_parameters(): + assert ( + is_replica_zero_parallel_parameter(param) + or is_tensor_data_parallel_parameter(param) + or is_tensor_zero_parallel_parameter(param) + or is_weight_zero_parallel_parameter(param) + ), f"parameter with name:{name} has no parallel attribution." + + @llm_timeout(func_name="initialize_model") def initialize_model(): """ @@ -98,6 +156,8 @@ def initialize_model(): sync_buffer=False, ) + set_attr_for_param_groups(model) + # This sync is very important, cause the model weights kept in optimizer are copied # from the origin parameters in the memory, so we should make sure the dp sync # does not influence the model weights in optimizer be different with the origin parameters. @@ -105,19 +165,18 @@ def initialize_model(): # This function is needed to make sure parameters that are not splitted by tensor parallelism are # the same across tensor parallelism. - sync_model_param_within_tp(model) - sync_model_replica_param_group(model) # Change random state mode to ParallelMode.DATA after model is built, guaranteeing the random # state in the same dp group are all the same. - set_mode(ParallelMode.WEIGHT_DATA) + random_mode = ParallelMode.WEIGHT_DATA if gpc.config.parallel.tensor["mode"] == "isp" else ParallelMode.DATA + set_mode(random_mode) # if fsdp enabled, wrap the model model = wrap_FSDP_model(model) gpc.fstp_handler = None - if gpc.config.parallel["weight"]["size"] >= 1 and gpc.config.parallel["weight"]["overlap"] is True: + if gpc.config.parallel["weight"]["size"] > 1 and gpc.config.parallel["weight"]["overlap"] is True: gpc.fstp_handler = FSTPOverlapHandler(model, gpc.get_group(ParallelMode.WEIGHT)) return model @@ -185,15 +244,7 @@ def initialize_optimizer(model: Union[nn.Module, nn.ModuleList]): eps=adam_cfg.adam_eps, ) - if gpc.config.parallel.weight.size > 1: - optimizer = HybridZeroOptimizer2( - naive_optimizer, - grad_scal_cfg=gpc.config.grad_scaler, - zero_cfg=gpc.config.hybrid_zero_optimizer, - param_bcast_sync_handler=param_bcast_sync_handler, - ) - logger.info("use HybridZeroOptimizer2 for new partition strategy...") - elif not gpc.config.parallel.zero1.fsdp: + if not gpc.config.parallel.zero1.fsdp: optimizer = HybridZeroOptimizer( naive_optimizer, grad_scal_cfg=gpc.config.grad_scaler, diff --git a/internlm/train/utils.py b/internlm/train/utils.py index 382c46d5..54e75ccb 100644 --- a/internlm/train/utils.py +++ b/internlm/train/utils.py @@ -2,9 +2,10 @@ import torch -from internlm.core.context.parallel_context import IS_REPLICA_ZERO_PARALLEL, IS_SEQUENCE_DATA_PARALLEL, ParallelMode +from internlm.core.context.parallel_context import ParallelMode from internlm.core.context.parallel_context import global_context as gpc from internlm.model.utils import is_gate_param, is_moe_param, is_norm_param +from internlm.utils.parallel import is_tensor_data_parallel_parameter def split_params_into_different_groups_for_optimizer(param_groups: Tuple[Dict]) -> Tuple[Dict]: @@ -111,7 +112,7 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy # print(f"ht debug params_groups before split total len:{len(param_groups[0]['params'])}", flush=True) - # create new 
groups for IS_SEQUENCE_DATA_PARALLEL parameter group + # create new groups for IS_TENSOR_DATA_PARALLEL parameter group new_groups = {} new_groups["embed_head"] = {"name": "embed_head", "params": [], "optimizer_mode": ParallelMode.DATA} # new_groups["layer_norm"] = {"name": "layer_norm", "params": [], "optimizer_mode": ParallelMode.ZERO1} @@ -126,7 +127,7 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy # assign param origin_params = [] for param in pgroup["params"]: - if hasattr(param, IS_SEQUENCE_DATA_PARALLEL) and getattr(param, IS_SEQUENCE_DATA_PARALLEL) is True: + if is_tensor_data_parallel_parameter(param): new_groups["embed_head"]["params"].append(param) # elif hasattr(param, IS_REPLICA_ZERO_PARALLEL) and getattr(param, IS_REPLICA_ZERO_PARALLEL) is True: # new_groups["layer_norm"]["params"].append(param) diff --git a/internlm/utils/evaluation.py b/internlm/utils/evaluation.py index c6e27a68..a7c790b3 100644 --- a/internlm/utils/evaluation.py +++ b/internlm/utils/evaluation.py @@ -11,7 +11,7 @@ @contextmanager def switch_evaluation_no_pipeline_scheduler(trainer, grad_accum_size, grad_accum_batch_size, metric_hook_list): - if not gpc.is_using_pp(): + if not gpc.is_using_parallel_mode(ParallelMode.PIPELINE): prev_data_process_func = trainer.schedule.data_process_func prev_grad_accum_size = trainer.schedule._grad_accum_size prev_grad_accum_batch_size = trainer.schedule._grad_accum_batch_size @@ -31,7 +31,7 @@ def switch_evaluation_no_pipeline_scheduler(trainer, grad_accum_size, grad_accum @contextmanager def switch_evaluation_pipeline_scheduler(trainer, num_microbatches, tensor_shape, metric_hook_list): - if gpc.is_using_pp(): + if gpc.is_using_parallel_mode(ParallelMode.PIPELINE): pre_data_process_func = trainer.schedule.data_process_func prev_num_microbatches = trainer.schedule.num_microbatches prev_tensor_shape = trainer.schedule.tensor_shape @@ -101,7 +101,7 @@ def evaluate_on_val_dls( ): moe_loss = None with torch.inference_mode(): - if gpc.is_using_pp(): + if gpc.is_using_parallel_mode(ParallelMode.PIPELINE): total_val_bsz = len(batch[1]) assert total_val_bsz % data_cfg.micro_bsz == 0 num_microbatches = total_val_bsz // data_cfg.micro_bsz diff --git a/internlm/utils/parallel.py b/internlm/utils/parallel.py index 966332a1..e354f3b2 100644 --- a/internlm/utils/parallel.py +++ b/internlm/utils/parallel.py @@ -5,9 +5,9 @@ from torch import nn from internlm.core.context import ( - IS_TENSOR_PARALLEL, IS_REPLICA_ZERO_PARALLEL, - IS_SEQUENCE_DATA_PARALLEL, + IS_TENSOR_DATA_PARALLEL, + IS_TENSOR_ZERO_PARALLEL, IS_WEIGHT_ZERO_PARALLEL, ParallelMode, ) @@ -15,25 +15,32 @@ from internlm.core.naive_amp import NaiveAMPModel -def is_model_parallel_parameter(p): - return hasattr(p, IS_TENSOR_PARALLEL) and getattr(p, IS_TENSOR_PARALLEL) - - def is_replica_zero_parallel_parameter(p): return hasattr(p, IS_REPLICA_ZERO_PARALLEL) and getattr(p, IS_REPLICA_ZERO_PARALLEL) -def is_sequence_data_parallel_parameter(p): +def is_tensor_data_parallel_parameter(p): + return ( + gpc.is_initialized(ParallelMode.TENSOR) + and gpc.config.parallel.tensor.mode == "isp" + and hasattr(p, IS_TENSOR_DATA_PARALLEL) + and getattr(p, IS_TENSOR_DATA_PARALLEL) + ) + + +def is_tensor_zero_parallel_parameter(p): return ( - gpc.is_initialized(ParallelMode.SEQUENCE) - and hasattr(p, IS_SEQUENCE_DATA_PARALLEL) - and getattr(p, IS_SEQUENCE_DATA_PARALLEL) + gpc.is_initialized(ParallelMode.TENSOR) + and gpc.config.parallel.tensor.mode != "isp" + and hasattr(p, IS_TENSOR_ZERO_PARALLEL) + and 
getattr(p, IS_TENSOR_ZERO_PARALLEL) ) def is_weight_zero_parallel_parameter(p): return ( gpc.is_initialized(ParallelMode.WEIGHT) + and gpc.config.parallel.tensor.mode == "isp" and hasattr(p, IS_WEIGHT_ZERO_PARALLEL) and getattr(p, IS_WEIGHT_ZERO_PARALLEL) ) @@ -45,43 +52,22 @@ def sync_model_param(model): Args: model (:class:`torch.nn.Module`): A pyTorch model on whose parameters you check the consistency. """ - if gpc.is_initialized(ParallelMode.WEIGHT_DATA) and gpc.get_world_size(ParallelMode.WEIGHT_DATA) > 1: - sync_moe_param = ( - gpc.is_initialized(ParallelMode.EXPERT_DATA) and gpc.get_world_size(ParallelMode.EXPERT_DATA) > 1 - ) - for param in model.parameters(): - if sync_moe_param and getattr(param, "is_expert", False): - ranks = gpc.get_ranks_in_group(ParallelMode.EXPERT_DATA) - dist.broadcast(param, src=ranks[0], group=gpc.get_group(ParallelMode.EXPERT_DATA)) - else: - ranks = gpc.get_ranks_in_group(ParallelMode.WEIGHT_DATA) - dist.broadcast(param, src=ranks[0], group=gpc.get_group(ParallelMode.WEIGHT_DATA)) - - -def sync_model_param_within_tp(model): - r"""This function is changed from colossalai, which is ``sync_model_param``. - - We modified this function to make sure it only sync parameters within tensor parallelism - but they are not splitted by tensor parallelism. - This function is used to make sure parameters that are not splitted by tensor parallelism - are the same across each tensor parallelism. - For example, parameters like RMSNorm, LayerNorm... - Args: - model (:class:`torch.nn.Module`): A pyTorch model on whose parameters you check the consistency. - """ - parallel_mode = ParallelMode.TENSOR - if gpc.is_initialized(parallel_mode) and gpc.get_world_size(parallel_mode) > 1: - for param in model.parameters(): - if not is_model_parallel_parameter(param): - ranks = gpc.get_ranks_in_group(parallel_mode) - dist.broadcast(param, src=ranks[0], group=gpc.get_group(parallel_mode)) + sync_moe_param = gpc.is_using_parallel_mode(ParallelMode.EXPERT_DATA) + sync_parallel_mode = ParallelMode.WEIGHT_DATA if gpc.config.parallel.tensor["mode"] == "isp" else ParallelMode.DATA + for param in model.parameters(): + if sync_moe_param and getattr(param, "is_expert", False): + ranks = gpc.get_ranks_in_group(ParallelMode.EXPERT_DATA) + dist.broadcast(param, src=ranks[0], group=gpc.get_group(ParallelMode.EXPERT_DATA)) + else: + ranks = gpc.get_ranks_in_group(sync_parallel_mode) + dist.broadcast(param, src=ranks[0], group=gpc.get_group(sync_parallel_mode)) def sync_model_replica_param_group(model): r"""This function is changed from colossalai, which is ``sync_model_param``. - We modified this function to make sure it only sync IS_REPLICA_ZERO_PARALLEL parameters in world size. + We modified this function to make sure it only sync IS_REPLICA_ZERO_PARALLEL parameters in tp or wp process group. This function is used to make sure parameters that are not splitted are the same across each rank. For example, parameters like RMSNorm, LayerNorm... @@ -89,10 +75,12 @@ def sync_model_replica_param_group(model): model (:class:`torch.nn.Module`): A pyTorch model on whose parameters you check the consistency. 
""" - for param in model.parameters(): - if is_replica_zero_parallel_parameter(param): - ranks = gpc.get_ranks_in_group(ParallelMode.GLOBAL) - dist.broadcast(param, src=ranks[0], group=gpc.get_group(ParallelMode.GLOBAL)) + parallel_mode = ParallelMode.WEIGHT if gpc.config.parallel.tensor["mode"] == "isp" else ParallelMode.TENSOR + if gpc.is_using_parallel_mode(parallel_mode): + for param in model.parameters(): + if is_replica_zero_parallel_parameter(param): + ranks = gpc.get_ranks_in_group(parallel_mode) + dist.broadcast(param, src=ranks[0], group=gpc.get_group(parallel_mode)) def get_parallel_log_file_name(): diff --git a/tests/test_training/test_loss.py b/tests/test_training/test_loss.py index a30cfba0..51f49836 100644 --- a/tests/test_training/test_loss.py +++ b/tests/test_training/test_loss.py @@ -68,7 +68,9 @@ def train( ), f"pipeline parallel size: {gpc.get_world_size(ParallelMode.PIPELINE)} is not as expected {pp_size}" if interleaved: assert ( - gpc.is_using_pp() and hasattr(gpc.config.model, "num_chunks") and gpc.config.model.num_chunks == num_chunks + gpc.is_using_parallel_mode(ParallelMode.PIPELINE) + and hasattr(gpc.config.model, "num_chunks") + and gpc.config.model.num_chunks == num_chunks ) assert gpc.config.parallel["pipeline"].get( "interleaved_overlap", False @@ -134,7 +136,7 @@ def train( SchedulerMetricHook( metric=metric, skip=( - gpc.is_using_pp() + gpc.is_using_parallel_mode(ParallelMode.PIPELINE) and hasattr(gpc.config.model, "num_chunks") and gpc.config.model.num_chunks > 1 and gpc.config.parallel["pipeline"].get("interleaved_overlap", False) diff --git a/train.py b/train.py index b64d3011..b76c100d 100644 --- a/train.py +++ b/train.py @@ -79,7 +79,7 @@ def get_scheduler_hooks(metric, zero_optim) -> List[SchedulerHook]: SchedulerMetricHook( metric=metric, skip=( - gpc.is_using_pp() + gpc.is_using_parallel_mode(ParallelMode.PIPELINE) and hasattr(gpc.config.model, "num_chunks") and gpc.config.model.num_chunks > 1 and gpc.config.parallel["pipeline"].get("interleaved_overlap", False) @@ -342,33 +342,6 @@ def main(args): initialize_distributed_env(config=args.config, launcher=args.launcher, master_port=args.port, seed=args.seed) assert hasattr(gpc, "config") and gpc.config is not None - print( - f"ht debug rank:{gpc.get_global_rank()} ranks_in_tp_group:{gpc.get_ranks_in_group(ParallelMode.TENSOR)}", - flush=True, - ) - print( - f"ht debug rank:{gpc.get_global_rank()} ranks_in_wp_group:{gpc.get_ranks_in_group(ParallelMode.WEIGHT)}", - flush=True, - ) - print( - f"ht debug rank:{gpc.get_global_rank()} ranks_in_dp_group:{gpc.get_ranks_in_group(ParallelMode.DATA)}", - flush=True, - ) - print( - f"ht debug rank:{gpc.get_global_rank()} ranks_in_pp_group:{gpc.get_ranks_in_group(ParallelMode.PIPELINE)}", - flush=True, - ) - # print( - # f"ht debug rank:{gpc.get_global_rank()} ranks_in_wdp_group:{gpc.get_ranks_in_group(ParallelMode.WEIGHT_DATA)}", - # flush=True, - # ) - print( - f"ht debug rank:{gpc.get_global_rank()} ranks_in_zero1_group:{gpc.get_ranks_in_group(ParallelMode.ZERO1)}", - flush=True, - ) - - assert False - # initialize monitor manager context with initialize_monitor_manager( job_name=gpc.config.JOB_NAME, alert_address=gpc.config.monitor.alert.feishu_alert_address From e0cafb07bc833991602e1ad8c926820341317a20 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Thu, 21 Dec 2023 15:38:22 +0800 Subject: [PATCH 088/153] fix(overlap_handler.py): fix hook error and param group split --- internlm/core/context/parallel_context.py | 9 +++------ 
internlm/model/overlap_handler.py | 2 +- internlm/train/utils.py | 3 ++- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py index 53416761..7e357234 100644 --- a/internlm/core/context/parallel_context.py +++ b/internlm/core/context/parallel_context.py @@ -275,15 +275,12 @@ def is_first_rank(self, parallel_mode: ParallelMode): def is_rank_for_log(self): """Returns a boolean value indicating whether the current device should print log.""" - # is_log_rank = ( - # self.is_first_rank(ParallelMode.DATA) - # and self.is_first_rank(ParallelMode.TENSOR) - # and self.is_last_rank(ParallelMode.PIPELINE) - # ) is_log_rank = ( - self.is_first_rank(ParallelMode.WEIGHT) + self.is_first_rank(ParallelMode.TENSOR) + and self.is_first_rank(ParallelMode.WEIGHT) and self.is_first_rank(ParallelMode.DATA) and self.is_first_rank(ParallelMode.WEIGHT_DATA) + and self.is_last_rank(ParallelMode.PIPELINE) ) return is_log_rank diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index c81b09d0..65473a6b 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -287,7 +287,7 @@ def _pre_forward_hook_for_module(module: nn.Module, inputs: Any): # pylint: dis def _pre_forward_hook_for_block(block: nn.Module, inputs: Any): # pylint: disable=W0613 fstp_modules = self.index_to_fstp_modules[self.num_blocks - 1] - if module in fstp_modules: + for module in fstp_modules: self._all_gather_module_weight(module) _wait_handle(module) diff --git a/internlm/train/utils.py b/internlm/train/utils.py index 54e75ccb..7ef0cb81 100644 --- a/internlm/train/utils.py +++ b/internlm/train/utils.py @@ -139,7 +139,8 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy pgroup["optimizer_mode"] = ParallelMode.ZERO1 # param groups may contain empty groups, such as fp32 - param_groups.extend(new_groups.values()) + if len(new_groups["embed_head"]["params"]) > 0: + param_groups.extend(new_groups.values()) # print(f"ht debug params_groups after split default len:{len(param_groups[0]['params'])}", flush=True) # print(f"ht debug params_groups after split embed_head len:{len(param_groups[1]['params'])}", flush=True) From 7974a32632e1309d2dfafb644f6de781c93f2851 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Fri, 22 Dec 2023 19:13:16 +0800 Subject: [PATCH 089/153] fix(overlap_handler.py): fix clear weight error when activation ckpt is True --- internlm/model/overlap_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index 65473a6b..a5649eae 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -293,7 +293,7 @@ def _pre_forward_hook_for_block(block: nn.Module, inputs: Any): # pylint: disab def _post_forward_hook_for_module(module: nn.Module, inputs: Any, output: Any): # pylint: disable=W0613 _clear_handle(module) - if not self.model_checkpoint: + if not (self.model_checkpoint and self.is_forward is False): _clear_weight(module) def _post_backward_hook_for_head(module: nn.Module, grad_input, grad_output): # pylint: disable=W0613 From 3361350348801318cac32b7c81fdabe0347792a2 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 25 Dec 2023 12:02:23 +0800 Subject: [PATCH 090/153] fix(parallel_context.py): fix seed mode when TENSOR parallel --- internlm/core/context/parallel_context.py | 6 ++++-- 1 
file changed, 4 insertions(+), 2 deletions(-) diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py index 7e357234..6e7efaae 100644 --- a/internlm/core/context/parallel_context.py +++ b/internlm/core/context/parallel_context.py @@ -599,8 +599,10 @@ def set_seed(self, seed: int, dpseed_with_tpoffset: bool = False): # device of the same data parallel group. The underlying reason is that the device of tp_rank = 0 will perform # additional random operations during the RowParallelLinear module building process. # set_mode(ParallelMode.DUMMY) - set_mode(ParallelMode.TENSOR) - set_mode(ParallelMode.WEIGHT) + if self.is_using_parallel_mode(ParallelMode.TENSOR): + set_mode(ParallelMode.TENSOR) + if self.is_using_parallel_mode(ParallelMode.WEIGHT): + set_mode(ParallelMode.WEIGHT) seeds = get_seeds() seed_str = ", ".join([f"{k}: {v}" for k, v in seeds.items()]) From 9b22258a9a0eecbec334caf16326c8c556285cb9 Mon Sep 17 00:00:00 2001 From: "chenxun.p" <759046501@qq.com> Date: Tue, 26 Dec 2023 10:21:29 +0800 Subject: [PATCH 091/153] feat(*) refactor fstp handler --- internlm/core/communication/isp.py | 501 ++++++++++++++++++ internlm/model/linear.py | 32 +- internlm/model/overlap_handler.py | 391 -------------- internlm/model/utils.py | 453 ++++++---------- .../solver/optimizer/hybrid_zero_optim.py | 26 +- internlm/train/training_internlm.py | 32 +- train.py | 19 +- 7 files changed, 744 insertions(+), 710 deletions(-) create mode 100644 internlm/core/communication/isp.py delete mode 100644 internlm/model/overlap_handler.py diff --git a/internlm/core/communication/isp.py b/internlm/core/communication/isp.py new file mode 100644 index 00000000..24e8201b --- /dev/null +++ b/internlm/core/communication/isp.py @@ -0,0 +1,501 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- + +from typing import Dict, List, Union +from functools import partial +from dataclasses import dataclass + +import torch +from torch import nn +from torch import distributed as dist + +from internlm.core.context import ParallelMode +from internlm.core.context import global_context as gpc +from internlm.core.naive_amp import NaiveAMPModel +from internlm.core.scheduler import SchedulerHook +from internlm.model.embedding import Embedding1D +from internlm.model.linear import ISPLinear, ScaleColumnParallelLinear +from internlm.model.utils import all_gather_raw, reduce_scatter_raw + + +@dataclass +class ISPCommModelConfig: + """ + model config for isp communicator. + """ + hidden_size: int = 0 + mlp_ratio: float = 0 + dtype: torch.dtype = torch.half + device: torch.device = torch.device("cuda") + modules: List[str] = None + + +class MemoryPool: + """ + memory pool for isp communication. + """ + + def __init__( + self, + model_conf: ISPCommModelConfig, + with_bias: bool = False, + ) -> None: + self._hidden_size = model_conf.hidden_size + self._mlp_ratio = model_conf.mlp_ratio + self._dtype = model_conf.dtype + self._device = model_conf.device + self._module_shapes = self._init_module_shape(model_conf.modules) + + # due to intern sequence parallel communication overlap, we need + # **two** memory pools for current block weights and the next block weights. + self.__all_gather_pool_len = 2 + # memory pool for weight all gather communications. 
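The two-entry pool below is classic double buffering: buffers are indexed by block_index % 2, so prefetching block i+1's weights can never overwrite block i's weights while they are still in use. The idea in isolation (a sketch, not the pool's real interface):

import torch

class DoubleBuffer:
    """Two pre-allocated buffers alternated by block index (illustrative sketch)."""

    def __init__(self, shape, dtype=torch.half, device="cuda"):
        self._buffers = [torch.empty(shape, dtype=dtype, device=device) for _ in range(2)]

    def get(self, block_index: int) -> torch.Tensor:
        # blocks i and i+1 always map to different buffers
        return self._buffers[block_index % 2]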
+        self._all_gather_weight_memory_pool = [
+            {
+                name: torch.zeros(shape, dtype=self._dtype, device=self._device).contiguous()
+                for name, shape in self._module_shapes.items()
+            }
+            for _ in range(self.__all_gather_pool_len)
+        ]
+        # memory pool for bias all gather communications.
+        if not with_bias:
+            self._all_gather_bias_memory_pool = None
+        else:
+            self._all_gather_bias_memory_pool = [
+                {
+                    name: torch.zeros(shape[0], dtype=self._dtype, device=self._device).contiguous()
+                    for name, shape in self._module_shapes.items()
+                }
+                for _ in range(self.__all_gather_pool_len)
+            ]
+
+        # memory pool for reduce scatter communications, allocated lazily.
+        self._reduce_scatter_memory_pool = {}
+        # memory pool for constant zero tensors, allocated lazily.
+        self._zero_const_pool = {}
+
+    def _init_module_shape(self, modules: List[str]) -> Dict[str, torch.Size]:
+        mlp_hidden_size = 256 * ((int(self._hidden_size * self._mlp_ratio) + 256 - 1) // 256)
+
+        # TODO: the memory pool should be more generic.
+        # Currently, it only supports llama-class models with specific naming structure.
+        static_shapes = {
+            "Wqkv": torch.Size((3 * self._hidden_size, self._hidden_size)),
+            "out_proj": torch.Size((self._hidden_size, self._hidden_size)),
+            "w1": torch.Size((mlp_hidden_size, self._hidden_size)),
+            "w2": torch.Size((mlp_hidden_size, self._hidden_size)),
+            "w3": torch.Size((self._hidden_size, mlp_hidden_size)),
+        }
+
+        return {name: static_shapes[name] for name in modules}
+
+    def allocate_constant_zero(self, size: tuple) -> torch.Tensor:
+        if size not in self._zero_const_pool:
+            self._zero_const_pool[size] = torch.zeros(*size, dtype=self._dtype, device=self._device).contiguous()
+
+        return self._zero_const_pool[size]
+
+    def allocate_all_gather_memory(self, block_index: int, module_name: str, is_bias: bool = False) -> torch.Tensor:
+        # TODO: should we trace the usage of each memory block to avoid reusing the
+        # same memory block, which may hide some potential bugs.
+        if not is_bias:
+            mem = self._all_gather_weight_memory_pool[block_index % 2][module_name]
+        else:
+            enable_bias = self._all_gather_bias_memory_pool is not None
+            assert enable_bias, "memory pool for bias is disabled."
+
+            mem = self._all_gather_bias_memory_pool[block_index % 2][module_name]
+
+        return mem
+
+    def allocate_reduce_scatter_memory(self, key: tuple) -> torch.Tensor:
+        # if key not in dict
+        if key not in self._reduce_scatter_memory_pool:
+            self._reduce_scatter_memory_pool[key] = []
+
+        for index, mem_item in enumerate(self._reduce_scatter_memory_pool[key]):
+            if mem_item.idle is True:
+                self._reduce_scatter_memory_pool[key][index].idle = False
+                return self._reduce_scatter_memory_pool[key][index]
+
+        # if the memory pool is all used
+        new_item = torch.zeros(
+            key,
+            dtype=self._dtype,
+            device=self._device,
+        ).contiguous()
+        setattr(new_item, "idle", False)
+        setattr(new_item, "index", len(self._reduce_scatter_memory_pool[key]))
+        self._reduce_scatter_memory_pool[key].append(new_item)
+
+        return new_item
+
+    def free_reduce_scatter_memory(self, key, index):
+        self._reduce_scatter_memory_pool[key][index].idle = True
+
+    def reset_lazy_pools(self) -> None:
+        # Should the memory pool re-allocate all-gather memory for every iteration?
+        # Currently, it just clears the memory pool for reduce scatter communication.
+        self._zero_const_pool = {}
+        self._reduce_scatter_memory_pool = {}
+
+
+class ISPCommunicator:
+    """
+    ISP Communicator for managing the all-gather and reduce_scatter of Intern Sequence Parallel.
+ """ + + def __init__( + self, + model: Union[nn.Module, nn.ModuleList], + model_conf: ISPCommModelConfig, + overlap: bool = False, + activation_checkpointing: bool = False, + enable_memory_pool: bool = False, + process_group: dist.ProcessGroup = None, + ) -> None: + self.process_group = process_group + self.model_checkpoint = activation_checkpointing + self.overlap = overlap + self.enable_memory_pool = overlap and enable_memory_pool + self.model_conf = model_conf + self.is_forward = True + + self._isp_outs = [] + self._isp_modules = [] + self._module_name = model_conf.modules.copy() + + # key: isp module; value: module global all-gather op handle + self._weight_global_handle = {} + # key: isp module; value: module bias global all-gather op handle + self._bias_global_handle = {} + self.reduce_scatter_handlers = {} + # key: isp module; value: module global weight after all-gather op + self._weight_global_output = {} + # key: isp module; value: module bias global weight after all-gather op + self._bias_global_output = {} + # key: isp module; value: transformer block index + self._module_to_index = {} + # key: transformer block index; value: isp modules + self._index_to_isp_module = {} + self._last_block = None + self._head = [] + self._embedding = [] + + # just want to share same for loop for ModuleList and Module + model = model if isinstance(model, nn.ModuleList) else [model] + for chunk in model: + if isinstance(chunk, NaiveAMPModel): + chunk = chunk.model + self._parse_model_structure(chunk) + + self.num_blocks = len(self._index_to_isp_module) + + if self.enable_memory_pool: + self.memory_pool = MemoryPool(model_conf) + else: + self.memory_pool = None + + if self.overlap: + self._register_sync_parameters_hook() + + def _parse_model_structure(self, model: nn.Module) -> None: + # Important: only works for llama-class models + for chunk_name, children in model.named_children(): + if isinstance(children, ScaleColumnParallelLinear): + setattr(children, "isp_name", "head") + self._head.append(children) + elif isinstance(children, Embedding1D): + self._embedding.append(children) + elif isinstance(children, nn.ModuleList): + self._last_block = children[-1] + + for idx, block in enumerate(children): + self._index_to_isp_module[idx] = [] + for sub_name, sub in block.named_children(): + for name, child in sub.named_children(): + if name == "out_proj": + self._isp_outs.append(child) + self._module_to_index[child] = idx + if isinstance(child, ISPLinear): + self._module_to_index[child] = idx + self._isp_modules.append(child) + self._index_to_isp_module[idx].append(child) + + setattr(child, "isp_name", name) + + full_name = f"{chunk_name}.{idx}.{sub_name}.{name}" + setattr( + child.weight, + "isp_reduce_scatter_name", + f"{full_name}.weight", + ) + if child.bias is not None: + setattr( + child.bias, + "isp_reduce_scatter_name", + f"{full_name}.bias", + ) + + def _all_gather_module_weight(self, module): + with_bias = module.bias is not None + block_index = self._module_to_index[module] + + # prepare memory pool allocator for weight and bias. 
+ if self.enable_memory_pool: + weight_memory_pool_allocator = partial( + self.memory_pool.allocate_all_gather_memory, + block_index, + module.isp_name, + ) + else: + weight_memory_pool_allocator = None + + if self.enable_memory_pool and with_bias: + bias_memory_pool_allocator = partial( + self.memory_pool.allocate_all_gather_memory, + block_index, + module.isp_name, + is_bias=True, + ) + else: + bias_memory_pool_allocator = None + + # submit the all-gather communication for weight and bias. + if with_bias: + bias_output, bias_handle = all_gather_raw( + module.bias, + self.process_group, + async_op=True, + memory_pool_allocator=bias_memory_pool_allocator, + ) + self._bias_global_handle[module] = bias_handle + self._bias_global_output[module] = bias_output + + weight_output, weight_handle = all_gather_raw( + module.weight, + self.process_group, + async_op=True, + memory_pool_allocator=weight_memory_pool_allocator, + ) + self._weight_global_handle[module] = weight_handle + self._weight_global_output[module] = weight_output + + def _all_gather_block_weight(self, block_index: int): + for module in self._index_to_isp_module[block_index]: + self._all_gather_module_weight(module) + + def _wait_handle(self, module): + handle = self._weight_global_handle[module] + handle.wait() + if module.bias is not None: + bias_handle = self._bias_global_handle[module] + bias_handle.wait() + + def _clear_handle(self, module): + if module in self._weight_global_handle: + del self._weight_global_handle[module] + if module in self._bias_global_handle: + del self._bias_global_handle[module] + + def _clear_weight(self, module): + if module in self._weight_global_output: + del self._weight_global_output[module] + if module in self._bias_global_output: + del self._bias_global_output[module] + + def _post_forward_hook_for_embedding(self, *args): # pylint: disable=W0613 + """ + prefetch weight for block 0 after embedding forward. 
+ """ + self._all_gather_block_weight(0) + + def _pre_forward_hook_for_out_proj(self, module: nn.Module, *args): # pylint: disable=W0613 + block_index = self._module_to_index[module] + + if self.model_checkpoint and self.is_forward is False: + if block_index - 1 >= 0: + self._all_gather_block_weight(block_index - 1) + else: + # start the all-gather for next block + if block_index + 1 < self.num_blocks: + self._all_gather_block_weight(block_index + 1) + + def _pre_forward_hook_for_module(self, module: nn.Module, *args): # pylint: disable=W0613 + if module not in self._weight_global_handle: + self._all_gather_module_weight(module) + + self._wait_handle(module) + + def _pre_forward_hook_for_block(self, *args): # pylint: disable=W0613 + for module in self._index_to_isp_module[self.num_blocks - 1]: + self._all_gather_module_weight(module) + self._wait_handle(module) + + def _post_forward_hook_for_module(self, module: nn.Module, *args): # pylint: disable=W0613 + self._clear_handle(module) + if not self.model_checkpoint: + self._clear_weight(module) + + def _post_backward_hook_for_head(self, *args): # pylint: disable=W0613 + self._all_gather_module_weight(self._isp_modules[-1]) + + def _pre_backward_hook_for_head(self, *args): # pylint: disable=W0613 + if self.is_forward is False: + self._all_gather_block_weight(self.num_blocks - 1) + + def _pre_backward_hook_for_module(self, module: nn.Module, *args): # pylint: disable=W0613 + # wait handle for current module + if module not in self._weight_global_handle: + self._all_gather_module_weight(module) + + self._wait_handle(module) + + # start the all-gather for next module + module_index = self._isp_modules.index(module) + if module_index - 1 >= 0: + next_module = self._isp_modules[module_index - 1] + self._all_gather_module_weight(next_module) + + def _post_backward_hook_for_module(self, module, *args): # pylint: disable=W0613 + self._clear_handle(module) + self._clear_weight(module) + + def _register_sync_parameters_hook(self) -> None: + """ + register forward hooks and backward hooks for isp modules. + """ + # register forward hooks + # 1. register post_forward_hook @embedding module to prefetch for block 0 + # 2. register pre_forward_hook @out_proj module to prefetch for next block, + # notice that next block's all_gather op should be after current block's all_to_all op + # 3. register pre_forward_hook @isp_module to wait handle for current module + # 4. register post_forward_hook @isp_module to release resource + for embedding in self._embedding: + embedding.register_forward_hook(self._post_forward_hook_for_embedding) + + if self.model_checkpoint: + if gpc.is_last_rank(parallel_mode=ParallelMode.PIPELINE): + for head in self._head: + head.register_full_backward_pre_hook(self._pre_backward_hook_for_head) + else: + self._last_block.register_forward_pre_hook(self._pre_forward_hook_for_block) + + for out_proj in self._isp_outs: + out_proj.register_forward_pre_hook(self._pre_forward_hook_for_out_proj) + + for module in self._isp_modules: + module.register_forward_pre_hook(self._pre_forward_hook_for_module) + module.register_forward_hook(self._post_forward_hook_for_module) + + # register backward hooks + # 1. register post_backward_hook @head module to prefetch for the last block's last module + # 2. register pre_backward_hook @isp_module to wait handle for current module and to prefetch for next module + # 3. 
register post_backward_hook @isp_module to release resource + if not self.model_checkpoint: + for head in self._head: + head.register_full_backward_hook(self._post_backward_hook_for_head) + + for module in self._isp_modules: + module.register_full_backward_pre_hook(self._pre_backward_hook_for_module) + + for module in self._isp_modules: + module.register_full_backward_hook(self._post_backward_hook_for_module) + + def _get_constant_zero(self, size: tuple) -> torch.Tensor: + if self.enable_memory_pool: + return self.memory_pool.allocate_constant_zero(size) + else: + return torch.zeros( + *size, + dtype=self.model_conf.dtype, + device=self.model_conf.device, + ).contiguous() + + # communication operation interfaces + + def all_gather(self, tensor: torch.Tensor, module: nn.Module, is_bias: bool = False): + if dist.get_world_size(self.process_group) <= 1: + return tensor + + if not self.overlap: + result, _ = all_gather_raw(tensor, self.process_group, async_op=False) + elif is_bias: + result = self._bias_global_output[module] + else: + result = self._weight_global_output[module] + + return result + + def reduce_scatter( + self, + tensor: torch.Tensor, + model: nn.Module, + op: dist.ReduceOp, + is_bias: bool = False, + ): + if dist.get_world_size(self.process_group) <= 1: + return tensor + + if not self.overlap: + result, handle = reduce_scatter_raw(tensor, self.process_group, op=op, async_op=True) + else: + if is_bias: + assert hasattr(model.bias, "isp_reduce_scatter_name") + key = getattr(model.bias, "isp_reduce_scatter_name") + else: + assert hasattr(model.weight, "isp_reduce_scatter_name") + key = getattr(model.weight, "isp_reduce_scatter_name") + + self.reduce_scatter_handlers[key] = reduce_scatter_raw( + tensor, + self.process_group, + op=op, + async_op=True, + memory_pool_allocator=self.memory_pool.allocate_reduce_scatter_memory, + ) + + result, handle = ( + self._get_constant_zero( + ( + tensor.shape[0] // dist.get_world_size(self.process_group), + *tensor.shape[1:], + ) + ), + None, + ) + + return result, handle + + +class ISPCommunicatorSchedulerHook(SchedulerHook): + """ + SchedulerHook for isp overlap handler + """ + + def __init__(self, overlap_handler: ISPCommunicator, zero_optim) -> None: + self._isp_communicator = overlap_handler + self._zero_optim = zero_optim + + def before_forward(self, scheduler, inputs) -> None: + if self._isp_communicator.model_checkpoint: + self._isp_communicator.is_forward = True + + def after_forward(self, scheduler, outputs) -> None: + pass + + def before_criterion(self, scheduler, outputs, label) -> None: + pass + + def after_criterion(self, scheduler, loss) -> None: + pass + + def before_backward(self, scheduler, outputs, outputs_grad) -> None: + if self._isp_communicator.model_checkpoint: + self._isp_communicator.is_forward = False + + def after_backward(self, scheduler, inputs_grad) -> None: + self._zero_optim.accumulate_left_grads_after_backward() + + def post_helper_func(self, scheduler, outputs, label) -> None: + pass diff --git a/internlm/model/linear.py b/internlm/model/linear.py index fc5175d9..d475214f 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -12,7 +12,7 @@ from internlm.core.context import global_context as gpc from internlm.model.utils import ( Silu, - fstp_fused_dense_func, + isp_fused_dense_func, fused_dense_func_torch, megatron_fused_dense_func_torch, ) @@ -350,21 +350,29 @@ def __init__( ) -class FSTPLinear(ColumnParallelLinear): +class ISPLinear(ColumnParallelLinear): + # class level communicator 
variable.
+    __communicator = None
+
+    @staticmethod
+    def register_communicator(communicator):
+        ISPLinear.__communicator = communicator
+
     def forward(self, x):
-        return fstp_fused_dense_func(
+        assert self.__communicator is not None, "ISPLinear should be registered with a communicator first."
+
+        return isp_fused_dense_func(
             x,
             self.weight,
             self.bias,
-            process_group=self.process_group,
             module=self,
-            handler=gpc.fstp_handler,
+            communicator=self.__communicator,
         )
 
 
-class FSTPFeedForward(BaseFeedForward):
+class ISPFeedForward(BaseFeedForward):
     """
-    FeedForward in FSTP.
+    FeedForward in ISP.
 
     Args:
         in_features (int): size of each input sample
@@ -398,8 +406,8 @@ def __init__(
             device,
             dtype,
             multiple_of,
-            FSTPLinear,
-            FSTPLinear,
+            ISPLinear,
+            ISPLinear,
         )
 
 
@@ -409,7 +417,7 @@ def get_mlp_cls(tp_mode: str):
     elif tp_mode == "msp":
         mlp_cls = MegatronFeedForward
     else:
-        mlp_cls = FSTPFeedForward
+        mlp_cls = ISPFeedForward
     return mlp_cls
 
 
@@ -420,12 +428,12 @@ def get_linear_cls(tp_mode: str, parallel_mode: str):
         elif tp_mode == "msp":
             cls = MegatronColumnParallelLinearTorch
         else:
-            cls = FSTPLinear
+            cls = ISPLinear
     elif parallel_mode == "row":
         if tp_mode in ["mtp", "fsp"]:
             cls = RowParallelLinearTorch
         elif tp_mode == "msp":
             cls = MegatronRowParallelLinearTorch
         else:
-            cls = FSTPLinear
+            cls = ISPLinear
     return cls
diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py
deleted file mode 100644
index c81b09d0..00000000
--- a/internlm/model/overlap_handler.py
+++ /dev/null
@@ -1,391 +0,0 @@
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-
-from typing import Any, Union
-
-import torch
-from torch import nn
-
-from internlm.core.context import ParallelMode
-from internlm.core.context import global_context as gpc
-from internlm.core.naive_amp import NaiveAMPModel
-from internlm.core.scheduler import SchedulerHook
-from internlm.model.embedding import Embedding1D
-from internlm.model.linear import FSTPLinear, ScaleColumnParallelLinear
-from internlm.model.utils import (
-    all_gather_raw,
-    all_gather_raw_bias_memory_pool,
-    all_gather_raw_memory_pool,
-)
-from internlm.utils.common import get_current_device
-
-
-class FSTPOverlapHandler:
-    """
-    FSTP overlap handler for managing the all-gather and reduce_scatter overlapping.
- """ - - def __init__(self, model: Union[nn.Module, nn.ModuleList], process_group) -> None: - self.process_group = process_group - self.fstp_outs = [] - self.fstp_modules = [] - self.module_name = ["Wqkv", "out_proj", "w1", "w2", "w3"] - self.weight_global_handle = dict() # key: fstp module; value: module global all-gather op handle - self.bias_global_handle = dict() # key: fstp module; value: module bias global all-gather op handle - self.weight_global_output = dict() # key: fstp module; value: module global weight after all-gather op - self.bias_global_output = dict() # key: fstp module; value: module bias global weight after all-gather op - self.module_to_index = dict() # key: fstp module; value: transformer block index - self.index_to_fstp_modules = dict() # key: transformer block index; value: fsdp modules - self.last_block = None - self.head = [] - self.embedding = [] - self.model_checkpoint = gpc.config.model.checkpoint - self.enable_memory_pool = gpc.config.parallel["weight"].get("memory_pool", False) - self.is_forward = True - - self.reduce_scatter_handlers = {} - self.zero_const_pool = {} - - # just want to share same for loop for ModuleList and Module - if not isinstance(model, nn.ModuleList): - model = [model] - - for _chunk in model: - if isinstance(_chunk, NaiveAMPModel): - _chunk = _chunk.model - - for _chunk_name, children in _chunk.named_children(): - if isinstance(children, ScaleColumnParallelLinear): - setattr(children, "_fstp_name", "head") - self.head.append(children) - elif isinstance(children, Embedding1D): - self.embedding.append(children) - elif isinstance(children, nn.ModuleList): - self.last_block = children[len(children) - 1] - for idx, block in enumerate(children): - self.index_to_fstp_modules[idx] = [] - for _sub_name, sub in block.named_children(): - for name, child in sub.named_children(): - if name == "out_proj": - self.fstp_outs.append(child) - self.module_to_index[child] = idx - if isinstance(child, FSTPLinear): - self.module_to_index[child] = idx - self.fstp_modules.append(child) - self.index_to_fstp_modules[idx].append(child) - - setattr(child, "_fstp_name", name) - - _full_name = f"{_chunk_name}.{idx}.{_sub_name}.{name}" - setattr(child.weight, "_fstp_reduce_scatter_str", f"{_full_name}.weight") - if child.bias is not None: - setattr(child.bias, "_fstp_reduce_scatter_str", f"{_full_name}.bias") - - self.num_blocks = len(self.index_to_fstp_modules) - - if self.enable_memory_pool: - self._initialize_memory_pool() - self._register_sync_parameters_hook() - - def get_zero_by_shape(self, size: tuple, dtype, device) -> torch.Tensor: - if self.enable_memory_pool: - if size not in self.zero_const_pool: - self.zero_const_pool[size] = torch.zeros(*size, dtype=dtype, device=device).contiguous() - - return self.zero_const_pool[size] - else: - return torch.zeros(*size, dtype=dtype, device=device).contiguous() - - def set_forward_mode(self, flag): - self.is_forward = flag - - def _initialize_module_shape(self): - hidden_size = gpc.config.HIDDEN_SIZE - mlp_ratio = gpc.config.MLP_RATIO - mlp_hidden_size = int(hidden_size * mlp_ratio) - mlp_hidden_size = 256 * ((mlp_hidden_size + 256 - 1) // 256) - - self.module_shape["Wqkv"] = (3 * hidden_size, hidden_size) - self.module_shape["out_proj"] = (hidden_size, hidden_size) - self.module_shape["w1"] = (mlp_hidden_size, hidden_size) - self.module_shape["w2"] = (mlp_hidden_size, hidden_size) - self.module_shape["w3"] = (hidden_size, mlp_hidden_size) - - def _initialize_memory_pool(self) -> None: - # allocate memory pool - 
self.all_gather_memory_pool = [] - self.all_gather_bias_memory_pool = [] - self.reduce_scatter_memory_pool = {} - self.module_shape = {} - - self._initialize_module_shape() - dtype = gpc.config.model.get("dtype", torch.half) - device = get_current_device() - - for _ in range(2): - weight = {} - for name in self.module_name: - weight[name] = torch.zeros(self.module_shape[name], dtype=dtype, device=device).contiguous() - self.all_gather_memory_pool.append(weight) # containing two groups of block weight - - def clear_memory_pool(self) -> None: - assert self.enable_memory_pool - - self.zero_const_pool = {} - self.reduce_scatter_memory_pool = {} - - def _get_weight_from_memory_pool(self, module): - assert self.enable_memory_pool - - block_index = self.module_to_index[module] - return self.all_gather_memory_pool[block_index % 2][module._fstp_name] - - def _get_bias_from_memory_pool(self, module: nn.Module): - assert self.enable_memory_pool - - block_index = self.module_to_index[module] - # if the bias memory pool is empty or module has been not allocated memory - if len(self.all_gather_bias_memory_pool) == 0: - for _ in range(2): - weight = {} - weight[module._fstp_name] = torch.zeros( - self.module_shape[module._fstp_name][0], - dtype=gpc.config.model.get("dtype", torch.half), - device=get_current_device(), - ).contiguous() - self.all_gather_bias_memory_pool.append(weight) - elif module._fstp_name not in self.all_gather_bias_memory_pool[0]: - for i in range(2): - self.all_gather_bias_memory_pool[i][module._fstp_name] = torch.zeros( - self.module_shape[module._fstp_name][0], - dtype=gpc.config.model.get("dtype", torch.half), - device=get_current_device(), - ).contiguous() - - return self.all_gather_bias_memory_pool[block_index % 2][module._fstp_name] - - def get_weight_all_gather(self, module): - if self.enable_memory_pool: - return self._get_weight_from_memory_pool(module) - else: - return self.weight_global_output[module] - - def get_bias_all_gather(self, module): - if self.enable_memory_pool: - return self._get_bias_from_memory_pool(module) - else: - return self.bias_global_output[module] - - def get_reduce_scatter_memory(self, key): - assert self.enable_memory_pool - - # if key not in dict - if key not in self.reduce_scatter_memory_pool: - self.reduce_scatter_memory_pool[key] = [] - - for index, mem_item in enumerate(self.reduce_scatter_memory_pool[key]): - if mem_item.idle is True: - self.reduce_scatter_memory_pool[key][index].idle = False - return self.reduce_scatter_memory_pool[key][index] - - # if the memory pool is all used - cur_len = len(self.reduce_scatter_memory_pool[key]) - self.reduce_scatter_memory_pool[key].append( - torch.zeros(key, dtype=gpc.config.model.get("dtype", torch.half), device=get_current_device()).contiguous() - ) - setattr(self.reduce_scatter_memory_pool[key][cur_len], "idle", False) - setattr(self.reduce_scatter_memory_pool[key][cur_len], "index", cur_len) - return self.reduce_scatter_memory_pool[key][cur_len] - - def release_reduce_scatter_memory(self, key, index): - assert self.enable_memory_pool - self.reduce_scatter_memory_pool[key][index].idle = True - - def _all_gather_module_weight(self, module): - if self.enable_memory_pool: - if module.bias is not None: - bias_handle = all_gather_raw_bias_memory_pool( - module.bias, - self.process_group, - async_op=True, - module=module, - ) - self.bias_global_handle[module] = bias_handle - - weight_handle = all_gather_raw_memory_pool( - module.weight, - self.process_group, - async_op=True, - module=module, - ) - 
self.weight_global_handle[module] = weight_handle - else: - if module.bias is not None: - bias_output, bias_handle = all_gather_raw( - module.bias, - self.process_group, - async_op=True, - ) - self.bias_global_handle[module] = bias_handle - self.bias_global_output[module] = bias_output - - weight_output, weight_handle = all_gather_raw( - module.weight, - self.process_group, - async_op=True, - ) - self.weight_global_handle[module] = weight_handle - self.weight_global_output[module] = weight_output - - def _all_gather_block_weight(self, block_index: int): - fstp_modules = self.index_to_fstp_modules[block_index] - for module in fstp_modules: - self._all_gather_module_weight(module) - - def _register_sync_parameters_hook(self) -> None: - """ - register forward hooks and backward hooks for fstp modules. - """ - - def _wait_handle(module): - handle = self.weight_global_handle[module] - handle.wait() - if module.bias is not None: - bias_handle = self.bias_global_handle[module] - bias_handle.wait() - - def _clear_handle(module): - if module in self.weight_global_handle: - del self.weight_global_handle[module] - if module in self.bias_global_handle: - del self.bias_global_handle[module] - - def _clear_weight(module): - if module in self.weight_global_output: - del self.weight_global_output[module] - if module in self.bias_global_output: - del self.bias_global_output[module] - - def _post_forward_hook_for_embedding(module: nn.Module, inputs: Any, output: Any): # pylint: disable=W0613 - self._all_gather_block_weight(0) - - def _pre_forward_hook_for_out_proj(module: nn.Module, inputs: Any): # pylint: disable=W0613 - block_index = self.module_to_index[module] - if self.model_checkpoint and self.is_forward is False: - if block_index - 1 >= 0: - self._all_gather_block_weight(block_index - 1) - else: - # start the all-gather for next block - if block_index + 1 < self.num_blocks: - self._all_gather_block_weight(block_index + 1) - - def _pre_forward_hook_for_module(module: nn.Module, inputs: Any): # pylint: disable=W0613 - if module not in self.weight_global_handle: - self._all_gather_module_weight(module) - - _wait_handle(module) - - def _pre_forward_hook_for_block(block: nn.Module, inputs: Any): # pylint: disable=W0613 - fstp_modules = self.index_to_fstp_modules[self.num_blocks - 1] - if module in fstp_modules: - self._all_gather_module_weight(module) - _wait_handle(module) - - def _post_forward_hook_for_module(module: nn.Module, inputs: Any, output: Any): # pylint: disable=W0613 - _clear_handle(module) - if not self.model_checkpoint: - _clear_weight(module) - - def _post_backward_hook_for_head(module: nn.Module, grad_input, grad_output): # pylint: disable=W0613 - self._all_gather_module_weight(self.fstp_modules[-1]) - - def _pre_backward_hook_for_head(module: nn.Module, grad_output): - if self.is_forward is False: - self._all_gather_block_weight(self.num_blocks - 1) - - def _pre_backward_hook_for_module(module: nn.Module, grad_output): # pylint: disable=W0613 - # wait handle for current module - if module not in self.weight_global_handle: - self._all_gather_module_weight(module) - - _wait_handle(module) - - # start the all-gather for next module - module_index = self.fstp_modules.index(module) - if module_index - 1 >= 0: - next_module = self.fstp_modules[module_index - 1] - self._all_gather_module_weight(next_module) - - def _post_backward_hook_for_module(module, grad_input, grad_output): # pylint: disable=W0613 - _clear_handle(module) - _clear_weight(module) - - # register forward hooks - # 1. 
register post_forward_hook @embedding module to prefetch for block 0 - # 2. register pre_forward_hook @out_proj module to prefetch for next block, - # notice that next block's all_gather op should be after current block's all_to_all op - # 3. register pre_forward_hook @fstp_module to wait handle for current module - # 4. register post_forward_hook @fstp_module to release resource - for embedding in self.embedding: - embedding.register_forward_hook(_post_forward_hook_for_embedding) - - if self.model_checkpoint: - if gpc.is_last_rank(parallel_mode=ParallelMode.PIPELINE): - for head in self.head: - head.register_full_backward_pre_hook(_pre_backward_hook_for_head) - else: - self.last_block.register_forward_pre_hook(_pre_forward_hook_for_block) - - for out_proj in self.fstp_outs: - out_proj.register_forward_pre_hook(_pre_forward_hook_for_out_proj) - - for module in self.fstp_modules: - module.register_forward_pre_hook(_pre_forward_hook_for_module) - module.register_forward_hook(_post_forward_hook_for_module) - - # register backward hooks - # 1. register post_backward_hook @head module to prefetch for the last block's last module - # 2. register pre_backward_hook @fstp_module to wait handle for current module and to prefetch for next module - # 3. register post_backward_hook @fstp_module to release resource - if not self.model_checkpoint: - for head in self.head: - head.register_full_backward_hook(_post_backward_hook_for_head) - - for module in self.fstp_modules: - module.register_full_backward_pre_hook(_pre_backward_hook_for_module) - - for module in self.fstp_modules: - module.register_full_backward_hook(_post_backward_hook_for_module) - - -class FSTPOverlapSchedulerHook(SchedulerHook): - """ - SchedulerHook for fstp overlap handler - """ - - def __init__(self, overlap_handler: FSTPOverlapHandler, zero_optim) -> None: - self._overlap_handler = overlap_handler - self._zero_optim = zero_optim - - def before_forward(self, scheduler, inputs) -> None: - if self._overlap_handler.model_checkpoint: - self._overlap_handler.set_forward_mode(True) - - def after_forward(self, scheduler, outputs) -> None: - pass - - def before_criterion(self, scheduler, outputs, label) -> None: - pass - - def after_criterion(self, scheduler, loss) -> None: - pass - - def before_backward(self, scheduler, outputs, outputs_grad) -> None: - if self._overlap_handler.model_checkpoint: - self._overlap_handler.set_forward_mode(False) - - def after_backward(self, scheduler, inputs_grad) -> None: - self._zero_optim.accumulate_left_grads_after_backward() - - def post_helper_func(self, scheduler, outputs, label) -> None: - pass diff --git a/internlm/model/utils.py b/internlm/model/utils.py index a4fe3378..60e2cd99 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -1,17 +1,16 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- -from typing import Optional +from typing import Callable, Optional import fused_dense_lib as fused_dense_cuda import torch import torch.nn.functional as F from flash_attn.utils.distributed import all_reduce_raw -from torch import Tensor, nn +from torch import Tensor from torch.cuda.amp import custom_bwd, custom_fwd from torch.distributed import ProcessGroup -from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc from internlm.utils.logger import get_logger @@ -114,93 +113,77 @@ def split_forward_gather_backward(input_, parallel_mode, dim): return _SplitForwardGatherBackward.apply(input_, parallel_mode, dim) -def all_gather_raw(input_: Tensor, 
process_group: ProcessGroup, async_op: bool = False, gather_dim: int = 0): - world_size = torch.distributed.get_world_size(process_group) - shape = list(input_.shape) - shape[gather_dim] = shape[gather_dim] * world_size - output = torch.empty(shape, dtype=input_.dtype, device=input_.device) - handle = torch.distributed.all_gather_into_tensor( - output, input_.contiguous(), group=process_group, async_op=async_op - ) - return output, handle - - -def all_gather_raw_memory_pool( +def all_gather_raw( input_: Tensor, process_group: ProcessGroup, async_op: bool = False, - module: nn.Module = None, + gather_dim: int = 0, + memory_pool_allocator: Callable = None, ): + if memory_pool_allocator is not None: + output = memory_pool_allocator() + else: + world_size = torch.distributed.get_world_size(process_group) + shape = list(input_.shape) + shape[gather_dim] = shape[gather_dim] * world_size + output = torch.empty(shape, dtype=input_.dtype, device=input_.device) + handle = torch.distributed.all_gather_into_tensor( - gpc.fstp_handler.get_weight_all_gather(module=module), - input_.contiguous(), - group=process_group, - async_op=async_op, + output, input_.contiguous(), group=process_group, async_op=async_op ) - return handle + return output, handle -def all_gather_raw_bias_memory_pool( +def reduce_scatter_raw( input_: Tensor, process_group: ProcessGroup, + op=torch.distributed.ReduceOp.SUM, async_op: bool = False, - module: nn.Module = None, -): - handle = torch.distributed.all_gather_into_tensor( - gpc.fstp_handler.get_bias_all_gather(module=module), - input_.contiguous(), - group=process_group, - async_op=async_op, - ) - return handle - - -def linear_bias_wgrad_torch(my_input, grad_output, has_d_bias): - assert my_input.dtype == grad_output.dtype - grad_weight = torch.matmul(grad_output.t(), my_input) - grad_bias = grad_output.sum(dim=0) if has_d_bias else None - return grad_weight, grad_bias - - -def reduce_scatter_raw( - input_: Tensor, process_group: ProcessGroup, op=torch.distributed.ReduceOp.SUM, async_op: bool = False + memory_pool_allocator: Callable = None, ): world_size = torch.distributed.get_world_size(process_group) assert input_.shape[0] % world_size == 0 - output = torch.empty( - input_.shape[0] // world_size, *input_.shape[1:], dtype=input_.dtype, device=input_.device - ).contiguous() - handle = torch.distributed.reduce_scatter_tensor( - output, input_.contiguous(), op=op, group=process_group, async_op=async_op - ) - return output, handle - -def reduce_scatter_raw_memory_pool( - input_: Tensor, process_group: ProcessGroup, op=torch.distributed.ReduceOp.SUM, async_op: bool = False -): - world_size = torch.distributed.get_world_size(process_group) - assert input_.shape[0] % world_size == 0 - if gpc.fstp_handler.enable_memory_pool: + if memory_pool_allocator is not None: size = (input_.shape[0] // world_size, *input_.shape[1:]) - output = gpc.fstp_handler.get_reduce_scatter_memory(size) + output = memory_pool_allocator(size) else: output = torch.empty( - input_.shape[0] // world_size, *input_.shape[1:], dtype=input_.dtype, device=input_.device + input_.shape[0] // world_size, + *input_.shape[1:], + dtype=input_.dtype, + device=input_.device, ).contiguous() + handle = torch.distributed.reduce_scatter_tensor( output, input_.contiguous(), op=op, group=process_group, async_op=async_op ) return output, handle +def linear_bias_wgrad_torch(my_input, grad_output, has_d_bias): + assert my_input.dtype == grad_output.dtype + grad_weight = torch.matmul(grad_output.t(), my_input) + grad_bias = 
grad_output.sum(dim=0) if has_d_bias else None + return grad_weight, grad_bias + + # adpated from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/ops/fused_dense.py class FusedDenseFunc(torch.autograd.Function): "FusedDenseFunc for tensor parallel in flash-attn implementation." @staticmethod @custom_fwd - def forward(ctx, x, weight, bias, return_residual=False, process_group=None, sequence_parallel=True, gather_dim=0): + def forward( + ctx, + x, + weight, + bias, + return_residual=False, + process_group=None, + sequence_parallel=True, + gather_dim=0, + ): """ If process_group is not None and sequence_parallel=True, we're doing Tensor Parallel with sequence parallelism: we do an all_gather_raw of x before doing the matmul. @@ -265,7 +248,11 @@ def backward(ctx, grad_output, *args): if not ctx.return_residual: grad_input = F.linear(grad_output, weight.t()) else: - grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), grad_output, weight) + grad_input = torch.addmm( + grad_input.reshape(batch_dim, grad_input.shape[-1]), + grad_output, + weight, + ) grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) if process_group is not None: reduce_fn = reduce_scatter_raw if sequence_parallel else all_reduce_raw @@ -277,7 +264,9 @@ def backward(ctx, grad_output, *args): if process_group is not None and sequence_parallel: handle_x.wait() grad_weight, grad_bias = fused_dense_cuda.linear_bias_wgrad( - total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2] + total_x.reshape(batch_dim, total_x.shape[-1]), + grad_output, + ctx.needs_input_grad[2], ) else: grad_weight = None @@ -296,7 +285,16 @@ class MegatronFusedDenseFunc(torch.autograd.Function): @staticmethod @custom_fwd - def forward(ctx, x, weight, bias, return_residual=False, process_group=None, sequence_parallel=True, gather_dim=0): + def forward( + ctx, + x, + weight, + bias, + return_residual=False, + process_group=None, + sequence_parallel=True, + gather_dim=0, + ): """ If process_group is not None and sequence_parallel=True, we're doing Tensor Parallel with sequence parallelism: we do an all_gather_raw of x before doing the matmul. 
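
The sequence-parallel contract that FusedDenseFunc and MegatronFusedDenseFunc share can be summarized without the autograd plumbing: gather the sequence-sharded input before the matmul, and reduce-scatter the input gradient back to shards in the backward pass. The sketch below is illustrative only, the function names are ours and not part of the patch, and it assumes the all_gather_raw/reduce_scatter_raw helpers defined above plus torch.nn.functional as F.

# Illustrative sketch (not part of this patch) of the sequence-parallel
# dataflow implemented by the fused dense autograd functions.
def sp_linear_forward(x_shard, weight, bias, process_group):
    # gather the sequence-sharded activations into the full tensor
    total_x, handle = all_gather_raw(x_shard, process_group, async_op=True)
    handle.wait()  # the real functions overlap this wait with other work
    return F.linear(total_x, weight, bias)


def sp_linear_input_grad(grad_output, weight, process_group):
    # d(loss)/d(total_x) = grad_output @ weight, then scatter-reduce the
    # result back to per-rank sequence shards
    grad_total_x = grad_output.matmul(weight)
    grad_x_shard, handle = reduce_scatter_raw(grad_total_x, process_group, async_op=True)
    handle.wait()
    return grad_x_shard
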
@@ -355,7 +353,11 @@ def backward(ctx, grad_output, *args): if not ctx.return_residual: grad_input = F.linear(grad_output, weight.t()) else: - grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), grad_output, weight) + grad_input = torch.addmm( + grad_input.reshape(batch_dim, grad_input.shape[-1]), + grad_output, + weight, + ) grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) if process_group is not None: reduce_fn = reduce_scatter_raw if sequence_parallel else all_reduce_raw @@ -365,7 +367,9 @@ def backward(ctx, grad_output, *args): if ctx.needs_input_grad[1]: assert ctx.compute_weight_gradient grad_weight, grad_bias = fused_dense_cuda.linear_bias_wgrad( - total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2] + total_x.reshape(batch_dim, total_x.shape[-1]), + grad_output, + ctx.needs_input_grad[2], ) else: grad_weight = None @@ -405,7 +409,11 @@ def backward(ctx, grad_output, *args): if not ctx.return_residual: grad_input = F.linear(grad_output, weight.t()) else: - grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), grad_output, weight) + grad_input = torch.addmm( + grad_input.reshape(batch_dim, grad_input.shape[-1]), + grad_output, + weight, + ) grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) if process_group is not None: reduce_fn = reduce_scatter_raw if sequence_parallel else all_reduce_raw @@ -418,7 +426,9 @@ def backward(ctx, grad_output, *args): handle_x.wait() # we remove the cuda independence, which is different from flash_attn. grad_weight, grad_bias = linear_bias_wgrad_torch( - total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2] + total_x.reshape(batch_dim, total_x.shape[-1]), + grad_output, + ctx.needs_input_grad[2], ) else: grad_weight = None @@ -452,7 +462,11 @@ def backward(ctx, grad_output, *args): if not ctx.return_residual: grad_input = F.linear(grad_output, weight.t()) else: - grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), grad_output, weight) + grad_input = torch.addmm( + grad_input.reshape(batch_dim, grad_input.shape[-1]), + grad_output, + weight, + ) grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) if process_group is not None: reduce_fn = reduce_scatter_raw if sequence_parallel else all_reduce_raw @@ -463,7 +477,9 @@ def backward(ctx, grad_output, *args): assert ctx.compute_weight_gradient # we remove the cuda independence, which is different from flash_attn. grad_weight, grad_bias = linear_bias_wgrad_torch( - total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2] + total_x.reshape(batch_dim, total_x.shape[-1]), + grad_output, + ctx.needs_input_grad[2], ) else: grad_weight = None @@ -473,8 +489,8 @@ def backward(ctx, grad_output, *args): return grad_input, grad_weight, grad_bias, None, None, None, None -class FSTPFusedDenseFunc(torch.autograd.Function): - "FusedDenseFunc for FSTP, which is optimized based on flash implementation." +class ISPFusedDenseFunc(torch.autograd.Function): + "FusedDenseFunc for ISP, which is optimized based on flash implementation." 
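
ISPFusedDenseFunc no longer touches process groups directly; all communication goes through the communicator's all_gather/reduce_scatter interface. A minimal stand-in satisfying that contract might look as follows (hypothetical, for illustration only; it assumes torch.distributed as dist and the raw helpers from internlm/model/utils.py):

# Minimal stand-in communicator (illustration, not part of the patch) showing
# the interface ISPFusedDenseFunc depends on: all_gather returns the full
# tensor, reduce_scatter returns (sharded_result, async_handle_or_None).
class NaiveISPCommunicator:
    def __init__(self, process_group):
        self.process_group = process_group

    def all_gather(self, tensor, module, is_bias=False):
        # blocking gather of the sharded weight or bias
        if dist.get_world_size(self.process_group) <= 1:
            return tensor
        output, _ = all_gather_raw(tensor, self.process_group, async_op=False)
        return output

    def reduce_scatter(self, tensor, module, op, is_bias=False):
        # asynchronous reduce-scatter; callers wait on the returned handle
        if dist.get_world_size(self.process_group) <= 1:
            return tensor, None
        return reduce_scatter_raw(tensor, self.process_group, op=op, async_op=True)
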
    @staticmethod
    @custom_fwd
@@ -483,247 +499,87 @@ def forward(
         x,
         weight,
         bias,
+        module,
         return_residual=False,
-        process_group=None,
-        module=None,
-        overlap_handler=None,
+        communicator=None,
+        use_flash_attn: bool = True,
     ):
         ctx.compute_weight_gradient = weight.requires_grad
         ctx.return_residual = return_residual
-        ctx.process_group = process_group
-        ctx.overlap_handler = overlap_handler
         ctx.module = module
+        ctx.communicator = communicator
+        ctx.use_flash_attn = use_flash_attn
 
         if torch.is_autocast_enabled():
             x = x.to(dtype=torch.get_autocast_gpu_dtype())
-        total_x = x.contiguous()
+        x = x.contiguous()
 
-        world_size = gpc.get_world_size(ParallelMode.WEIGHT)
-        if world_size > 1:
-            # do all_gather for weight and bias before actual computation
-            if overlap_handler is not None:
-                total_weight = gpc.fstp_handler.get_weight_all_gather(module=module)
-            else:
-                total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True)
-                handle_weight.wait()
-
-            if bias is not None:
-                if overlap_handler is not None:
-                    total_bias = gpc.fstp_handler.get_bias_all_gather(module=module)
-                else:
-                    total_bias, handle_bias = all_gather_raw(bias, process_group, async_op=True)
-                    handle_bias.wait()
-            else:
-                total_bias = bias
-        else:
-            total_weight = weight
-            total_bias = bias
+        total_weight = communicator.all_gather(weight, module)
+        total_bias = bias if bias is None else communicator.all_gather(bias, module, is_bias=True)
 
         if torch.is_autocast_enabled():
             total_weight = total_weight.to(dtype=torch.get_autocast_gpu_dtype())
-            total_bias = total_bias.to(dtype=torch.get_autocast_gpu_dtype()) if bias is not None else None
+            if total_bias is not None:
+                total_bias = total_bias.to(dtype=torch.get_autocast_gpu_dtype())
 
         total_weight = total_weight.contiguous()
-        batch_shape, n = total_x.shape[:-1], total_x.shape[-1]
+        batch_shape, n = x.shape[:-1], x.shape[-1]
         batch_dim = batch_shape.numel()
         # https://github.com/pytorch/pytorch/blob/5b51849b48a7dbccd297286cc0110def4706f9e7/aten/src/ATen/native/cuda/Blas.cpp#L174
         if min(batch_dim, n, *total_weight.shape) > 65535 * 32:
             raise RuntimeError("fused_dense only supports matrix dims <= 2M")
-        output = F.linear(total_x, total_weight, total_bias)
+
+        output = F.linear(x, total_weight, total_bias)
+
+        # release memory
         del total_weight
         del total_bias
         if ctx.compute_weight_gradient:
-            ctx.save_for_backward(x, weight, bias)
+            ctx.save_for_backward(x, weight)
         else:
-            ctx.save_for_backward(weight, bias)
+            ctx.save_for_backward(weight)
         return output if not return_residual else (output, x)
 
     @staticmethod
     @custom_bwd
     def backward(ctx, grad_output, *args):
-        grad_output = grad_output.contiguous()
-        if ctx.return_residual:
-            (grad_input,) = args
-            grad_input = grad_input.contiguous()
-        process_group = ctx.process_group
-        overlap_handler = ctx.overlap_handler
         module = ctx.module
+        communicator = ctx.communicator
 
-        if ctx.compute_weight_gradient:
-            x, weight, bias = ctx.saved_tensors
-            total_x = x
-        else:
-            weight, bias = ctx.saved_tensors
-            total_x = None
-        batch_shape = grad_output.shape[:-1]
-        batch_dim = batch_shape.numel()
-        grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1])
-
-        world_size = gpc.get_world_size(ParallelMode.WEIGHT)
-        if world_size > 1:
-            if overlap_handler is not None:
-                total_weight = gpc.fstp_handler.get_weight_all_gather(module=module)
-            else:
-                total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True)
-                handle_weight.wait()
-        else:
-            total_weight = weight
-
-        # compute weight grad
-        if ctx.needs_input_grad[1]:
-            assert 
ctx.compute_weight_gradient - grad_weight, grad_bias = fused_dense_cuda.linear_bias_wgrad( - total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2] - ) - if world_size > 1: - if overlap_handler is not None: - grad_weight_async, handle_grad_weight = reduce_scatter_raw_memory_pool( - grad_weight, process_group, op=torch.distributed.ReduceOp.AVG, async_op=True - ) - assert hasattr(weight, "_fstp_reduce_scatter_str") - overlap_handler.reduce_scatter_handlers[weight._fstp_reduce_scatter_str] = ( - handle_grad_weight, - grad_weight_async, - ) - grad_weight = overlap_handler.get_zero_by_shape( - ( - grad_weight.shape[0] // torch.distributed.get_world_size(process_group), - *grad_weight.shape[1:], - ), - dtype=grad_weight.dtype, - device=grad_weight.device, - ) - if grad_bias is not None: - grad_bias_async, handle_grad_bias = reduce_scatter_raw_memory_pool( - grad_bias, process_group, op=torch.distributed.ReduceOp.AVG, async_op=True - ) - assert hasattr(bias, "_fstp_reduce_scatter_str") - overlap_handler.reduce_scatter_handlers[bias._fstp_reduce_scatter_str] = ( - handle_grad_bias, - grad_bias_async, - ) - grad_bias = overlap_handler.get_zero_by_shape( - ( - grad_bias.shape[0] // torch.distributed.get_world_size(process_group), - *grad_bias.shape[1:], - ), - dtype=grad_bias.dtype, - device=grad_bias.device, - ) - else: - grad_weight, handle_grad_weight = reduce_scatter_raw( - grad_weight, process_group, op=torch.distributed.ReduceOp.AVG, async_op=True - ) - if grad_bias is not None: - grad_bias, handle_grad_bias = reduce_scatter_raw( - grad_bias, process_group, op=torch.distributed.ReduceOp.AVG, async_op=True - ) - else: - grad_weight = None - grad_bias = grad_output if ctx.needs_input_grad[2] else None + backward_func = fused_dense_cuda.linear_bias_wgrad if ctx.use_flash_attn else linear_bias_wgrad_torch - if ctx.needs_input_grad[0]: - if not ctx.return_residual: - grad_input = F.linear(grad_output, total_weight.t()) - else: - grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), grad_output, total_weight) - grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) - else: - grad_input = None - del total_weight - - if ctx.needs_input_grad[1]: - if world_size > 1 and overlap_handler is None: - handle_grad_weight.wait() - if grad_bias is not None: - handle_grad_bias.wait() - return grad_input, grad_weight, grad_bias, None, None, None, None, None, None - - -class FSTPFusedDenseFuncTorch(FSTPFusedDenseFunc): - "FusedDenseFunc for FSTP, which is optimized based on flash implementation." 
- - @staticmethod - @custom_bwd - def backward(ctx, grad_output, *args): grad_output = grad_output.contiguous() if ctx.return_residual: (grad_input,) = args grad_input = grad_input.contiguous() - process_group = ctx.process_group - overlap_handler = ctx.overlap_handler - module = ctx.module if ctx.compute_weight_gradient: - x, weight, bias = ctx.saved_tensors - total_x = x + x, weight = ctx.saved_tensors else: - weight, bias = ctx.saved_tensors - total_x = None + x, weight = (None, *ctx.saved_tensors) + batch_shape = grad_output.shape[:-1] batch_dim = batch_shape.numel() grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1]) - world_size = gpc.get_world_size(ParallelMode.WEIGHT) - if world_size > 1: - if overlap_handler is not None: - total_weight = gpc.fstp_handler.get_weight_all_gather(module=module) - else: - total_weight, handle_weight = all_gather_raw(weight, process_group, async_op=True) - handle_weight.wait() - else: - total_weight = weight + total_weight = communicator.all_gather(weight, module) # compute weight grad if ctx.needs_input_grad[1]: assert ctx.compute_weight_gradient - grad_weight, grad_bias = linear_bias_wgrad_torch( - total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2] + grad_weight, grad_bias = backward_func( + x.reshape(batch_dim, x.shape[-1]), + grad_output, + ctx.needs_input_grad[2], ) - if world_size > 1: - if overlap_handler is not None: - grad_weight_async, handle_grad_weight = reduce_scatter_raw_memory_pool( - grad_weight, process_group, op=torch.distributed.ReduceOp.AVG, async_op=True - ) - assert hasattr(weight, "_fstp_reduce_scatter_str") - overlap_handler.reduce_scatter_handlers[weight._fstp_reduce_scatter_str] = ( - handle_grad_weight, - grad_weight_async, - ) - grad_weight = overlap_handler.get_zero_by_shape( - ( - grad_weight.shape[0] // torch.distributed.get_world_size(process_group), - *grad_weight.shape[1:], - ), - dtype=grad_weight.dtype, - device=grad_weight.device, - ) - if grad_bias is not None: - grad_bias_async, handle_grad_bias = reduce_scatter_raw_memory_pool( - grad_bias, process_group, op=torch.distributed.ReduceOp.AVG, async_op=True - ) - assert hasattr(bias, "_fstp_reduce_scatter_str") - overlap_handler.reduce_scatter_handlers[bias._fstp_reduce_scatter_str] = ( - handle_grad_bias, - grad_bias_async, - ) - grad_bias = overlap_handler.get_zero_by_shape( - ( - grad_bias.shape[0] // torch.distributed.get_world_size(process_group), - *grad_bias.shape[1:], - ), - dtype=grad_bias.dtype, - device=grad_bias.device, - ) - else: - grad_weight, handle_grad_weight = reduce_scatter_raw( - grad_weight, process_group, op=torch.distributed.ReduceOp.AVG, async_op=True - ) - if grad_bias is not None: - grad_bias, handle_grad_bias = reduce_scatter_raw( - grad_bias, process_group, op=torch.distributed.ReduceOp.AVG, async_op=True - ) + + grad_weight, grad_weight_sync = communicator.reduce_scatter( + grad_weight, module, op=torch.distributed.ReduceOp.AVG + ) + if grad_bias is not None: + grad_bias, grad_bias_sync = communicator.reduce_scatter( + grad_bias, module, op=torch.distributed.ReduceOp.AVG, is_bias=True + ) else: grad_weight = None grad_bias = grad_output if ctx.needs_input_grad[2] else None @@ -732,17 +588,23 @@ def backward(ctx, grad_output, *args): if not ctx.return_residual: grad_input = F.linear(grad_output, total_weight.t()) else: - grad_input = torch.addmm(grad_input.reshape(batch_dim, grad_input.shape[-1]), grad_output, total_weight) + grad_input = torch.addmm( + grad_input.reshape(batch_dim, 
grad_input.shape[-1]),
+                    grad_output,
+                    total_weight,
+                )
             grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1])
         else:
             grad_input = None
+
         del total_weight
 
         if ctx.needs_input_grad[1]:
-            if world_size > 1 and overlap_handler is None:
-                handle_grad_weight.wait()
-                if grad_bias is not None:
-                    handle_grad_bias.wait()
+            if grad_weight_sync:
+                grad_weight_sync.wait()
+            if grad_bias is not None and grad_bias_sync is not None:
+                grad_bias_sync.wait()
+
         return grad_input, grad_weight, grad_bias, None, None, None, None, None, None
 
 
@@ -759,9 +621,25 @@ def fused_dense_func_torch(
         x.dtype == torch.float32 and torch.is_autocast_enabled()
     )
     if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible:
-        return FusedDenseFunc.apply(x, weight, bias, return_residual, process_group, sequence_parallel, gather_dim)
+        return FusedDenseFunc.apply(
+            x,
+            weight,
+            bias,
+            return_residual,
+            process_group,
+            sequence_parallel,
+            gather_dim,
+        )
     else:
-        return FusedDenseFuncTorch.apply(x, weight, bias, return_residual, process_group, sequence_parallel, gather_dim)
+        return FusedDenseFuncTorch.apply(
+            x,
+            weight,
+            bias,
+            return_residual,
+            process_group,
+            sequence_parallel,
+            gather_dim,
+        )
 
 
 def megatron_fused_dense_func_torch(
@@ -778,30 +656,49 @@ def megatron_fused_dense_func_torch(
     )
     if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible:
         return MegatronFusedDenseFunc.apply(
-            x, weight, bias, return_residual, process_group, sequence_parallel, gather_dim
+            x,
+            weight,
+            bias,
+            return_residual,
+            process_group,
+            sequence_parallel,
+            gather_dim,
         )
     else:
         return MegatronFusedDenseFuncTorch.apply(
-            x, weight, bias, return_residual, process_group, sequence_parallel, gather_dim
+            x,
+            weight,
+            bias,
+            return_residual,
+            process_group,
+            sequence_parallel,
+            gather_dim,
         )
 
 
-def fstp_fused_dense_func(
+def isp_fused_dense_func(
     x: Tensor,
     weight: Tensor,
     bias: Optional[Tensor] = None,
     return_residual: bool = False,
-    process_group=None,
     module=None,
-    handler=None,
+    communicator=None,
 ):
     dtype_eligible = x.dtype in [torch.float16, torch.bfloat16] or (
         x.dtype == torch.float32 and torch.is_autocast_enabled()
    )
    if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible:
-        return FSTPFusedDenseFunc.apply(x, weight, bias, return_residual, process_group, module, handler)
+        return ISPFusedDenseFunc.apply(x, weight, bias, module, return_residual, communicator)
    else:
-        return FSTPFusedDenseFuncTorch.apply(x, weight, bias, return_residual, process_group, module, handler)
+        return ISPFusedDenseFunc.apply(
+            x,
+            weight,
+            bias,
+            module,
+            return_residual,
+            communicator,
+            use_flash_attn=False,
+        )
 
 
 def try_import_RMSNorm():
diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py
index 681dfc9c..0f2cc1bf 100644
--- a/internlm/solver/optimizer/hybrid_zero_optim.py
+++ b/internlm/solver/optimizer/hybrid_zero_optim.py
@@ -58,6 +58,7 @@ def __init__(
         grad_scal_cfg: Config = None,
         zero_cfg: Config = None,
         param_bcast_sync_handler: ParamBcastSyncHandler = None,
+        isp_communicator = None,
     ):
         # DynamicGradScaler related args
         if gpc.config.model.dtype is torch.float32:
@@ -138,10 +139,7 @@ def __init__(
         if self._overlap_sync_param:
             assert self._param_bcast_sync_handler is not None
 
-        if gpc.config.parallel["weight"]["size"] > 1 and gpc.config.parallel["weight"]["overlap"] is True:
-            self._fstp_handler = gpc.fstp_handler
-        else:
-            self._fstp_handler = None
+        self._isp_communicator = isp_communicator
 
        # 
iterate over the param group in the optimizer # partition these param groups for data parallel training @@ -362,9 +360,9 @@ def extra_layernorm_reduce_grad_hook(*args): # pylint: disable=W0613 ): accum_grad_obj.register_hook(extra_layernorm_reduce_grad_hook) - # we should not only register for parameters which have _fstp_reduce_scatter_str attr. + # we should not only register for parameters which have isp_reduce_scatter_name attr. # we must keep up with reduce_grad_hook. - if self._fstp_handler is not None: + if self._isp_communicator is not None: accum_grad_obj.register_hook(accum_grad_hook) if self._overlap_sync_grad: @@ -373,7 +371,7 @@ def extra_layernorm_reduce_grad_hook(*args): # pylint: disable=W0613 _define_and_attach(param, reduce_rank) def accumulate_left_grads_after_backward(self): - if self._fstp_handler is None: + if self._isp_communicator is None: return for group_id in range(self.num_param_groups): @@ -395,20 +393,22 @@ def belongs_to_current_rank(self, param) -> bool: def _accum_grads_store_in_bucket(self, bucket: BucketStore, reduce_rank: Optional[int] = None) -> None: for _param in bucket.get_param(reduce_rank): - if not hasattr(_param, "_fstp_reduce_scatter_str"): + if not hasattr(_param, "isp_reduce_scatter_name"): continue # wait and accumulate gardient. - _key = getattr(_param, "_fstp_reduce_scatter_str") - _comm_handle, _grad = self._fstp_handler.reduce_scatter_handlers[_key] + _key = getattr(_param, "isp_reduce_scatter_name") + _grad, _comm_handle = self._isp_communicator.reduce_scatter_handlers[_key] _comm_handle.wait() _param.grad.add_(_grad) # release cuda memory. - if self._fstp_handler.enable_memory_pool: - self._fstp_handler.release_reduce_scatter_memory(key=tuple(_grad.size()), index=_grad.index) + if self._isp_communicator.enable_memory_pool: + self._isp_communicator.memory_pool.free_reduce_scatter_memory( + key=tuple(_grad.size()), index=_grad.index + ) _grad = None - self._fstp_handler.reduce_scatter_handlers[_key] = None + self._isp_communicator.reduce_scatter_handlers[_key] = None bucket.reset_by_rank(reduce_rank) diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 587c0035..2822da5a 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -44,7 +44,8 @@ RowParallelLinear, ) from internlm.model.multi_head_attention import MHA -from internlm.model.overlap_handler import FSTPOverlapHandler +from internlm.model.linear import ISPLinear +from internlm.core.communication.isp import ISPCommunicator, ISPCommModelConfig from internlm.model.utils import try_import_RMSNorm from internlm.monitor import send_heartbeat, set_env_var from internlm.monitor.monitor import monitor_manager as mm @@ -53,7 +54,7 @@ from internlm.solver.optimizer import FSDPadaptOptimizer, HybridZeroOptimizer from internlm.solver.optimizer.utils import ParamBcastSyncHandler from internlm.train.utils import create_param_groups -from internlm.utils.common import DummyProfile +from internlm.utils.common import DummyProfile, get_current_device from internlm.utils.logger import get_logger from internlm.utils.megatron_timers import megatron_timer as timer from internlm.utils.parallel import ( @@ -175,11 +176,27 @@ def initialize_model(): # if fsdp enabled, wrap the model model = wrap_FSDP_model(model) - gpc.fstp_handler = None - if gpc.config.parallel["weight"]["size"] > 1 and gpc.config.parallel["weight"]["overlap"] is True: - gpc.fstp_handler = FSTPOverlapHandler(model, gpc.get_group(ParallelMode.WEIGHT)) + if 
gpc.config.parallel.tensor.mode != "isp": + isp_communicator = None + else: + isp_communicator = ISPCommunicator( + model, + ISPCommModelConfig( + gpc.config.model.hidden_size, + gpc.config.model.mlp_ratio, + gpc.config.model.dtype, + get_current_device(), + ["Wqkv", "out_proj", "w1", "w2", "w3"], + ), + gpc.config.parallel.weight.overlap, + gpc.config.model.checkpoint, + gpc.config.parallel.weight.memory_pool, + gpc.get_group(ParallelMode.WEIGHT), + ) + # register communicator for isp linear. + ISPLinear.register_communicator(isp_communicator) - return model + return model, isp_communicator def wrap_FSDP_model(model: Union[nn.Module, nn.ModuleList]): @@ -216,7 +233,7 @@ def wrap_FSDP_model(model: Union[nn.Module, nn.ModuleList]): @llm_timeout(func_name="initialize_optimizer") -def initialize_optimizer(model: Union[nn.Module, nn.ModuleList]): +def initialize_optimizer(model: Union[nn.Module, nn.ModuleList], isp_communicator: ISPCommunicator = None): """ Initialize optimizer. @@ -250,6 +267,7 @@ def initialize_optimizer(model: Union[nn.Module, nn.ModuleList]): grad_scal_cfg=gpc.config.grad_scaler, zero_cfg=gpc.config.hybrid_zero_optimizer, param_bcast_sync_handler=param_bcast_sync_handler, + isp_communicator=isp_communicator, ) else: optimizer = FSDPadaptOptimizer( diff --git a/train.py b/train.py index b76c100d..5e048a32 100644 --- a/train.py +++ b/train.py @@ -19,7 +19,7 @@ from internlm.initialize import initialize_distributed_env from internlm.model.loss import FlashGPTLMLoss from internlm.model.metrics import AccPerplex, SchedulerMetricHook -from internlm.model.overlap_handler import FSTPOverlapSchedulerHook +from internlm.core.communication.isp import ISPCommunicatorSchedulerHook from internlm.monitor import initialize_monitor_manager, send_alert_message from internlm.monitor.monitor import monitor_manager as mm from internlm.train import ( @@ -71,7 +71,7 @@ def initialize_llm_logger(start_time: str): return uniscale_logger -def get_scheduler_hooks(metric, zero_optim) -> List[SchedulerHook]: +def get_scheduler_hooks(metric, zero_optim, isp_communicator) -> List[SchedulerHook]: scheduler_hooks: List[SchedulerHook] = [] if metric is not None: @@ -86,8 +86,9 @@ def get_scheduler_hooks(metric, zero_optim) -> List[SchedulerHook]: ), ), ) - if gpc.fstp_handler is not None: - scheduler_hooks.append(FSTPOverlapSchedulerHook(gpc.fstp_handler, zero_optim)) + + if isp_communicator is not None: + scheduler_hooks.append(ISPCommunicatorSchedulerHook(isp_communicator, zero_optim)) return scheduler_hooks @@ -133,7 +134,7 @@ def main(args): uniscale_logger = initialize_llm_logger(start_time=current_time) # initialize model - model = initialize_model() + model, isp_communicator = initialize_model() with open(args.config, "r") as f: config_lines = f.readlines() @@ -148,7 +149,7 @@ def main(args): # initialize and resume train state train_state = TrainState(gpc.config, train_dl.batch_sampler) - optimizer, beta2_scheduler, lr_scheduler = initialize_optimizer(model=model) + optimizer, beta2_scheduler, lr_scheduler = initialize_optimizer(model, isp_communicator) ckpt_manager = CheckpointManager( ckpt_config=gpc.config.ckpt, @@ -194,7 +195,7 @@ def main(args): train_dataloader=train_dl, lr_scheduler=lr_scheduler, beta2_scheduler=beta2_scheduler, - scheduler_hooks=get_scheduler_hooks(metric, optimizer), + scheduler_hooks=get_scheduler_hooks(metric, optimizer, isp_communicator), ) # initialize simple memory profiler @@ -263,8 +264,8 @@ def main(args): ) timer("fwd-bwd").stop() - if gpc.fstp_handler is not None 
and gpc.fstp_handler.enable_memory_pool:
-            gpc.fstp_handler.clear_memory_pool()
+        if isp_communicator and isp_communicator.enable_memory_pool:
+            isp_communicator.memory_pool.reset_lazy_pools()
 
         # update parameters, and returns (success_update, grad_norm)
         trainer_result = trainer.step()
 
From fe6fed722a5f2282d839147ab46ecb15ad42c7cf Mon Sep 17 00:00:00 2001
From: "chenxun.p" <759046501@qq.com>
Date: Thu, 28 Dec 2023 11:11:24 +0800
Subject: [PATCH 092/153] feat(*): fix bug

---
 internlm/solver/optimizer/hybrid_zero_optim.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py
index 0f2cc1bf..414b402f 100644
--- a/internlm/solver/optimizer/hybrid_zero_optim.py
+++ b/internlm/solver/optimizer/hybrid_zero_optim.py
@@ -58,7 +58,7 @@ def __init__(
         grad_scal_cfg: Config = None,
         zero_cfg: Config = None,
         param_bcast_sync_handler: ParamBcastSyncHandler = None,
-        isp_communicator = None,
+        isp_communicator=None,
     ):
         # DynamicGradScaler related args
         if gpc.config.model.dtype is torch.float32:
@@ -362,7 +362,7 @@ def extra_layernorm_reduce_grad_hook(*args):  # pylint: disable=W0613
 
         # we should not only register for parameters which have isp_reduce_scatter_name attr.
         # we must keep up with reduce_grad_hook.
-        if self._isp_communicator is not None:
+        if self._isp_communicator and self._isp_communicator.overlap:
             accum_grad_obj.register_hook(accum_grad_hook)
 
         if self._overlap_sync_grad:
From a80fbe3ab7ae65c7c6c602da99a520a31c7d544d Mon Sep 17 00:00:00 2001
From: huangting4201 <1538303371@qq.com>
Date: Fri, 12 Jan 2024 11:18:00 +0800
Subject: [PATCH 093/153] fix(train/utils.py): fix zp size check and embed_param group

---
 internlm/core/context/parallel_context.py     | 28 +++++++++++++++----
 .../core/scheduler/no_pipeline_scheduler.py   |  8 +++++-
 internlm/initialize/launch.py                 |  5 +++-
 internlm/model/overlap_handler.py             |  2 ++
 internlm/model/utils.py                       |  1 +
 internlm/train/utils.py                       |  6 ++--
 train.py                                      |  1 +
 7 files changed, 41 insertions(+), 10 deletions(-)

diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py
index 6e7efaae..56a7d715 100644
--- a/internlm/core/context/parallel_context.py
+++ b/internlm/core/context/parallel_context.py
@@ -410,17 +410,27 @@ def check_sanity(self):
             AssertionError: Raises an AssertionError if the world size does not equal to the product
                 of data parallel size, pipeline parallel size and tensor parallel size.
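+                For isp mode the analogous identity must hold for the weight parallel sizes:
+                world_size == weight data parallel size * pipeline parallel size * weight
+                parallel size. For example, 32 GPUs with pipeline size 1 and weight parallel
+                size 4 imply a weight data parallel size of 32 / (1 * 4) = 8.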
""" + # for mtp/msp/fsp dps = self.data_parallel_size pps = self.pipeline_parallel_size tps = self.tensor_parallel_size ws = self.world_size - # assert ws == dps * pps * tps, ( - # f"Expected the world size {ws} to be equal to data" - # f" parallel size ({dps}) * pipeline parallel size " - # f"({pps}) * tensor parallel size ({tps})" - # ) + assert ws == dps * pps * tps, ( + f"Expected the world size {ws} to be equal to data" + f" parallel size ({dps}) * pipeline parallel size " + f"({pps}) * tensor parallel size ({tps})" + ) + + # for isp + wps = self.weight_parallel_size + wdps = self.weight_data_parallel_size + assert ws == wdps * pps * wps, ( + f"Expected the world size {ws} to be equal to weight data" + f" parallel size ({wdps}) * pipeline parallel size " + f"({pps}) * weight parallel size ({wps})" + ) + assert self.zero1_parallel_size > 0 - assert self.data_parallel_size % self.zero1_parallel_size == 0 # check for fsdp: # if zo_size < dp_size, ckpts saving will introduce redundent storage for model weights @@ -470,10 +480,16 @@ def init_parallel_groups(self): assert ( self.zero1_parallel_size <= self.data_parallel_size ), f"zero1_size:{self.zero1_parallel_size} should be less than dp_size:{self.data_parallel_size}" + assert ( + self.data_parallel_size % self.zero1_parallel_size == 0 + ), f"data_parallel_size:{self.data_parallel_size} % zero1_parallel_size: {self.zero1_parallel_size} != 0" else: assert ( self.zero1_parallel_size <= self.weight_data_parallel_size ), f"zero1_size:{self.zero1_parallel_size} should be less than wdp_size:{self.weight_data_parallel_size}" + assert ( + self.weight_data_parallel_size % self.zero1_parallel_size == 0 + ), f"weight_data_parallel_size:{self.weight_data_parallel_size} % zero1_parallel_size: {self.zero1_parallel_size} != 0" # the recommended nettest_parallel_size is 32 GPUs self.nettest_parallel_size = 32 diff --git a/internlm/core/scheduler/no_pipeline_scheduler.py b/internlm/core/scheduler/no_pipeline_scheduler.py index 56661d8c..6e8454ff 100644 --- a/internlm/core/scheduler/no_pipeline_scheduler.py +++ b/internlm/core/scheduler/no_pipeline_scheduler.py @@ -89,6 +89,7 @@ def _train_one_batch( engine: Engine, forward_only: bool = False, return_loss: bool = True, + return_output: bool = False, scale_loss: int = 1, ): """Trains one batch of data. @@ -100,6 +101,7 @@ def _train_one_batch( forward_only (bool, optional): If True, the model is run for the forward pass, else back propagation will be executed. return_loss (bool, optional): Loss will be returned if True. + return_output (bool, optional): Output will be returned if True. scale_loss (int, optional): The scale factor for the loss. 
""" @@ -128,6 +130,10 @@ def _train_one_batch( loss /= scale_loss loss += moe_loss + # clear output before backward for releasing memory resource + if not return_output: + output = None + # backward if not forward_only: self._call_hooks("before_backward", None, None) @@ -192,7 +198,7 @@ def forward_backward_step( _data, _label = self._load_accum_batch(data, label) _output, _loss, _moe_loss = self._train_one_batch( - _data, _label, engine, forward_only, return_loss, self._grad_accum_size + _data, _label, engine, forward_only, return_loss, return_output_label, self._grad_accum_size ) if return_loss: diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py index af4c9698..9d6ab323 100644 --- a/internlm/initialize/launch.py +++ b/internlm/initialize/launch.py @@ -450,7 +450,10 @@ def launch( ) print( - f"global_rank:{gpc.get_global_rank()} wp_rank:{gpc.get_local_rank(ParallelMode.WEIGHT)} sp_rank:{gpc.get_local_rank(ParallelMode.SEQUENCE)} zo1_rank:{gpc.get_local_rank(ParallelMode.ZERO1)} dp_rank:{gpc.get_local_rank(ParallelMode.DATA)} weight_dp_rank:{gpc.get_local_rank(ParallelMode.WEIGHT_DATA)}", + f"global_rank:{gpc.get_global_rank()} wp_rank:{gpc.get_local_rank(ParallelMode.WEIGHT)} " + f"sp_rank:{gpc.get_local_rank(ParallelMode.SEQUENCE)} pp_rank:{gpc.get_local_rank(ParallelMode.PIPELINE)} " + f"zo1_rank:{gpc.get_local_rank(ParallelMode.ZERO1)} dp_rank:{gpc.get_local_rank(ParallelMode.DATA)} " + f"wdp_rank:{gpc.get_local_rank(ParallelMode.WEIGHT_DATA)}", flush=True, ) diff --git a/internlm/model/overlap_handler.py b/internlm/model/overlap_handler.py index a5649eae..2595d04e 100644 --- a/internlm/model/overlap_handler.py +++ b/internlm/model/overlap_handler.py @@ -386,6 +386,8 @@ def before_backward(self, scheduler, outputs, outputs_grad) -> None: def after_backward(self, scheduler, inputs_grad) -> None: self._zero_optim.accumulate_left_grads_after_backward() + if gpc.fstp_handler is not None and gpc.fstp_handler.enable_memory_pool: + gpc.fstp_handler.clear_memory_pool() def post_helper_func(self, scheduler, outputs, label) -> None: pass diff --git a/internlm/model/utils.py b/internlm/model/utils.py index a4fe3378..fdd457c2 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -34,6 +34,7 @@ def _split(input_, parallel_mode, dim=-1): tensor_list = torch.split(input_, dim_size // world_size, dim=dim) rank = gpc.get_local_rank(parallel_mode) output = tensor_list[rank].contiguous() + output = output.detach().clone() return output diff --git a/internlm/train/utils.py b/internlm/train/utils.py index 7ef0cb81..58880bb8 100644 --- a/internlm/train/utils.py +++ b/internlm/train/utils.py @@ -138,9 +138,11 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy pgroup["params"] = origin_params pgroup["optimizer_mode"] = ParallelMode.ZERO1 - # param groups may contain empty groups, such as fp32 - if len(new_groups["embed_head"]["params"]) > 0: + # param groups may contain empty groups, such as embed_head + if gpc.config.parallel.tensor.mode == "isp": param_groups.extend(new_groups.values()) + else: + assert len(new_groups["embed_head"]["params"]) <= 0 # print(f"ht debug params_groups after split default len:{len(param_groups[0]['params'])}", flush=True) # print(f"ht debug params_groups after split embed_head len:{len(param_groups[1]['params'])}", flush=True) diff --git a/train.py b/train.py index b76c100d..6eabd8f8 100644 --- a/train.py +++ b/train.py @@ -204,6 +204,7 @@ def main(args): optimizer.optim, 
log_folder=f"memory_trace/rank{gpc.get_global_rank()}_" + f"dp{gpc.get_local_rank(ParallelMode.DATA)}_" + + f"wp{gpc.get_local_rank(ParallelMode.WEIGHT)}_" + f"tp{gpc.get_local_rank(ParallelMode.TENSOR)}", ) else: From 1aebcd99a1793d486d9be3ef28c9cf8d7d0f5c16 Mon Sep 17 00:00:00 2001 From: "chenxun.p" <759046501@qq.com> Date: Fri, 12 Jan 2024 16:07:14 +0800 Subject: [PATCH 094/153] fix(model/util): force to pass communictor --- internlm/model/utils.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 25e84804..9b82039b 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -501,8 +501,8 @@ def forward( weight, bias, module, + communicator, return_residual=False, - communicator=None, use_flash_attn: bool = True, ): ctx.compute_weight_gradient = weight.requires_grad @@ -680,10 +680,10 @@ def megatron_fused_dense_func_torch( def isp_fused_dense_func( x: Tensor, weight: Tensor, + module, + communicator, bias: Optional[Tensor] = None, return_residual: bool = False, - module=None, - communicator=None, ): dtype_eligible = x.dtype in [torch.float16, torch.bfloat16] or ( x.dtype == torch.float32 and torch.is_autocast_enabled() @@ -708,7 +708,8 @@ def try_import_RMSNorm(): """ try: - from apex.normalization.fused_layer_norm import MixedFusedRMSNorm as RMSNorm + from apex.normalization.fused_layer_norm import \ + MixedFusedRMSNorm as RMSNorm return RMSNorm except ModuleNotFoundError: From 917ab0d214d8199d647e8cb725f2e014acf3af2c Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Fri, 12 Jan 2024 17:57:55 +0800 Subject: [PATCH 095/153] fix(model/utils.py): fix param set --- internlm/model/utils.py | 7 +++---- internlm/solver/optimizer/hybrid_zero_optim.py | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 9b82039b..1e6d76b0 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -689,15 +689,15 @@ def isp_fused_dense_func( x.dtype == torch.float32 and torch.is_autocast_enabled() ) if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible: - return ISPFusedDenseFunc.apply(x, weight, bias, module, return_residual, communicator) + return ISPFusedDenseFunc.apply(x, weight, bias, module, communicator, return_residual) else: return ISPFusedDenseFunc.apply( x, weight, bias, module, - return_residual, communicator, + return_residual, use_flash_attn=False, ) @@ -708,8 +708,7 @@ def try_import_RMSNorm(): """ try: - from apex.normalization.fused_layer_norm import \ - MixedFusedRMSNorm as RMSNorm + from apex.normalization.fused_layer_norm import MixedFusedRMSNorm as RMSNorm return RMSNorm except ModuleNotFoundError: diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index 414b402f..3e3f5085 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -371,7 +371,7 @@ def extra_layernorm_reduce_grad_hook(*args): # pylint: disable=W0613 _define_and_attach(param, reduce_rank) def accumulate_left_grads_after_backward(self): - if self._isp_communicator is None: + if self._isp_communicator is None or self._isp_communicator.overlap is False: return for group_id in range(self.num_param_groups): From b77787f445b64242fa4c8429450b5889ac9636b0 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Fri, 12 Jan 2024 18:52:47 +0800 Subject: [PATCH 096/153] 
fix(hybrid_zero_optim.py): fix reduce scatter error when wp_size=1 --- internlm/core/communication/isp.py | 3 ++- internlm/core/context/parallel_context.py | 7 ++++--- internlm/initialize/launch.py | 1 - internlm/model/linear.py | 2 +- internlm/model/multi_head_attention.py | 1 - internlm/solver/optimizer/hybrid_zero_optim.py | 6 +++++- 6 files changed, 12 insertions(+), 8 deletions(-) diff --git a/internlm/core/communication/isp.py b/internlm/core/communication/isp.py index cf19fbe5..eee38c5b 100644 --- a/internlm/core/communication/isp.py +++ b/internlm/core/communication/isp.py @@ -23,6 +23,7 @@ class ISPCommModelConfig: """ model config for isp communicator. """ + hidden_size: int = 0 mlp_ratio: float = 0 dtype: torch.dtype = torch.half @@ -435,7 +436,7 @@ def reduce_scatter( is_bias: bool = False, ): if dist.get_world_size(self.process_group) <= 1: - return tensor + return tensor, None if not self.overlap: result, handle = reduce_scatter_raw(tensor, self.process_group, op=op, async_op=True) diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py index 56a7d715..826b51a1 100644 --- a/internlm/core/context/parallel_context.py +++ b/internlm/core/context/parallel_context.py @@ -487,9 +487,10 @@ def init_parallel_groups(self): assert ( self.zero1_parallel_size <= self.weight_data_parallel_size ), f"zero1_size:{self.zero1_parallel_size} should be less than wdp_size:{self.weight_data_parallel_size}" - assert ( - self.weight_data_parallel_size % self.zero1_parallel_size == 0 - ), f"weight_data_parallel_size:{self.weight_data_parallel_size} % zero1_parallel_size: {self.zero1_parallel_size} != 0" + assert self.weight_data_parallel_size % self.zero1_parallel_size == 0, ( + f"weight_data_parallel_size:{self.weight_data_parallel_size} % " + f"zero1_parallel_size: {self.zero1_parallel_size} != 0" + ) # the recommended nettest_parallel_size is 32 GPUs self.nettest_parallel_size = 32 diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py index 9d6ab323..eedb0e65 100644 --- a/internlm/initialize/launch.py +++ b/internlm/initialize/launch.py @@ -8,7 +8,6 @@ from typing import Dict, Union import torch -from torch.distributed import get_rank from internlm.core.context import Config from internlm.core.context import global_context as gpc diff --git a/internlm/model/linear.py b/internlm/model/linear.py index d475214f..ed21a21b 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -364,9 +364,9 @@ def forward(self, x): return isp_fused_dense_func( x, self.weight, - self.bias, module=self, communicator=self.__communicator, + bias=self.bias, ) diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index fb0309a5..eba5a6f1 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -35,7 +35,6 @@ from torch import Tensor, nn from torch.nn import Module -from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc from internlm.model.embedding import DynamicNTKScalingRotaryEmbedding, RotaryEmbedding from internlm.model.linear import get_linear_cls diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index 3e3f5085..f7ce3bdc 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -362,7 +362,11 @@ def extra_layernorm_reduce_grad_hook(*args): # pylint: disable=W0613 # we should not only register for 
parameters which have isp_reduce_scatter_name attr. # we must keep up with reduce_grad_hook. - if self._isp_communicator and self._isp_communicator.overlap: + if ( + self._isp_communicator + and self._isp_communicator.overlap + and gpc.config.parallel.weight.size > 1 + ): accum_grad_obj.register_hook(accum_grad_hook) if self._overlap_sync_grad: From 594d61db6c474a8e978074644b7000486da04f2d Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 15 Jan 2024 15:38:36 +0800 Subject: [PATCH 097/153] feat(model_checkpoint.py): model and optimizer save/load ckpt adapt to isp --- internlm/utils/model_checkpoint.py | 145 ++++++++++++++++++++--------- 1 file changed, 101 insertions(+), 44 deletions(-) diff --git a/internlm/utils/model_checkpoint.py b/internlm/utils/model_checkpoint.py index 4b3f7d5b..fa9f50df 100644 --- a/internlm/utils/model_checkpoint.py +++ b/internlm/utils/model_checkpoint.py @@ -254,9 +254,13 @@ def save_model_checkpoint(folder, model): - folder - model_tp{tp_rank}_pp{pp_rank}.pt + If tensor parallel mode is isp, the saved weight is named: + - folder + - model_tp{tp_rank}_wp{wp_rank}_pp{pp_rank}.pt + If fsdp is activated, the saved weight is named: - folder - - model_tp{tp_rank}_pp{pp_rank}_zo{zo_rank} + - model_tp{tp_rank}_pp{pp_rank}_zo{zo_rank}.pt If the tp is inconsistent with the saved one in the future use, the weight needs to be converted before loading. @@ -277,39 +281,54 @@ def save_model_checkpoint(folder, model): if folder is not None: dp_size = gpc.get_world_size(ParallelMode.DATA) tp_size = gpc.get_world_size(ParallelMode.TENSOR) + wp_size = gpc.get_world_size(ParallelMode.WEIGHT) dp_rank = gpc.get_local_rank(ParallelMode.DATA) tp_rank = gpc.get_local_rank(ParallelMode.TENSOR) + wp_rank = gpc.get_local_rank(ParallelMode.WEIGHT) pp_rank = gpc.get_local_rank(ParallelMode.PIPELINE) + wdp_rank = gpc.get_local_rank(ParallelMode.WEIGHT_DATA) # TODO In theory, we should also consider pp level, but since pp is generally a state across machines, # even if pp is not considered, it will definitely not be written on the same machine. 
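    # Illustrative shard-naming rule for the branch below (a sketch only; the
    # helper name is hypothetical): under isp each (tp, wp, pp) coordinate owns
    # a distinct weight shard, so the wp rank must appear in the file name:
    #     "model_tp{tp}_wp{wp}_pp{pp}.pt" if tp_mode == "isp"
    #     else "model_tp{tp}_pp{pp}.pt"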
- should_save_rank_pair = set() # (tp_rank, dp_rank) - for i in range(tp_size): - if gpc.config.parallel.zero1.fsdp: - for j in range(dp_size): - should_save_rank_pair.add((i, j)) - else: - should_save_rank_pair.add((i, i % dp_size)) - if (tp_rank, dp_rank) in should_save_rank_pair: - f_dp = f"_dp{dp_rank}" if gpc.config.parallel.zero1.fsdp else "" - fn = f"model_tp{tp_rank}_pp{pp_rank}{f_dp}.pt" + # for tensor parallel mode with isp + if gpc.config.parallel.tensor.mode == "isp": + if wdp_rank == 0 or dp_rank == 0: + fn = f"model_tp{tp_rank}_wp{wp_rank}_pp{pp_rank}.pt" fp = os.path.join(folder, fn) llm_save(fp, saved_obj=states) - if not gpc.config.parallel.zero1.fsdp or dp_rank == tp_rank % dp_size: - topo_fn = f"topo_tp{tp_rank}_pp{pp_rank}.json" - topo_fp = os.path.join(folder, topo_fn) - llm_save(topo_fp, saved_obj=topo) + topo_fn = f"topo_tp{tp_rank}_wp{wp_rank}_pp{pp_rank}.json" + topo_fp = os.path.join(folder, topo_fn) + llm_save(topo_fp, saved_obj=topo) + else: + # for tensor parallel mode with mtp/msp/fsp + should_save_rank_pair = set() # (tp_rank, dp_rank) + for i in range(tp_size): + if gpc.config.parallel.zero1.fsdp: + for j in range(dp_size): + should_save_rank_pair.add((i, j)) + else: + should_save_rank_pair.add((i, i % dp_size)) + + if (tp_rank, dp_rank) in should_save_rank_pair: + f_dp = f"_dp{dp_rank}" if gpc.config.parallel.zero1.fsdp else "" + fn = f"model_tp{tp_rank}_pp{pp_rank}{f_dp}.pt" + fp = os.path.join(folder, fn) + llm_save(fp, saved_obj=states) + if not gpc.config.parallel.zero1.fsdp or dp_rank == tp_rank % dp_size: + topo_fn = f"topo_tp{tp_rank}_pp{pp_rank}.json" + topo_fp = os.path.join(folder, topo_fn) + llm_save(topo_fp, saved_obj=topo) # try to save expert parameter to separate files if model have moe layer - expert_dp_size = gpc.get_world_size(ParallelMode.EXPERT_DATA) - expert_dp_rank = gpc.get_local_rank(ParallelMode.EXPERT_DATA) - should_save_rank_pair.clear() - for i in range(tp_size): - should_save_rank_pair.add((i, i % expert_dp_size)) + # expert_dp_size = gpc.get_world_size(ParallelMode.EXPERT_DATA) + # expert_dp_rank = gpc.get_local_rank(ParallelMode.EXPERT_DATA) + # should_save_rank_pair.clear() + # for i in range(tp_size): + # should_save_rank_pair.add((i, i % expert_dp_size)) - if (tp_rank, expert_dp_rank) in should_save_rank_pair: - try_save_moe_checkpoint(folder, model, tp_rank, pp_rank) + # if (tp_rank, expert_dp_rank) in should_save_rank_pair: + # try_save_moe_checkpoint(folder, model, tp_rank, pp_rank) torch.distributed.barrier() @@ -328,9 +347,11 @@ def load_model_checkpoint(folder, model): """ tp_size = gpc.get_world_size(ParallelMode.TENSOR) + wp_size = gpc.get_world_size(ParallelMode.WEIGHT) pp_size = gpc.get_world_size(ParallelMode.PIPELINE) dp_size = gpc.get_world_size(ParallelMode.DATA) tp_rank = gpc.get_local_rank(ParallelMode.TENSOR) + wp_rank = gpc.get_local_rank(ParallelMode.WEIGHT) pp_rank = gpc.get_local_rank(ParallelMode.PIPELINE) dp_rank = gpc.get_local_rank(ParallelMode.DATA) @@ -342,11 +363,15 @@ def load_model_checkpoint(folder, model): "_dp" not in test_fn and not gpc.config.parallel.zero1.fsdp ), "FSDP model wants to load no-FSDP ckpts or reverse" - max_pp, max_tp, max_zo = 0, 0, 0 + max_pp, max_wp, max_tp, max_zo = 0, 0, 0, 0 for fn in fns: if fn.startswith("model_t") and not fn.endswith(".md5"): segements = os.path.splitext(fn)[0].split("_") - if gpc.config.parallel.zero1.fsdp: + if gpc.config.parallel.tensor.mode == "isp": + max_pp = max(max_pp, int(segements[-1][2:])) + max_wp = max(max_wp, int(segements[-2][2:])) + 
max_tp = max(max_tp, int(segements[-3][2:])) + elif gpc.config.parallel.zero1.fsdp: max_zo = max(max_zo, int(segements[-1][2:])) max_pp = max(max_pp, int(segements[-2][2:])) max_tp = max(max_tp, int(segements[-3][2:])) @@ -357,6 +382,9 @@ def load_model_checkpoint(folder, model): assert ( pp_size == max_pp + 1 ), f"The weights are save for {max_pp+1} pipelines, while current has {pp_size} pipelines" + assert ( + wp_size == max_wp + 1 + ), f"The weights are save for {max_wp+1} parallelism, while current has {wp_size} weight parallelism" assert ( tp_size == max_tp + 1 ), f"The weights are save for {max_tp+1} parallelism, while current has {tp_size} tensor parallelism" @@ -365,7 +393,9 @@ def load_model_checkpoint(folder, model): dp_size == max_zo + 1 ), f"The weights are save for {max_zo+1} FSDP shards , while current has {dp_size} FSDP shards" - if gpc.config.parallel.zero1.fsdp: + if gpc.config.parallel.tensor.mode == "isp": + should_load_name = f"model_tp{tp_rank}_wp{wp_rank}_pp{pp_rank}.pt" + elif gpc.config.parallel.zero1.fsdp: should_load_name = f"model_tp{tp_rank}_pp{pp_rank}_dp{dp_rank}.pt" else: should_load_name = f"model_tp{tp_rank}_pp{pp_rank}.pt" @@ -466,20 +496,26 @@ def save_optimizer_checkpoint(optim, state_path): # TODO sanity check for optimizer type zero_rank = gpc.get_local_rank(ParallelMode.ZERO1) tp_rank = gpc.get_local_rank(ParallelMode.TENSOR) + wp_rank = gpc.get_local_rank(ParallelMode.WEIGHT) pp_rank = gpc.get_local_rank(ParallelMode.PIPELINE) + dp_rank = gpc.get_local_rank(ParallelMode.DATA) zero_size = gpc.get_world_size(ParallelMode.ZERO1) tp_size = gpc.get_world_size(ParallelMode.TENSOR) - pp_size = gpc.get_world_size(ParallelMode.PIPELINE) - fp = f"optimizer_tp{tp_rank}_pp{pp_rank}_zo{zero_rank}.pt" + dp_size = gpc.get_world_size(ParallelMode.DATA) states = optim.state_dict() if isinstance(optim, HybridZeroOptimizer): - if gpc.get_global_rank() < zero_size * tp_size * pp_size: + if gpc.config.parallel.tensor.mode == "isp": + fp = f"optimizer_tp{tp_rank}_wp{wp_rank}_pp{pp_rank}_dp{dp_rank}.pt" llm_save(os.path.join(state_path, fp), states) - if "zero_devide_optim_plan" in states: - params_per_rank_id_dict = states.pop("zero_devide_optim_plan") - fp_meta = os.path.join(state_path, optim.rank_unique_id) - llm_save(fp_meta, params_per_rank_id_dict) + else: + fp = f"optimizer_tp{tp_rank}_pp{pp_rank}_zo{zero_rank}.pt" + if (gpc.get_global_rank() % (tp_size * dp_size)) < zero_size * tp_size: + llm_save(os.path.join(state_path, fp), states) + if "zero_devide_optim_plan" in states: + params_per_rank_id_dict = states.pop("zero_devide_optim_plan") + fp_meta = os.path.join(state_path, optim.rank_unique_id) + llm_save(fp_meta, params_per_rank_id_dict) else: llm_save(os.path.join(state_path, fp), states) @@ -516,32 +552,53 @@ def load_optimizer_checkpoint(folder, optim): """ fns = get_fns(folder) - max_tp, max_pp, max_zero = 0, 0, 0 + max_tp, max_wp, max_pp, max_zero, max_dp = 0, 0, 0, 0, 0 for fn in fns: if fn.startswith("optimizer_") and not fn.endswith(".md5"): - _, tp, pp, zero = os.path.splitext(fn)[0].split("_") - max_zero = max(max_zero, int(zero[2:])) - max_tp = max(max_tp, int(tp[2:])) - max_pp = max(max_pp, int(pp[2:])) + if gpc.config.parallel.tensor.mode == "isp": + _, tp, wp, pp, dp = os.path.splitext(fn)[0].split("_") + max_dp = max(max_dp, int(dp[2:])) + max_tp = max(max_tp, int(tp[2:])) + max_wp = max(max_wp, int(wp[2:])) + max_pp = max(max_pp, int(pp[2:])) + else: + _, tp, pp, zero = os.path.splitext(fn)[0].split("_") + max_zero = max(max_zero, int(zero[2:])) 
+ max_tp = max(max_tp, int(tp[2:])) + max_pp = max(max_pp, int(pp[2:])) zero_size = gpc.get_world_size(ParallelMode.ZERO1) - zero_rank = gpc.get_local_rank(ParallelMode.ZERO1) tp_size = gpc.get_world_size(ParallelMode.TENSOR) + wp_size = gpc.get_world_size(ParallelMode.WEIGHT) pp_size = gpc.get_world_size(ParallelMode.PIPELINE) + dp_size = gpc.get_world_size(ParallelMode.DATA) + assert ( + dp_size == max_dp + 1 + ), f"The optimizer states are save for {max_dp+1} data parallelism, while current has {dp_size} data parallelism" assert ( zero_size == max_zero + 1 - ), f"The weights are save for {max_zero+1} data parallel, while current has {zero_size} zero broadcast range." + ), f"The optimizer states are save for {max_zero+1} zero parallel, while current has {zero_size} zero broadcast range." assert ( pp_size == max_pp + 1 - ), f"The weights are save for {max_pp+1} pipelines, while current has {pp_size} pipelines" + ), f"The optimizer states are save for {max_pp+1} pipelines, while current has {pp_size} pipelines" assert ( tp_size == max_tp + 1 - ), f"The weights are save for {max_tp+1} parallelism, while current has {tp_size} tensor parallelism" + ), f"The optimizer states are save for {max_tp+1} parallelism, while current has {tp_size} tensor parallelism" + assert ( + wp_size == max_wp + 1 + ), f"The optimizer states are save for {max_wp+1} parallelism, while current has {wp_size} weight parallelism" + + zero_rank = gpc.get_local_rank(ParallelMode.ZERO1) + tp_rank = gpc.get_local_rank(ParallelMode.TENSOR) + wp_rank = gpc.get_local_rank(ParallelMode.WEIGHT) + pp_rank = gpc.get_local_rank(ParallelMode.PIPELINE) + dp_rank = gpc.get_local_rank(ParallelMode.DATA) + if gpc.config.parallel.tensor.mode == "isp": + fp = f"optimizer_tp{tp_rank}_wp{wp_rank}_pp{pp_rank}_dp{dp_rank}.pt" + else: + fp = f"optimizer_tp{tp_rank}_pp{pp_rank}_zo{zero_rank}.pt" - fp = f"optimizer_tp{gpc.get_local_rank(ParallelMode.TENSOR)}_" - fp += f"pp{gpc.get_local_rank(ParallelMode.PIPELINE)}_" - fp += f"zo{zero_rank}.pt" states = llm_load(os.path.join(folder, fp), map_location=get_current_device()) if isinstance(optim, HybridZeroOptimizer): From e4d1ff89a101e5fc9386436af7bce5ecd2b4b6fc Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Tue, 16 Jan 2024 15:55:36 +0800 Subject: [PATCH 098/153] fix(model_checkpoint.py): fix dp/zo size check --- internlm/utils/model_checkpoint.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/internlm/utils/model_checkpoint.py b/internlm/utils/model_checkpoint.py index fa9f50df..5c21af90 100644 --- a/internlm/utils/model_checkpoint.py +++ b/internlm/utils/model_checkpoint.py @@ -573,12 +573,14 @@ def load_optimizer_checkpoint(folder, optim): pp_size = gpc.get_world_size(ParallelMode.PIPELINE) dp_size = gpc.get_world_size(ParallelMode.DATA) - assert ( - dp_size == max_dp + 1 - ), f"The optimizer states are save for {max_dp+1} data parallelism, while current has {dp_size} data parallelism" - assert ( - zero_size == max_zero + 1 - ), f"The optimizer states are save for {max_zero+1} zero parallel, while current has {zero_size} zero broadcast range." 
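        # Sketch of the corrected check introduced below: isp shards optimizer
        # states across dp while the other tp modes shard across zero1, so only
        # the dimension that matches the current mode is validated:
        #     if tp_mode == "isp": assert dp_size == max_dp + 1
        #     else:                assert zero_size == max_zero + 1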
+ if gpc.config.parallel.tensor.mode == "isp": + assert ( + dp_size == max_dp + 1 + ), f"The optimizer states are save for {max_dp+1} data parallelism, while current has {dp_size} data parallelism" + if gpc.config.parallel.tensor.mode != "isp": + assert ( + zero_size == max_zero + 1 + ), f"The optimizer states are save for {max_zero+1} zero parallel, while current has {zero_size} zero broadcast range." assert ( pp_size == max_pp + 1 ), f"The optimizer states are save for {max_pp+1} pipelines, while current has {pp_size} pipelines" From f2f88a773d5f95ee01a3234efc3c3c4ab9a3f4e4 Mon Sep 17 00:00:00 2001 From: Wenwen Qu Date: Wed, 27 Dec 2023 12:03:30 +0800 Subject: [PATCH 099/153] support sequence parallel for moe --- .../core/scheduler/no_pipeline_scheduler.py | 6 +++ internlm/core/scheduler/pipeline_scheduler.py | 11 ++-- internlm/model/modeling_moe.py | 52 ++++++++++++------- internlm/moe/sharded_moe.py | 4 +- 4 files changed, 50 insertions(+), 23 deletions(-) diff --git a/internlm/core/scheduler/no_pipeline_scheduler.py b/internlm/core/scheduler/no_pipeline_scheduler.py index 6e8454ff..2092548d 100644 --- a/internlm/core/scheduler/no_pipeline_scheduler.py +++ b/internlm/core/scheduler/no_pipeline_scheduler.py @@ -6,7 +6,9 @@ from typing import Any, Callable, Iterable, List, Optional import torch +import torch.distributed as dist +from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc from internlm.core.engine import Engine from internlm.utils.common import conditional_context @@ -126,6 +128,10 @@ def _train_one_batch( if hasattr(gpc.config.model, "num_experts") else torch.tensor(0.0, device=torch.cuda.current_device(), dtype=gpc.config.model.get("dtype")) ) + # the moe_loss is computed among the "tensor" group if sequence parallel is enabled, + # so we need to do allreduce + if gpc.config.parallel.sequence_parallel: + dist.all_reduce(moe_loss, op=dist.ReduceOp.AVG, group=gpc.get_group(ParallelMode.TENSOR)) moe_loss /= scale_loss loss /= scale_loss loss += moe_loss diff --git a/internlm/core/scheduler/pipeline_scheduler.py b/internlm/core/scheduler/pipeline_scheduler.py index 622c91f6..7d8fd3fd 100644 --- a/internlm/core/scheduler/pipeline_scheduler.py +++ b/internlm/core/scheduler/pipeline_scheduler.py @@ -133,10 +133,7 @@ def __init__( tensor_shape if tensor_shape is None or isinstance(tensor_shape, torch.Size) else torch.Size(tensor_shape) ) - self.scatter_gather_tensors = ( - scatter_gather_tensors - and gpc.is_using_parallel_mode(ParallelMode.TENSOR) - ) + self.scatter_gather_tensors = scatter_gather_tensors and gpc.is_using_parallel_mode(ParallelMode.TENSOR) if gpc.config.parallel.sequence_parallel: self.scatter_gather_tensors = False @@ -293,6 +290,9 @@ def _forward_step( if hasattr(gpc.config.model, "num_experts") else torch.tensor(0.0, device=torch.cuda.current_device(), dtype=gpc.config.model.get("dtype")) ) + # the moe_loss is computed among the "tensor" group if sequence parallel is enabled, so we need to do allreduce + if gpc.config.parallel.sequence_parallel: + dist.all_reduce(moe_loss, op=dist.ReduceOp.AVG, group=gpc.get_group(ParallelMode.TENSOR)) moe_loss /= self.num_microbatches accum_moe_loss.add_(moe_loss.detach()) @@ -840,6 +840,9 @@ def _forward_step(self, engine, chunk_id): if hasattr(gpc.config.model, "num_experts") else torch.tensor(0.0, device=torch.cuda.current_device(), dtype=gpc.config.model.get("dtype")) ) + # the moe_loss is computed among the "tensor" group if sequence parallel is enabled, so we need to do 
allreduce + if gpc.config.parallel.sequence_parallel: + dist.all_reduce(moe_loss, op=dist.ReduceOp.AVG, group=gpc.get_group(ParallelMode.TENSOR)) moe_loss /= self.num_microbatches if self._accum_moe_loss is not None: diff --git a/internlm/model/modeling_moe.py b/internlm/model/modeling_moe.py index 9d9f3238..6c8f414d 100644 --- a/internlm/model/modeling_moe.py +++ b/internlm/model/modeling_moe.py @@ -15,13 +15,18 @@ from internlm.initialize.initialize_tensor import normal_, scaled_init_method_normal from internlm.model.embedding import Embedding1D from internlm.model.linear import ( - FeedForward, + MegatronScaleColumnParallelLinear, RewardModelLinear, ScaleColumnParallelLinear, + get_mlp_cls, ) from internlm.model.moe import MoE from internlm.model.multi_head_attention import MHA -from internlm.model.utils import gather_forward_split_backward, try_import_RMSNorm +from internlm.model.utils import ( + gather_forward_split_backward, + split_forward_gather_backward, + try_import_RMSNorm, +) from internlm.solver.pipeline_utils import partition_uniform from internlm.utils.checkpoint import activation_checkpoint from internlm.utils.common import filter_kwargs @@ -94,6 +99,7 @@ def __init__( moe_drop_tokens: bool = True, moe_use_rts: bool = True, moe_use_residual: bool = False, + tp_mode: str = "mtp", ): super().__init__() self.checkpoint = checkpoint @@ -103,10 +109,13 @@ def __init__( self.use_flash_attn = use_flash_attn head_dim = hidden_size // num_attention_heads + self.tp_mode = tp_mode + parallel_mode = ParallelMode.WEIGHT if self.tp_mode == "isp" else ParallelMode.TENSOR self.mixer = MHA( embed_dim=hidden_size, num_heads=num_attention_heads, - process_group=gpc.get_group(ParallelMode.TENSOR), + process_group=gpc.get_group(parallel_mode), + sequence_process_group=gpc.get_group(ParallelMode.TENSOR), dropout=attn_drop_rate, max_position_embeddings=max_position_embeddings, softmax_scale=1 / math.sqrt(head_dim), @@ -118,6 +127,7 @@ def __init__( use_flash_attn=use_flash_attn, device=device, dtype=dtype, + tp_mode=self.tp_mode, ) self.dropout1 = nn.Dropout(drop_rate) @@ -147,11 +157,12 @@ def __init__( ep_size = gpc.get_world_size(ParallelMode.EXPERT) if num_experts <= 1: # dense, not MoE if use_swiglu: - self.mlp = FeedForward( + mlp_cls = get_mlp_cls(self.tp_mode) + self.mlp = mlp_cls( hidden_size, int(hidden_size * mlp_ratio), out_features=hidden_size, - process_group=gpc.get_group(ParallelMode.TENSOR), + process_group=gpc.get_group(parallel_mode), bias=False, device=device, dtype=dtype, @@ -162,7 +173,7 @@ def __init__( int(hidden_size * mlp_ratio), out_features=hidden_size, activation="gelu_approx", - process_group=gpc.get_group(ParallelMode.TENSOR), + process_group=gpc.get_group(parallel_mode), bias1=False, bias2=False, sequence_parallel=gpc.config.model.sequence_parallel, @@ -171,9 +182,6 @@ def __init__( device=device, dtype=dtype, ) - for _, param in self.mlp.named_parameters(): - if gpc.get_world_size(ParallelMode.TENSOR) > 1: - setattr(param, IS_TENSOR_PARALLEL, True) else: # replace mlp by MoE module. The expert in MoE is a FeedForward module. 
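            # (note: the MoE gate built below stays replicated and is forced to
            #  fp32 via set_fp32_attr_to_module, while each expert is a plain
            #  FeedForward whose parallelism follows tp_mode)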
self.mlp = MoE( @@ -191,9 +199,6 @@ def __init__( device=device, dtype=dtype, ) - for _, param in self.mlp.moe_layer.experts.named_parameters(): - if gpc.get_world_size(ParallelMode.TENSOR) > 1: - setattr(param, IS_TENSOR_PARALLEL, True) set_fp32_attr_to_module(self.mlp.moe_layer.gate) self.dropout2 = nn.Dropout(drop_rate) @@ -374,11 +379,16 @@ def __init__( super().__init__() checkpoint_layer_num = int(num_layers * checkpoint) + self.tp_mode = gpc.config.parallel.tensor.mode if is_reward: head_cls = RewardModelLinear else: - head_cls = ScaleColumnParallelLinear + head_cls = ( + ScaleColumnParallelLinear + if self.tp_mode in ["mtp", "fsp", "isp"] + else MegatronScaleColumnParallelLinear + ) if first: if embed_split_hidden: self.embedding = Embedding1D(num_embeddings=vocab_size, embedding_dim=hidden_size) @@ -395,8 +405,6 @@ def __init__( ) for _, param in self.embedding.named_parameters(): normal_(std=0.0052)(param) - if gpc.get_world_size(ParallelMode.TENSOR) > 1: - setattr(param, IS_TENSOR_PARALLEL, True) self.embed_grad_scale = embed_grad_scale self.blocks = nn.ModuleList( [ @@ -428,6 +436,7 @@ def __init__( moe_drop_tokens=moe_drop_tokens, moe_use_rts=moe_use_rts, moe_use_residual=moe_use_residual, + tp_mode=self.tp_mode, ) for lid in range(num_layers) ] @@ -449,8 +458,7 @@ def __init__( ) for _, param in self.head.named_parameters(): normal_(std=0.0052)(param) - if gpc.get_world_size(ParallelMode.TENSOR) > 1: - setattr(param, IS_TENSOR_PARALLEL, True) + self.parallel_output = parallel_output def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=None, inference_params=None): @@ -475,6 +483,10 @@ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N assert len(indexes) == 1 # The indexes are used to indicate the actual position IDs of each token in the packed input. indexes = indexes[0] + # if the sequence parallel mode is 'isp', the indexes should also be split in sequence dimension. 
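            # e.g. with sequence-parallel size 2 and a packed sequence of
            # length 8, hidden_states on rank0/rank1 cover tokens 0..3 / 4..7,
            # so indexes must be sliced the same way before rotary embedding
            # consumes them.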
+ if gpc.config.parallel.sequence_parallel and self.tp_mode == "isp": + indexes = split_forward_gather_backward(indexes, ParallelMode.TENSOR, dim=0) + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() if cu_seqlens is not None else None moe_losses = [] @@ -491,7 +503,11 @@ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N if hasattr(self, "norm"): hidden_states = self.norm(hidden_states.float()) if hasattr(self, "head"): - hidden_states = self.head(hidden_states) + # Evaluation + if hidden_states.ndim == 3: + hidden_states = self.head(hidden_states, gather_dim=1) + else: # Training + hidden_states = self.head(hidden_states, gather_dim=0) if not self.parallel_output: hidden_states = gather_forward_split_backward(hidden_states, ParallelMode.TENSOR, dim=-1) diff --git a/internlm/moe/sharded_moe.py b/internlm/moe/sharded_moe.py index dbee2a49..ac7613a2 100644 --- a/internlm/moe/sharded_moe.py +++ b/internlm/moe/sharded_moe.py @@ -12,6 +12,8 @@ from torch import Tensor from torch.nn import Module +from internlm.core.context import ParallelMode +from internlm.core.context import global_context as gpc from internlm.utils.logger import get_logger from internlm.utils.megatron_timers import megatron_timer as timer @@ -189,7 +191,7 @@ def top1gating( # if we don't want to drop any tokens if not drop_tokens: new_capacity = torch.max(exp_counts).to(logits.device) - dist.all_reduce(new_capacity, op=dist.ReduceOp.MAX, group=dist.get_world_group()) + dist.all_reduce(new_capacity, op=dist.ReduceOp.MAX, group=gpc.get_group(ParallelMode.GLOBAL)) capacity = new_capacity # Compute l_aux From 6e012b148ee036471fd15b241bd1dd21e03b9efa Mon Sep 17 00:00:00 2001 From: Wenwen Qu Date: Wed, 17 Jan 2024 11:24:02 +0800 Subject: [PATCH 100/153] modify expert groups --- .../core/context/process_group_initializer.py | 76 +++---------------- 1 file changed, 11 insertions(+), 65 deletions(-) diff --git a/internlm/core/context/process_group_initializer.py b/internlm/core/context/process_group_initializer.py index 5e59df22..dcf429d3 100644 --- a/internlm/core/context/process_group_initializer.py +++ b/internlm/core/context/process_group_initializer.py @@ -532,67 +532,6 @@ def init_dist_group(self, use_cpu: bool = False): return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode -class Initializer_Expert(ProcessGroupInitializer): - """A ProcessGroupInitializer for expert parallelism. - - Args: - rank (int): The rank of current process. - world_size (int): Size of whole communication world. - data_parallel_size (int): Size of data parallel. - pipeline_parallel_size (int): Size of pipeline parallel. - tensor_parallel_size (int): Size of tensor parallel. - zero1_parallel_size (int): Size of zero-1 parallel. - expert_parallel_size (int): Size of expert parallel. - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.num_expert_parallel_group = self.world_size // self.expert_parallel_size - - assert self.world_size % self.num_expert_parallel_group == 0 - - # TODO: to match expert parallel with differnt data parallel size - assert self.data_parallel_size == self.expert_parallel_size - - def init_dist_group(self, use_cpu: bool = False): - """Initialize expert parallel groups, and assign local_ranks and groups to each gpu. 
- - Example: world_size = 8, model_parallel_size = 2, expert_parallel_size = 4 - model_parallel_group = [0,1], [2,3], [4,5], [6,7] - expert_parallel_group = [0,2,4,6], [1,3,5,7] - - Returns: - Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode): - A expert parallelism's information tuple. - """ - local_rank = None - ranks_in_group = None - process_group = None - cpu_group = None - group_world_size = None - mode = ParallelMode.EXPERT - - for i in range(self.num_expert_parallel_group): - ranks = list(range(i, self.world_size, self.num_expert_parallel_group)) - group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT) - if use_cpu: - group_cpu = ( - dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT) - if dist.get_backend() != "gloo" - else group - ) - else: - group_cpu = None - if self.rank in ranks: - local_rank = ranks.index(self.rank) - group_world_size = len(ranks) - process_group = group - cpu_group = group_cpu - ranks_in_group = ranks - - return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode - - class Initializer_Expert_Data(ProcessGroupInitializer): """A ProcessGroupInitializer for expert data parallelism. @@ -608,7 +547,9 @@ class Initializer_Expert_Data(ProcessGroupInitializer): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.num_expert_parallel_group = self.world_size // self.expert_parallel_size + + self.ranks_num_per_pp = self.world_size // self.pipeline_parallel_size + assert self.data_parallel_size % self.expert_parallel_size == 0 def _get_expert_parallel_ranks(self): """ @@ -620,9 +561,14 @@ def _get_expert_parallel_ranks(self): expert_data_parallel_group = [0,4], [2,6], [1,5], [3,7] """ data_parallel_groups = [] - model_parallel_size = self.pipeline_parallel_size * self.tensor_parallel_size - for i in range(model_parallel_size): - data_parallel_groups.append(list(range(i, self.world_size, model_parallel_size))) + for i in range(self.pipeline_parallel_size): + for j in range(self.sequence_parallel_size): + data_parallel_groups.append( + [ + i * self.ranks_num_per_pp + j + k * self.sequence_parallel_size + for k in range(self.data_parallel_size) + ] + ) expert_parallel_groups = [] expert_data_parallel_groups = [] From 18e6e78e16a62e140fd7b0179acff61c3f1f86c4 Mon Sep 17 00:00:00 2001 From: "chenxun.p" <759046501@qq.com> Date: Wed, 17 Jan 2024 16:15:34 +0800 Subject: [PATCH 101/153] feat(isp): support interleaved pipeline parallel scheduler --- internlm/core/communication/isp.py | 127 ++++++++++++++++++++--------- 1 file changed, 88 insertions(+), 39 deletions(-) diff --git a/internlm/core/communication/isp.py b/internlm/core/communication/isp.py index eee38c5b..e048b623 100644 --- a/internlm/core/communication/isp.py +++ b/internlm/core/communication/isp.py @@ -3,7 +3,7 @@ from dataclasses import dataclass from functools import partial -from typing import Dict, List, Union +from typing import Any, Dict, List, Union import torch from torch import distributed as dist @@ -141,6 +141,22 @@ def reset_lazy_pools(self) -> None: self._reduce_scatter_memory_pool = {} +class ISPOverlapState: + def __init__(self) -> None: + self.num_blocks: int = 0 + self.embedding: List[nn.Module] = [] + self.head: List[nn.Module] = [] + self.last_block: nn.Moudle = None + self.isp_outs: List[nn.Module] = [] + self.isp_modules: List[nn.Module] = [] + self.index_to_isp_module: Dict[int, nn.Module] = {} + self.module_to_index: Dict[nn.Module, int] = {} + self.weight_global_handle: Dict[str, Any] = {} + 
self.weight_global_output: Dict[str, torch.Tensor] = {} + self.bias_global_handle: Dict[str, Any] = {} + self.bias_global_output: Dict[str, torch.Tensor] = {} + + class ISPCommunicator: """ ISP Communicator for managing the all-gather and reduce_scatter of Intern Sequence Parallel. @@ -160,72 +176,83 @@ def __init__( self.overlap = overlap self.enable_memory_pool = overlap and enable_memory_pool self.model_conf = model_conf + self.module_name = model_conf.modules.copy() self.is_forward = True + self.reduce_scatter_handlers = {} - self._isp_outs = [] - self._isp_modules = [] - self._module_name = model_conf.modules.copy() + # real overlap state for each chunk. + self._overlap_states: Dict[int, ISPOverlapState] = {} + # inner interface variables of overlap state. + self._num_blocks = None + self._head = None + self._embedding = None + self._last_block = None + self._isp_outs = None + self._isp_modules = None # key: isp module; value: module global all-gather op handle - self._weight_global_handle = {} + self._weight_global_handle = None # key: isp module; value: module bias global all-gather op handle - self._bias_global_handle = {} - self.reduce_scatter_handlers = {} + self._bias_global_handle = None # key: isp module; value: module global weight after all-gather op - self._weight_global_output = {} + self._weight_global_output = None # key: isp module; value: module bias global weight after all-gather op - self._bias_global_output = {} + self._bias_global_output = None # key: isp module; value: transformer block index - self._module_to_index = {} + self._module_to_index = None # key: transformer block index; value: isp modules - self._index_to_isp_module = {} - self._last_block = None - self._head = [] - self._embedding = [] - - # just want to share same for loop for ModuleList and Module - model = model if isinstance(model, nn.ModuleList) else [model] - for chunk in model: - if isinstance(chunk, NaiveAMPModel): - chunk = chunk.model - self._parse_model_structure(chunk) - - self.num_blocks = len(self._index_to_isp_module) + self._index_to_isp_module = None + # init memory pool if necessary. if self.enable_memory_pool: self.memory_pool = MemoryPool(model_conf) else: self.memory_pool = None + # init overlap states if necessary. if self.overlap: - self._register_sync_parameters_hook() + # just want to share same for loop for modulelist and module. + model = model if isinstance(model, nn.ModuleList) else [model] + # build overlap states for every chunk. + for chunk_id, chunk in enumerate(model): + if isinstance(chunk, NaiveAMPModel): + chunk = chunk.model + self._parse_model_structure(chunk_id, chunk) + # register overlap hooks for every chunk. + for chunk_id in range(len(model)): + self.switch_current_model_chunk(chunk_id) + self._register_sync_parameters_hook() + # switch to chunk 0 at first. 
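        # (each chunk gets an independent ISPOverlapState, so the all-gather and
        #  reduce-scatter bookkeeping of one chunk cannot leak into another when
        #  the interleaved scheduler alternates between chunks)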
+ self.switch_current_model_chunk(0) + + def _parse_model_structure(self, cid: int, model: nn.Module) -> None: + self._overlap_states[cid] = ISPOverlapState() - def _parse_model_structure(self, model: nn.Module) -> None: # Important: only works for llama-class models - for chunk_name, children in model.named_children(): + for _, children in model.named_children(): if isinstance(children, ScaleColumnParallelLinear): setattr(children, "isp_name", "head") - self._head.append(children) + self._overlap_states[cid].head.append(children) elif isinstance(children, Embedding1D): - self._embedding.append(children) + self._overlap_states[cid].embedding.append(children) elif isinstance(children, nn.ModuleList): - self._last_block = children[-1] + self._overlap_states[cid].last_block = children[-1] for idx, block in enumerate(children): - self._index_to_isp_module[idx] = [] + self._overlap_states[cid].index_to_isp_module[idx] = [] for sub_name, sub in block.named_children(): for name, child in sub.named_children(): if name == "out_proj": - self._isp_outs.append(child) - self._module_to_index[child] = idx + self._overlap_states[cid].isp_outs.append(child) + self._overlap_states[cid].module_to_index[child] = idx if isinstance(child, ISPLinear): - self._module_to_index[child] = idx - self._isp_modules.append(child) - self._index_to_isp_module[idx].append(child) + self._overlap_states[cid].module_to_index[child] = idx + self._overlap_states[cid].isp_modules.append(child) + self._overlap_states[cid].index_to_isp_module[idx].append(child) setattr(child, "isp_name", name) - full_name = f"{chunk_name}.{idx}.{sub_name}.{name}" + full_name = f"{cid}.{idx}.{sub_name}.{name}" setattr( child.weight, "isp_reduce_scatter_name", @@ -238,6 +265,8 @@ def _parse_model_structure(self, model: nn.Module) -> None: f"{full_name}.bias", ) + self._overlap_states[cid].num_blocks = len(self._overlap_states[cid].index_to_isp_module) + def _all_gather_module_weight(self, module): with_bias = module.bias is not None block_index = self._module_to_index[module] @@ -319,7 +348,7 @@ def _pre_forward_hook_for_out_proj(self, module: nn.Module, *args): # pylint: d self._all_gather_block_weight(block_index - 1) else: # start the all-gather for next block - if block_index + 1 < self.num_blocks: + if block_index + 1 < self._num_blocks: self._all_gather_block_weight(block_index + 1) def _pre_forward_hook_for_module(self, module: nn.Module, *args): # pylint: disable=W0613 @@ -329,7 +358,7 @@ def _pre_forward_hook_for_module(self, module: nn.Module, *args): # pylint: dis self._wait_handle(module) def _pre_forward_hook_for_block(self, *args): # pylint: disable=W0613 - for module in self._index_to_isp_module[self.num_blocks - 1]: + for module in self._index_to_isp_module[self._num_blocks - 1]: self._all_gather_module_weight(module) self._wait_handle(module) @@ -343,7 +372,7 @@ def _post_backward_hook_for_head(self, *args): # pylint: disable=W0613 def _pre_backward_hook_for_head(self, *args): # pylint: disable=W0613 if self.is_forward is False: - self._all_gather_block_weight(self.num_blocks - 1) + self._all_gather_block_weight(self._num_blocks - 1) def _pre_backward_hook_for_module(self, module: nn.Module, *args): # pylint: disable=W0613 # wait handle for current module @@ -413,6 +442,20 @@ def _get_constant_zero(self, size: tuple) -> torch.Tensor: device=self.model_conf.device, ).contiguous() + def switch_current_model_chunk(self, chunk_id: int) -> None: + self._isp_outs = self._overlap_states[chunk_id].isp_outs + self._isp_modules = 
self._overlap_states[chunk_id].isp_modules + self._weight_global_handle = self._overlap_states[chunk_id].weight_global_handle + self._bias_global_handle = self._overlap_states[chunk_id].bias_global_handle + self._weight_global_output = self._overlap_states[chunk_id].weight_global_output + self._bias_global_output = self._overlap_states[chunk_id].bias_global_output + self._module_to_index = self._overlap_states[chunk_id].module_to_index + self._index_to_isp_module = self._overlap_states[chunk_id].index_to_isp_module + self._last_block = self._overlap_states[chunk_id].last_block + self._head = self._overlap_states[chunk_id].head + self._embedding = self._overlap_states[chunk_id].embedding + self._num_blocks = self._overlap_states[chunk_id].num_blocks + # communication operation interfaces def all_gather(self, tensor: torch.Tensor, module: nn.Module, is_bias: bool = False): @@ -481,6 +524,9 @@ def __init__(self, overlap_handler: ISPCommunicator, zero_optim) -> None: def before_forward(self, scheduler, inputs) -> None: if self._isp_communicator.model_checkpoint: self._isp_communicator.is_forward = True + # switch model chunk before forward + chunk_id = 0 if gpc.virtual_pipeline_parallel_rank is None else gpc.virtual_pipeline_parallel_rank + self._isp_communicator.switch_current_model_chunk(chunk_id) def after_forward(self, scheduler, outputs) -> None: pass @@ -494,6 +540,9 @@ def after_criterion(self, scheduler, loss) -> None: def before_backward(self, scheduler, outputs, outputs_grad) -> None: if self._isp_communicator.model_checkpoint: self._isp_communicator.is_forward = False + # switch model chunk before forward + chunk_id = 0 if gpc.virtual_pipeline_parallel_rank is None else gpc.virtual_pipeline_parallel_rank + self._isp_communicator.switch_current_model_chunk(chunk_id) def after_backward(self, scheduler, inputs_grad) -> None: # accumulate left gradients in last bucket after backward. 
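In short, every scheduler hook in this patch re-points the communicator at the chunk about to run. A minimal self-contained sketch of that selection rule (select_chunk_id is a hypothetical helper; the argument stands in for gpc.virtual_pipeline_parallel_rank as used above):

    from typing import Optional

    def select_chunk_id(virtual_pp_rank: Optional[int]) -> int:
        # Non-interleaved schedulers never set a virtual pipeline rank,
        # which maps to the single model chunk 0.
        return 0 if virtual_pp_rank is None else virtual_pp_rank

    assert select_chunk_id(None) == 0
    assert select_chunk_id(1) == 1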
From 55ebba08bfc263ca4b19e0c32f5dd5ab9b45ab16 Mon Sep 17 00:00:00 2001 From: Wenwen Qu Date: Wed, 17 Jan 2024 17:08:00 +0800 Subject: [PATCH 102/153] add moe group --- internlm/core/context/__init__.py | 6 +- internlm/core/context/parallel_context.py | 2 +- .../core/context/process_group_initializer.py | 2 +- internlm/initialize/launch.py | 2 +- internlm/model/utils.py | 12 --- .../solver/optimizer/hybrid_zero_optim.py | 36 +++---- internlm/solver/optimizer/utils.py | 6 +- internlm/train/training_internlm.py | 63 +++++++----- internlm/train/utils.py | 95 +++---------------- internlm/utils/parallel.py | 9 ++ 10 files changed, 88 insertions(+), 145 deletions(-) diff --git a/internlm/core/context/__init__.py b/internlm/core/context/__init__.py index 13da8f58..a306ad70 100644 --- a/internlm/core/context/__init__.py +++ b/internlm/core/context/__init__.py @@ -1,7 +1,8 @@ from .parallel_context import ( - IS_TENSOR_ZERO_PARALLEL, - IS_TENSOR_DATA_PARALLEL, IS_REPLICA_ZERO_PARALLEL, + IS_TENSOR_DATA_PARALLEL, + IS_TENSOR_EXPERT_DATA_PARALLEL, + IS_TENSOR_ZERO_PARALLEL, IS_WEIGHT_ZERO_PARALLEL, Config, ParallelContext, @@ -34,6 +35,7 @@ "IS_TENSOR_DATA_PARALLEL", "IS_REPLICA_ZERO_PARALLEL", "IS_WEIGHT_ZERO_PARALLEL", + "IS_TENSOR_EXPERT_DATA_PARALLEL", "global_context", "ParallelContext", "ParallelMode", diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py index 826b51a1..62f1e42d 100644 --- a/internlm/core/context/parallel_context.py +++ b/internlm/core/context/parallel_context.py @@ -24,13 +24,13 @@ from .process_group_initializer import ParallelMode from .random import add_seed, get_seeds, set_mode - IS_REPLICA_ZERO_PARALLEL = "is_replica_zero_parallel" # for isp, with optimizer split in dp group IS_TENSOR_DATA_PARALLEL = "is_tensor_data_parallel" # for mtp/msp/fsp, with optimizer split in zero1 group IS_TENSOR_ZERO_PARALLEL = "is_tensor_zero_parallel" IS_WEIGHT_ZERO_PARALLEL = "is_weight_zero_parallel" +IS_TENSOR_EXPERT_DATA_PARALLEL = "is_tensor_expert_data_parallel" logger = get_logger(__file__) diff --git a/internlm/core/context/process_group_initializer.py b/internlm/core/context/process_group_initializer.py index dcf429d3..b6e72527 100644 --- a/internlm/core/context/process_group_initializer.py +++ b/internlm/core/context/process_group_initializer.py @@ -554,7 +554,7 @@ def __init__(self, *args, **kwargs): def _get_expert_parallel_ranks(self): """ Create expert and data parallel groups - Example: world_size = 8, model_parallel_size = 2, expert_parallel_size = 2 + Example: world_size = 8, tensor_parallel_size = 2, expert_parallel_size = 2 model_parallel_group = [0,1], [2,3], [4,5], [6,7] data_parallel_group = [0,2,4,6], [1,3,5,7] expert_parallel_group = [0,2], [4,6], [1,3], [5,7] diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py index eedb0e65..ee9f4d4a 100644 --- a/internlm/initialize/launch.py +++ b/internlm/initialize/launch.py @@ -377,7 +377,7 @@ def args_sanity_check(): assert ( not optim_ckpt.overlap_sync_grad & optim_ckpt.overlap_sync_param ), "not support overlap and moe at the same time" - assert gpc.config.parallel.zero1.size == -1, "moe only support zero1, set zero1=dict(size=-1,...) 
can fix this" + assert gpc.config.parallel.zero1.size == gpc.get_world_size(ParallelMode.DATA), "moe only support zero1" def launch( diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 1e6d76b0..48eb4b78 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -724,18 +724,6 @@ def is_moe_param(param: torch.Tensor) -> bool: return False -def is_gate_param(param: torch.Tensor) -> bool: - if hasattr(param, "is_gate") and param.is_gate: - return True - return False - - -def is_norm_param(param: torch.Tensor) -> bool: - if hasattr(param, "is_norm") and param.is_norm: - return True - return False - - def Silu(w1_o, w2_o): return F.silu(w1_o) * w2_o diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index f7ce3bdc..9ef3aecf 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -13,8 +13,9 @@ from internlm.core.context import global_context as gpc from internlm.core.context.parallel_context import ( IS_TENSOR_DATA_PARALLEL, - IS_WEIGHT_ZERO_PARALLEL, + IS_TENSOR_EXPERT_DATA_PARALLEL, IS_TENSOR_ZERO_PARALLEL, + IS_WEIGHT_ZERO_PARALLEL, ) from internlm.monitor import send_alert_message from internlm.solver.optimizer.store import ( @@ -167,6 +168,8 @@ def __init__( # TODO _broadcast_parallel_mode is not only used in broadcast, maybe can change its name self._broadcast_parallel_mode.append(zero_mode) + if self._is_moe_group(param_group): + grad_reduce_mode = ParallelMode.EXPERT_DATA if param_group["name"] != "embed_head" and self.use_isp: grad_reduce_mode = ParallelMode.WEIGHT_DATA else: @@ -288,12 +291,6 @@ def _partition_param_list(self, group_id, param_group): def _is_moe_group(self, param_group): return "moe" in param_group.keys() and param_group["moe"] - def _is_norm_group(self, param_group): - return "norm" in param_group.keys() and param_group["norm"] - - def _is_gate_group(self, param_group): - return "gate" in param_group.keys() and param_group["gate"] - # TODO check expert dp is correct when enable moe and overlap both def _attach_reduction_hook(self): # we iterate over the fp16 params @@ -619,17 +616,21 @@ def _compute_norm_with_stage( grads = [self.padding_grad.to(dtype)] params = [self.padding_tensor.to(dtype)] - if group_id == 0: + if self.optim.param_groups[group_id]["name"] in ("default", "fp32"): for param in params: if self.use_isp: setattr(param, IS_WEIGHT_ZERO_PARALLEL, True) else: setattr(param, IS_TENSOR_ZERO_PARALLEL, True) - elif group_id == 1: + elif self.optim.param_groups[group_id]["name"] == "embed_head": + # should be isp mode for param in params: setattr(param, IS_TENSOR_DATA_PARALLEL, True) + elif self._is_moe_group(self.optim.param_groups[group_id]): + for param in params: + setattr(param, IS_TENSOR_EXPERT_DATA_PARALLEL, True) else: - raise NotImplementedError("group_id > 1 is not yet implemented.") + raise NotImplementedError("unrecognized parameter group.") norm = 0 if self._clip_grad_norm > 0: @@ -652,6 +653,8 @@ def _compute_norm_with_stage( delattr(param, IS_TENSOR_ZERO_PARALLEL) if hasattr(param, IS_WEIGHT_ZERO_PARALLEL): delattr(param, IS_WEIGHT_ZERO_PARALLEL) + if hasattr(param, IS_TENSOR_EXPERT_DATA_PARALLEL): + delattr(param, IS_TENSOR_EXPERT_DATA_PARALLEL) return norm @@ -830,19 +833,6 @@ def _step(self, closure=None, norms=None): param_shape == flat_fp32_avg_grads.shape ), f"fp32 param and grad have different shape {param_shape} vs {flat_fp32_avg_grads.shape}" - # Parameters shared within a TP group, such as 
norm and moe gate, have precision inconsistency in gradients. - # Therefore, it is recommended to synchronize gradients within the TP group to eliminate accumulated errors. - is_tp_sync_groups = ( - self._is_norm_group(self.optim.param_groups[group_id]), - self._is_gate_group(self.optim.param_groups[group_id]), - ) - if any(is_tp_sync_groups): - dist.all_reduce( - flat_fp32_avg_grads, - op=dist.ReduceOp.AVG, - group=gpc.get_group(ParallelMode.TENSOR), - ) - single_grad_partition_groups.append(flat_fp32_avg_grads) device = self._fp32_flat_param_groups_of_current_rank[group_id].device self._fp32_flat_param_groups_of_current_rank[group_id].grad = flat_fp32_avg_grads.to(device) diff --git a/internlm/solver/optimizer/utils.py b/internlm/solver/optimizer/utils.py index 42a9949f..184b715e 100644 --- a/internlm/solver/optimizer/utils.py +++ b/internlm/solver/optimizer/utils.py @@ -20,6 +20,7 @@ from internlm.utils.parallel import ( is_replica_zero_parallel_parameter, is_tensor_data_parallel_parameter, + is_tensor_expert_data_parallel_parameter, is_tensor_zero_parallel_parameter, is_weight_zero_parallel_parameter, ) @@ -255,6 +256,9 @@ def append_grad(g, p): elif is_weight_zero_parallel_parameter(p): # process all ranks for IS_WEIGHT_ZERO_PARALLEL parameter group append_grad(g, p) + elif is_tensor_expert_data_parallel_parameter(p): + # process all ranks for IS_TENSOR_EXPERT_DATA_PARALLEL parameter group + append_grad(g, p) elif gpc.get_local_rank(weight_parallel_mode) != 0: continue else: @@ -324,7 +328,7 @@ def compute_norm( """ Sum across all model-parallel GPUs. - 1. For the IS_REPLICA_ZERO_PARALLEL parameter group, gradients from rank 0 in the tp/wp process group and + 1. For the IS_REPLICA_ZERO_PARALLEL parameter group, gradients from rank 0 in the tp/wp process group and gradients along the pp+zero dimensions from all ranks should be aggregated. 2. For the IS_TENSOR_DATA_PARALLEL parameter group, gradients along the tp+pp+zero(dp) dimensions from all ranks should be aggregated. 3. For the IS_TENSOR_ZERO_PARALLEL parameter group, gradients along the tp+pp+zero dimensions from all ranks should be aggregated. 
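As a hedged, single-process illustration of rule 1 above (tp_grads and replica_sq_norm are illustrative stand-ins; the real code reduces partial norms over the pp and zero1 process groups instead of a plain sum):

    import torch

    tp_grads = [[torch.ones(4)], [torch.ones(4)]]  # identical replicated grad on 2 tp ranks

    def replica_sq_norm(tp_rank: int, grads) -> float:
        # Only tp rank 0 contributes replicated (e.g. norm-layer) gradients.
        if tp_rank != 0:
            return 0.0
        return sum(float(g.norm()) ** 2 for g in grads)

    # Partial squared norms are then summed across the remaining dimensions.
    total = sum(replica_sq_norm(r, g) for r, g in enumerate(tp_grads))
    assert total == 4.0  # counted once, not once per tp rank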
diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 2822da5a..5d7b8926 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -20,6 +20,15 @@ from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy from torch.utils.data import ConcatDataset, DataLoader +from internlm.core.communication.isp import ISPCommModelConfig, ISPCommunicator +from internlm.core.context import ( + IS_REPLICA_ZERO_PARALLEL, + IS_TENSOR_DATA_PARALLEL, + IS_TENSOR_EXPERT_DATA_PARALLEL, + IS_TENSOR_ZERO_PARALLEL, + IS_WEIGHT_ZERO_PARALLEL, + ParallelMode, +) from internlm.core.context import global_context as gpc from internlm.core.context.random import set_mode from internlm.core.naive_amp import NaiveAMPModel @@ -36,17 +45,17 @@ from internlm.data.utils import DATASET_TYPE_IDS_MAP, unpack_data from internlm.model.embedding import Embedding1D from internlm.model.linear import ( - FeedForward, - RewardModelLinear, - ScaleColumnParallelLinear, BaseScaleColumnParallelLinear, ColumnParallelLinear, + FeedForward, + ISPLinear, + RewardModelLinear, RowParallelLinear, + ScaleColumnParallelLinear, ) +from internlm.model.moe import MoE from internlm.model.multi_head_attention import MHA -from internlm.model.linear import ISPLinear -from internlm.core.communication.isp import ISPCommunicator, ISPCommModelConfig -from internlm.model.utils import try_import_RMSNorm +from internlm.model.utils import is_moe_param, try_import_RMSNorm from internlm.monitor import send_heartbeat, set_env_var from internlm.monitor.monitor import monitor_manager as mm from internlm.solver.beta2_scheduler import Beta2Scheduler @@ -58,25 +67,17 @@ from internlm.utils.logger import get_logger from internlm.utils.megatron_timers import megatron_timer as timer from internlm.utils.parallel import ( + is_replica_zero_parallel_parameter, + is_tensor_data_parallel_parameter, + is_tensor_expert_data_parallel_parameter, + is_tensor_zero_parallel_parameter, + is_weight_zero_parallel_parameter, set_model_params_layer_name, sync_model_param, sync_model_replica_param_group, ) from internlm.utils.registry import MODEL_INITIALIZER from internlm.utils.timeout import llm_timeout -from internlm.core.context import ( - IS_TENSOR_ZERO_PARALLEL, - IS_REPLICA_ZERO_PARALLEL, - IS_TENSOR_DATA_PARALLEL, - IS_WEIGHT_ZERO_PARALLEL, - ParallelMode, -) -from internlm.utils.parallel import ( - is_replica_zero_parallel_parameter, - is_tensor_data_parallel_parameter, - is_tensor_zero_parallel_parameter, - is_weight_zero_parallel_parameter, -) RMSNorm = try_import_RMSNorm() logger = get_logger(__file__) @@ -89,10 +90,12 @@ def _check_module(module): for param in module.parameters(): setattr(param, IS_REPLICA_ZERO_PARALLEL, True) + if isinstance(module, MoE): + for param in module.moe_layer.gate.parameters(): + setattr(param, IS_REPLICA_ZERO_PARALLEL, True) + # embedding and head - if isinstance(module, (Embedding1D, ParallelGPT2Embeddings)) or isinstance( - module, BaseScaleColumnParallelLinear - ): + if isinstance(module, (Embedding1D, ParallelGPT2Embeddings, BaseScaleColumnParallelLinear)): for param in module.parameters(): if gpc.is_initialized(ParallelMode.TENSOR) and gpc.config.parallel.tensor.mode == "isp": setattr(param, IS_TENSOR_DATA_PARALLEL, True) @@ -102,9 +105,20 @@ def _check_module(module): # for linear module if isinstance(module, (ColumnParallelLinear, RowParallelLinear)): for param in module.parameters(): - if gpc.is_initialized(ParallelMode.TENSOR) and 
gpc.config.parallel.tensor.mode != "isp": + if gpc.is_initialized(ParallelMode.EXPERT_DATA) and is_moe_param(param): + # module should be MoE experts's linear + setattr(param, IS_TENSOR_EXPERT_DATA_PARALLEL, True) + elif ( + not is_moe_param(param) + and gpc.is_initialized(ParallelMode.TENSOR) + and gpc.config.parallel.tensor.mode != "isp" + ): setattr(param, IS_TENSOR_ZERO_PARALLEL, True) - elif gpc.is_initialized(ParallelMode.WEIGHT) and gpc.config.parallel.tensor.mode == "isp": + elif ( + not is_moe_param(param) + and gpc.is_initialized(ParallelMode.WEIGHT) + and gpc.config.parallel.tensor.mode == "isp" + ): setattr(param, IS_WEIGHT_ZERO_PARALLEL, True) if not isinstance(model, nn.ModuleList): @@ -123,6 +137,7 @@ def _check_module(module): or is_tensor_data_parallel_parameter(param) or is_tensor_zero_parallel_parameter(param) or is_weight_zero_parallel_parameter(param) + or is_tensor_expert_data_parallel_parameter(param) ), f"parameter with name:{name} has no parallel attribution." diff --git a/internlm/train/utils.py b/internlm/train/utils.py index 58880bb8..cd9ed0ac 100644 --- a/internlm/train/utils.py +++ b/internlm/train/utils.py @@ -4,84 +4,10 @@ from internlm.core.context.parallel_context import ParallelMode from internlm.core.context.parallel_context import global_context as gpc -from internlm.model.utils import is_gate_param, is_moe_param, is_norm_param +from internlm.model.utils import is_moe_param from internlm.utils.parallel import is_tensor_data_parallel_parameter -def split_params_into_different_groups_for_optimizer(param_groups: Tuple[Dict]) -> Tuple[Dict]: - """Split parameters into different groups for optimizer - - Args: - param_groups (Tuple[Dict]): The list of parameter groups to split - Input Example: - >>> ( - >>> {'name': 'default', 'params': [tensor], 'weight_decay' :xxx}, - >>> ) - - Returns: - Tuple[Dict]: list of params groups for optimizer - Output Example: - >>> ( - >>> {'name': 'default','params': [tensor],'weight_decay' :xxx}, - >>> {'name': 'fp32', 'params': [tensor],'weight_decay' :xxx}, - >>> {'name': 'norm', 'norm': True, 'params': [tensor],'weight_decay' :xxx}, - >>> {'name': 'gate', 'gate': True, 'params': [tensor],'weight_decay' :xxx}, - >>> {'name': 'moe_ep_size_4', 'moe': True, 'params': [tensor],'weight_decay' :xxx}, - >>> ) - """ - - if isinstance(param_groups, tuple): - param_groups = list(param_groups) # Tuple cannot be modified - elif isinstance(param_groups, dict): - param_groups = [param_groups] - elif not isinstance(param_groups, list): - raise ValueError(f"Unknown param group type of {type(param_groups)}") - - # create new groups for fp32, norm, moe gate and moe expert - new_groups = {} - new_groups["fp32"] = {"name": "fp32", "params": [], "dp_mode": ParallelMode.DATA} - if gpc.config.model.get("num_experts", 0) > 1: - # norm and gate are special group to force sync (when enable MoE). - for key in ["gate", "norm"]: - new_groups[key] = {"name": key, key: True, "params": [], "dp_mode": ParallelMode.DATA} - for key in gpc.expert_parallel_group_names: - new_groups[key] = {"name": key, "moe": True, "params": [], "dp_mode": ParallelMode.EXPERT_DATA} - - for pgroup in param_groups: - # copy attribute from origin group, we assume the input param_groups only - # have one group, so the attribute will not be copyed multiple times. 
- for ori_key in pgroup.keys(): - if ori_key not in ("name", "params"): - for _, group in new_groups.items(): - group[ori_key] = pgroup[ori_key] - # assign param - origin_params = [] - # first split the norm and gate groups, which are special case to force sync (when enable MoE), - # then fp32 group and the moe group. - for param in pgroup["params"]: - if gpc.config.model.get("num_experts", 0) > 1 and is_norm_param(param): - new_groups["norm"]["params"].append(param) - # gate param means MoE is enabled - elif is_gate_param(param): - new_groups["gate"]["params"].append(param) - elif param.dtype == torch.float32: - new_groups["fp32"]["params"].append(param) - # moe param means MoE is enabled - elif is_moe_param(param): - new_groups[param.group_name]["params"].append(param) - else: - origin_params.append(param) - - # bf16 param group, which is the first group in the param groups - pgroup["params"] = origin_params - pgroup["dp_mode"] = ParallelMode.DATA - - # param groups may contain empty groups, such as fp32 - param_groups.extend(new_groups.values()) - - return tuple(param_groups) - - def split_params_into_different_groups_for_optimizer_with_new_partition_strategy( param_groups: Tuple[Dict], ) -> Tuple[Dict]: @@ -114,9 +40,15 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy # create new groups for IS_TENSOR_DATA_PARALLEL parameter group new_groups = {} - new_groups["embed_head"] = {"name": "embed_head", "params": [], "optimizer_mode": ParallelMode.DATA} + if gpc.config.parallel.tensor.mode == "isp": + new_groups["embed_head"] = {"name": "embed_head", "params": [], "optimizer_mode": ParallelMode.DATA} + new_groups["fp32"] = {"name": "fp32", "params": [], "optimizer_mode": ParallelMode.ZERO1} # new_groups["layer_norm"] = {"name": "layer_norm", "params": [], "optimizer_mode": ParallelMode.ZERO1} + if gpc.config.model.get("num_experts", 0) > 1: + for key in gpc.expert_parallel_group_names: + new_groups[key] = {"name": key, "moe": True, "params": [], "optimizer_mode": ParallelMode.EXPERT_DATA} + for pgroup in param_groups: # copy attribute from origin group, we assume the input param_groups only # have one group, so the attribute will not be copyed multiple times. 
@@ -128,9 +60,15 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy origin_params = [] for param in pgroup["params"]: if is_tensor_data_parallel_parameter(param): + # should not be here if not isp mode new_groups["embed_head"]["params"].append(param) # elif hasattr(param, IS_REPLICA_ZERO_PARALLEL) and getattr(param, IS_REPLICA_ZERO_PARALLEL) is True: # new_groups["layer_norm"]["params"].append(param) + elif param.dtype == torch.float32: + new_groups["fp32"]["params"].append(param) + # moe param means MoE is enabled + elif is_moe_param(param): + new_groups[param.group_name]["params"].append(param) else: origin_params.append(param) @@ -139,10 +77,7 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy pgroup["optimizer_mode"] = ParallelMode.ZERO1 # param groups may contain empty groups, such as embed_head - if gpc.config.parallel.tensor.mode == "isp": - param_groups.extend(new_groups.values()) - else: - assert len(new_groups["embed_head"]["params"]) <= 0 + param_groups.extend(new_groups.values()) # print(f"ht debug params_groups after split default len:{len(param_groups[0]['params'])}", flush=True) # print(f"ht debug params_groups after split embed_head len:{len(param_groups[1]['params'])}", flush=True) diff --git a/internlm/utils/parallel.py b/internlm/utils/parallel.py index e354f3b2..e66612f8 100644 --- a/internlm/utils/parallel.py +++ b/internlm/utils/parallel.py @@ -7,6 +7,7 @@ from internlm.core.context import ( IS_REPLICA_ZERO_PARALLEL, IS_TENSOR_DATA_PARALLEL, + IS_TENSOR_EXPERT_DATA_PARALLEL, IS_TENSOR_ZERO_PARALLEL, IS_WEIGHT_ZERO_PARALLEL, ParallelMode, @@ -46,6 +47,14 @@ def is_weight_zero_parallel_parameter(p): ) +def is_tensor_expert_data_parallel_parameter(p): + return ( + gpc.is_initialized(ParallelMode.TENSOR) + and hasattr(p, IS_TENSOR_EXPERT_DATA_PARALLEL) + and getattr(p, IS_TENSOR_EXPERT_DATA_PARALLEL) + ) + + def sync_model_param(model): r"""Make sure data parameters are consistent during Data Parallel Mode. 
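The attribute machinery above follows one pattern throughout: each parameter is stamped at model-construction time with a string-named sentinel attribute (IS_REPLICA_ZERO_PARALLEL, IS_TENSOR_DATA_PARALLEL, the new IS_TENSOR_EXPERT_DATA_PARALLEL, ...), and later code dispatches on those flags when building optimizer parameter groups and choosing the gradient-reduction process group. A minimal standalone sketch of the idea (plain PyTorch with hypothetical simplified helpers, not InternLM code; the real is_tensor_expert_data_parallel_parameter() additionally guards on gpc.is_initialized(ParallelMode.TENSOR), which needs a live distributed context):

    import torch.nn as nn

    IS_TENSOR_EXPERT_DATA_PARALLEL = "is_tensor_expert_data_parallel"

    def tag_expert_params(module: nn.Module) -> None:
        # Mirrors what _check_module() does for an MoE expert's linear:
        # stamp every parameter with the sentinel attribute.
        for param in module.parameters():
            setattr(param, IS_TENSOR_EXPERT_DATA_PARALLEL, True)

    def is_expert_param(p: nn.Parameter) -> bool:
        # Same hasattr/getattr test as the new helper in internlm/utils/parallel.py,
        # minus the process-group initialization guard.
        return hasattr(p, IS_TENSOR_EXPERT_DATA_PARALLEL) and getattr(p, IS_TENSOR_EXPERT_DATA_PARALLEL)

    expert = nn.Linear(8, 8)
    tag_expert_params(expert)
    assert all(is_expert_param(p) for p in expert.parameters())

    # The optimizer-group split then routes tagged parameters into their own
    # group, analogous to new_groups[param.group_name] for MoE params above.
    groups = {"expert": [], "default": []}
    for p in expert.parameters():
        groups["expert" if is_expert_param(p) else "default"].append(p)

Because the flag lives on the parameter object itself, the final assertion in training_internlm.py can check, for every parameter by name, that it carries at least one parallel attribution, independent of which module produced it.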
From ab039d5b7bc0d5d4e8e0cbff97432fced146e2af Mon Sep 17 00:00:00 2001
From: huangting4201 <1538303371@qq.com>
Date: Wed, 17 Jan 2024 17:22:59 +0800
Subject: [PATCH 103/153] fix(isp.py): fix comment

---
 internlm/core/communication/isp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/internlm/core/communication/isp.py b/internlm/core/communication/isp.py
index e048b623..ea628466 100644
--- a/internlm/core/communication/isp.py
+++ b/internlm/core/communication/isp.py
@@ -540,7 +540,7 @@ def after_criterion(self, scheduler, loss) -> None:
     def before_backward(self, scheduler, outputs, outputs_grad) -> None:
         if self._isp_communicator.model_checkpoint:
             self._isp_communicator.is_forward = False
-        # switch model chunk before forward
+        # switch model chunk before backward
         chunk_id = 0 if gpc.virtual_pipeline_parallel_rank is None else gpc.virtual_pipeline_parallel_rank
         self._isp_communicator.switch_current_model_chunk(chunk_id)

From 8347ab49e8fe665fef96a637a1af6b2810531d62 Mon Sep 17 00:00:00 2001
From: huangting4201 <1538303371@qq.com>
Date: Wed, 17 Jan 2024 17:37:15 +0800
Subject: [PATCH 104/153] feat(model): remove useless debug print

---
 internlm/model/embedding.py            |  4 ----
 internlm/model/modeling_internlm.py    | 12 ------------
 internlm/model/multi_head_attention.py |  8 --------
 internlm/train/utils.py                |  9 ---------
 sort_log.py                            | 17 -----------------
 5 files changed, 50 deletions(-)
 delete mode 100644 sort_log.py

diff --git a/internlm/model/embedding.py b/internlm/model/embedding.py
index 11c71b2c..d1770538 100644
--- a/internlm/model/embedding.py
+++ b/internlm/model/embedding.py
@@ -59,10 +59,6 @@ def forward(self, input_: Tensor) -> Tensor:
         if gpc.config.parallel.sequence_parallel:
             output = split_forward_gather_backward(output, ParallelMode.TENSOR, dim=1)

-        # print(
-        #     f"ht debug embed: rank:{gpc.get_global_rank()} output.shape:{output.shape} output:{output}",
-        #     flush=True,
-        # )
         return output

diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py
index 032fef91..32c1c7b0 100644
--- a/internlm/model/modeling_internlm.py
+++ b/internlm/model/modeling_internlm.py
@@ -231,14 +231,7 @@ def _dropout_and_norm_ffn(_residual, _hidden_states):
         if self.residual_in_fp32:
             residual = residual.to(torch.float32)

-        # print(
-        #     f"ht debug mlp rank:{gpc.get_global_rank()} input.shape:{hidden_states.shape} input:{hidden_states}",
-        #     flush=True,
-        # )
         hidden_states = self.mlp(hidden_states)
-        # print(
-        #     f"ht debug mlp rank:{gpc.get_global_rank()} out.shape:{hidden_states.shape} out:{hidden_states}", flush=True
-        # )

         return hidden_states + residual

@@ -423,11 +416,6 @@ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N
             else:
                 # Training
                 hidden_states = self.head(hidden_states, gather_dim=0)
-            # print(
-            #     f"ht debug head rank:{gpc.get_global_rank()} hidden_states.shape:{hidden_states.shape} hidden_states:{hidden_states}",
-            #     flush=True,
-            # )
-
         if not self.parallel_output:
             hidden_states = gather_forward_split_backward(hidden_states, ParallelMode.TENSOR, dim=-1)
         return hidden_states
diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py
index eba5a6f1..2010e2f5 100644
--- a/internlm/model/multi_head_attention.py
+++ b/internlm/model/multi_head_attention.py
@@ -433,11 +433,9 @@ def _packed_forward(self, x, inference_params=None, **kwargs):
         split x during sequence parallel, we split the batch * seqlen dimension
         (in case batch is small).
""" - # print(f"ht debug mha rank:{gpc.get_global_rank()} wqkv.shape:{self.Wqkv.weight.shape} wqkv:{self.Wqkv.weight}") qkv = self.Wqkv(x) # total x hsz' qkv = rearrange(qkv, "t (three h d) -> t three h d", three=3, d=self.head_dim) # total x 3 x n_head x d qkv = self.rotary_emb(qkv, **kwargs) - # print(f"ht debug mha rank:{gpc.get_global_rank()} qkv.shape:{qkv.shape} qkv:{qkv}", flush=True) kwargs.pop("indexes") if inference_params is None: if gpc.config.model.dtype is torch.float32 and gpc.config.model.use_flash_attn: @@ -452,12 +450,6 @@ def _packed_forward(self, x, inference_params=None, **kwargs): raise RuntimeError("Not support this right now") context = rearrange(context, "b h d -> b (h d)") # recover the shape - # print(f"ht debug mha rank:{gpc.get_global_rank()} context.shape:{context.shape} context:{context}") - # print( - # f"ht debug mha rank:{gpc.get_global_rank()} out_proj.shape:{self.out_proj.weight.shape} out_proj:{self.out_proj.weight}" - # ) out = self.out_proj(context) - # print(f"ht debug mha rank:{gpc.get_global_rank()} out.shape:{out.shape} out:{out}") - return out diff --git a/internlm/train/utils.py b/internlm/train/utils.py index 58880bb8..5c78b5e0 100644 --- a/internlm/train/utils.py +++ b/internlm/train/utils.py @@ -110,12 +110,9 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy elif not isinstance(param_groups, list): raise ValueError(f"Unknown param group type of {type(param_groups)}") - # print(f"ht debug params_groups before split total len:{len(param_groups[0]['params'])}", flush=True) - # create new groups for IS_TENSOR_DATA_PARALLEL parameter group new_groups = {} new_groups["embed_head"] = {"name": "embed_head", "params": [], "optimizer_mode": ParallelMode.DATA} - # new_groups["layer_norm"] = {"name": "layer_norm", "params": [], "optimizer_mode": ParallelMode.ZERO1} for pgroup in param_groups: # copy attribute from origin group, we assume the input param_groups only @@ -129,8 +126,6 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy for param in pgroup["params"]: if is_tensor_data_parallel_parameter(param): new_groups["embed_head"]["params"].append(param) - # elif hasattr(param, IS_REPLICA_ZERO_PARALLEL) and getattr(param, IS_REPLICA_ZERO_PARALLEL) is True: - # new_groups["layer_norm"]["params"].append(param) else: origin_params.append(param) @@ -144,10 +139,6 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy else: assert len(new_groups["embed_head"]["params"]) <= 0 - # print(f"ht debug params_groups after split default len:{len(param_groups[0]['params'])}", flush=True) - # print(f"ht debug params_groups after split embed_head len:{len(param_groups[1]['params'])}", flush=True) - # print(f"ht debug params_groups after split layer_norm len:{len(param_groups[2]['params'])}", flush=True) - return tuple(param_groups) diff --git a/sort_log.py b/sort_log.py deleted file mode 100644 index 786c2282..00000000 --- a/sort_log.py +++ /dev/null @@ -1,17 +0,0 @@ -import re - -# 读取日志信息 -with open("ht.log", "r") as file: - log_content = file.read() - -# 使用正则表达式提取以 "ht debug" 开头、以 "dtype=***" 结尾的日志信息块 -log_blocks = re.findall(r"ht debug.*?device=[^\n]*", log_content, re.DOTALL) - -# 将日志信息块按照 "rank:" 后的整数值进行正序排序 -sorted_log_blocks = sorted(log_blocks, key=lambda x: int(re.search(r"rank:(\d+)", x).group(1))) - -# 将排序后的日志信息块写入新的文件 -with open("sorted.log", "w") as file: - file.write("\n\n".join(sorted_log_blocks)) - -print("日志信息块已按照 rank: 后的整数值进行正序排序,并保存在 sorted_log_blocks.txt 
文件中。") From 7ed1109c7091f5b498db14c2d342197e837f739e Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Wed, 17 Jan 2024 17:51:19 +0800 Subject: [PATCH 105/153] feat(model): fix lint error --- internlm/core/communication/isp.py | 4 ++ internlm/core/context/__init__.py | 4 +- internlm/core/context/parallel_context.py | 1 - internlm/core/scheduler/pipeline_scheduler.py | 5 +-- internlm/model/linear.py | 6 ++- internlm/model/modeling_internlm.py | 1 - .../solver/optimizer/hybrid_zero_optim.py | 2 +- internlm/solver/optimizer/utils.py | 12 ++++-- internlm/train/training_internlm.py | 38 +++++++++---------- internlm/utils/model_checkpoint.py | 15 ++++---- 10 files changed, 46 insertions(+), 42 deletions(-) diff --git a/internlm/core/communication/isp.py b/internlm/core/communication/isp.py index ea628466..53fd731e 100644 --- a/internlm/core/communication/isp.py +++ b/internlm/core/communication/isp.py @@ -142,6 +142,10 @@ def reset_lazy_pools(self) -> None: class ISPOverlapState: + """ + Overlap state for isp. + """ + def __init__(self) -> None: self.num_blocks: int = 0 self.embedding: List[nn.Module] = [] diff --git a/internlm/core/context/__init__.py b/internlm/core/context/__init__.py index 13da8f58..8ff56c31 100644 --- a/internlm/core/context/__init__.py +++ b/internlm/core/context/__init__.py @@ -1,7 +1,7 @@ from .parallel_context import ( - IS_TENSOR_ZERO_PARALLEL, - IS_TENSOR_DATA_PARALLEL, IS_REPLICA_ZERO_PARALLEL, + IS_TENSOR_DATA_PARALLEL, + IS_TENSOR_ZERO_PARALLEL, IS_WEIGHT_ZERO_PARALLEL, Config, ParallelContext, diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py index 826b51a1..378faebd 100644 --- a/internlm/core/context/parallel_context.py +++ b/internlm/core/context/parallel_context.py @@ -24,7 +24,6 @@ from .process_group_initializer import ParallelMode from .random import add_seed, get_seeds, set_mode - IS_REPLICA_ZERO_PARALLEL = "is_replica_zero_parallel" # for isp, with optimizer split in dp group IS_TENSOR_DATA_PARALLEL = "is_tensor_data_parallel" diff --git a/internlm/core/scheduler/pipeline_scheduler.py b/internlm/core/scheduler/pipeline_scheduler.py index 622c91f6..778331ee 100644 --- a/internlm/core/scheduler/pipeline_scheduler.py +++ b/internlm/core/scheduler/pipeline_scheduler.py @@ -133,10 +133,7 @@ def __init__( tensor_shape if tensor_shape is None or isinstance(tensor_shape, torch.Size) else torch.Size(tensor_shape) ) - self.scatter_gather_tensors = ( - scatter_gather_tensors - and gpc.is_using_parallel_mode(ParallelMode.TENSOR) - ) + self.scatter_gather_tensors = scatter_gather_tensors and gpc.is_using_parallel_mode(ParallelMode.TENSOR) if gpc.config.parallel.sequence_parallel: self.scatter_gather_tensors = False diff --git a/internlm/model/linear.py b/internlm/model/linear.py index ed21a21b..9506f608 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -12,8 +12,8 @@ from internlm.core.context import global_context as gpc from internlm.model.utils import ( Silu, - isp_fused_dense_func, fused_dense_func_torch, + isp_fused_dense_func, megatron_fused_dense_func_torch, ) @@ -351,6 +351,10 @@ def __init__( class ISPLinear(ColumnParallelLinear): + """ + Linear class for isp tensor parallel mode. + """ + # class level communicator variable. 
__communicator = None diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index 32c1c7b0..7bb9ffa7 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -31,7 +31,6 @@ from internlm.utils.logger import get_logger from internlm.utils.registry import MODEL_INITIALIZER - MODEL_TYPE = "INTERNLM" logger = get_logger(__file__) diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index f7ce3bdc..1445509d 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -13,8 +13,8 @@ from internlm.core.context import global_context as gpc from internlm.core.context.parallel_context import ( IS_TENSOR_DATA_PARALLEL, - IS_WEIGHT_ZERO_PARALLEL, IS_TENSOR_ZERO_PARALLEL, + IS_WEIGHT_ZERO_PARALLEL, ) from internlm.monitor import send_alert_message from internlm.solver.optimizer.store import ( diff --git a/internlm/solver/optimizer/utils.py b/internlm/solver/optimizer/utils.py index 42a9949f..a9027705 100644 --- a/internlm/solver/optimizer/utils.py +++ b/internlm/solver/optimizer/utils.py @@ -324,11 +324,14 @@ def compute_norm( """ Sum across all model-parallel GPUs. - 1. For the IS_REPLICA_ZERO_PARALLEL parameter group, gradients from rank 0 in the tp/wp process group and + 1. For the IS_REPLICA_ZERO_PARALLEL parameter group, gradients from rank 0 in the tp/wp process group and gradients along the pp+zero dimensions from all ranks should be aggregated. - 2. For the IS_TENSOR_DATA_PARALLEL parameter group, gradients along the tp+pp+zero(dp) dimensions from all ranks should be aggregated. - 3. For the IS_TENSOR_ZERO_PARALLEL parameter group, gradients along the tp+pp+zero dimensions from all ranks should be aggregated. - 4. For the IS_WEIGHT_ZERO_PARALLEL parameter group, gradients along the wp+pp+zero dimensions from all ranks should be aggregated. + 2. For the IS_TENSOR_DATA_PARALLEL parameter group, gradients along the tp+pp+zero(dp) dimensions + from all ranks should be aggregated. + 3. For the IS_TENSOR_ZERO_PARALLEL parameter group, gradients along the tp+pp+zero dimensions + from all ranks should be aggregated. + 4. For the IS_WEIGHT_ZERO_PARALLEL parameter group, gradients along the wp+pp+zero dimensions + from all ranks should be aggregated. 
""" if is_tensor_data_parallel_parameter(parameters[0]): if gpc.is_using_parallel_mode(ParallelMode.TENSOR): @@ -368,6 +371,7 @@ def compute_norm( return total_norm +# ht mark: TODO def compute_param_norm( gradients, parameters, diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 2822da5a..31b6238f 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -20,6 +20,14 @@ from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy from torch.utils.data import ConcatDataset, DataLoader +from internlm.core.communication.isp import ISPCommModelConfig, ISPCommunicator +from internlm.core.context import ( + IS_REPLICA_ZERO_PARALLEL, + IS_TENSOR_DATA_PARALLEL, + IS_TENSOR_ZERO_PARALLEL, + IS_WEIGHT_ZERO_PARALLEL, + ParallelMode, +) from internlm.core.context import global_context as gpc from internlm.core.context.random import set_mode from internlm.core.naive_amp import NaiveAMPModel @@ -36,16 +44,15 @@ from internlm.data.utils import DATASET_TYPE_IDS_MAP, unpack_data from internlm.model.embedding import Embedding1D from internlm.model.linear import ( - FeedForward, - RewardModelLinear, - ScaleColumnParallelLinear, BaseScaleColumnParallelLinear, ColumnParallelLinear, + FeedForward, + ISPLinear, + RewardModelLinear, RowParallelLinear, + ScaleColumnParallelLinear, ) from internlm.model.multi_head_attention import MHA -from internlm.model.linear import ISPLinear -from internlm.core.communication.isp import ISPCommunicator, ISPCommModelConfig from internlm.model.utils import try_import_RMSNorm from internlm.monitor import send_heartbeat, set_env_var from internlm.monitor.monitor import monitor_manager as mm @@ -58,25 +65,16 @@ from internlm.utils.logger import get_logger from internlm.utils.megatron_timers import megatron_timer as timer from internlm.utils.parallel import ( + is_replica_zero_parallel_parameter, + is_tensor_data_parallel_parameter, + is_tensor_zero_parallel_parameter, + is_weight_zero_parallel_parameter, set_model_params_layer_name, sync_model_param, sync_model_replica_param_group, ) from internlm.utils.registry import MODEL_INITIALIZER from internlm.utils.timeout import llm_timeout -from internlm.core.context import ( - IS_TENSOR_ZERO_PARALLEL, - IS_REPLICA_ZERO_PARALLEL, - IS_TENSOR_DATA_PARALLEL, - IS_WEIGHT_ZERO_PARALLEL, - ParallelMode, -) -from internlm.utils.parallel import ( - is_replica_zero_parallel_parameter, - is_tensor_data_parallel_parameter, - is_tensor_zero_parallel_parameter, - is_weight_zero_parallel_parameter, -) RMSNorm = try_import_RMSNorm() logger = get_logger(__file__) @@ -90,9 +88,7 @@ def _check_module(module): setattr(param, IS_REPLICA_ZERO_PARALLEL, True) # embedding and head - if isinstance(module, (Embedding1D, ParallelGPT2Embeddings)) or isinstance( - module, BaseScaleColumnParallelLinear - ): + if isinstance(module, (Embedding1D, ParallelGPT2Embeddings, BaseScaleColumnParallelLinear)): for param in module.parameters(): if gpc.is_initialized(ParallelMode.TENSOR) and gpc.config.parallel.tensor.mode == "isp": setattr(param, IS_TENSOR_DATA_PARALLEL, True) diff --git a/internlm/utils/model_checkpoint.py b/internlm/utils/model_checkpoint.py index 5c21af90..4fe45d5e 100644 --- a/internlm/utils/model_checkpoint.py +++ b/internlm/utils/model_checkpoint.py @@ -281,7 +281,6 @@ def save_model_checkpoint(folder, model): if folder is not None: dp_size = gpc.get_world_size(ParallelMode.DATA) tp_size = gpc.get_world_size(ParallelMode.TENSOR) - wp_size = 
gpc.get_world_size(ParallelMode.WEIGHT) dp_rank = gpc.get_local_rank(ParallelMode.DATA) tp_rank = gpc.get_local_rank(ParallelMode.TENSOR) wp_rank = gpc.get_local_rank(ParallelMode.WEIGHT) @@ -574,13 +573,15 @@ def load_optimizer_checkpoint(folder, optim): dp_size = gpc.get_world_size(ParallelMode.DATA) if gpc.config.parallel.tensor.mode == "isp": - assert ( - dp_size == max_dp + 1 - ), f"The optimizer states are save for {max_dp+1} data parallelism, while current has {dp_size} data parallelism" + assert dp_size == max_dp + 1, ( + f"The optimizer states are save for {max_dp+1} data parallelism, " + f"while current has {dp_size} data parallelism" + ) if gpc.config.parallel.tensor.mode != "isp": - assert ( - zero_size == max_zero + 1 - ), f"The optimizer states are save for {max_zero+1} zero parallel, while current has {zero_size} zero broadcast range." + assert zero_size == max_zero + 1, ( + f"The optimizer states are save for {max_zero+1} zero parallel, " + f"while current has {zero_size} zero broadcast range." + ) assert ( pp_size == max_pp + 1 ), f"The optimizer states are save for {max_pp+1} pipelines, while current has {pp_size} pipelines" From ccc2108440530b46fe2be92f37ba6f27062646c2 Mon Sep 17 00:00:00 2001 From: Wenwen Qu Date: Thu, 18 Jan 2024 15:16:14 +0800 Subject: [PATCH 106/153] refactor code --- internlm/train/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/internlm/train/utils.py b/internlm/train/utils.py index 6b62dbc8..b4a98db9 100644 --- a/internlm/train/utils.py +++ b/internlm/train/utils.py @@ -41,7 +41,6 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy if gpc.config.parallel.tensor.mode == "isp": new_groups["embed_head"] = {"name": "embed_head", "params": [], "optimizer_mode": ParallelMode.DATA} new_groups["fp32"] = {"name": "fp32", "params": [], "optimizer_mode": ParallelMode.ZERO1} - # new_groups["layer_norm"] = {"name": "layer_norm", "params": [], "optimizer_mode": ParallelMode.ZERO1} if gpc.config.model.get("num_experts", 0) > 1: for key in gpc.expert_parallel_group_names: From fac2b200e3c98ad895e0048f47c68b71dc2de2db Mon Sep 17 00:00:00 2001 From: Wenwen Qu Date: Thu, 18 Jan 2024 17:37:54 +0800 Subject: [PATCH 107/153] refactor code --- internlm/train/utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/internlm/train/utils.py b/internlm/train/utils.py index b4a98db9..76c375de 100644 --- a/internlm/train/utils.py +++ b/internlm/train/utils.py @@ -59,8 +59,6 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy if is_tensor_data_parallel_parameter(param): # should not be here if not isp mode new_groups["embed_head"]["params"].append(param) - # elif hasattr(param, IS_REPLICA_ZERO_PARALLEL) and getattr(param, IS_REPLICA_ZERO_PARALLEL) is True: - # new_groups["layer_norm"]["params"].append(param) elif param.dtype == torch.float32: new_groups["fp32"]["params"].append(param) # moe param means MoE is enabled From 05fa04a33cb331bd4245b4271eb6e111cbfd55b9 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Fri, 19 Jan 2024 11:32:09 +0800 Subject: [PATCH 108/153] feat(multi_head_attention.py): set bias=True --- internlm/model/multi_head_attention.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index 023b5478..87e2d42a 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -209,7 +209,7 @@ def __init__( embed_dim, 3 * embed_dim, 
process_group, - bias=False, + bias=True, sequence_parallel=gpc.config.parallel.sequence_parallel, **factory_kwargs, ) # according to https://spaces.ac.cn/archives/9577 @@ -232,7 +232,7 @@ def __init__( embed_dim, embed_dim, process_group, - bias=False, + bias=True, sequence_parallel=gpc.config.parallel.sequence_parallel, **factory_kwargs, ) From 91bd3f96608351dd99b7f1ac7f77eb05aa92fa78 Mon Sep 17 00:00:00 2001 From: Wenwen Qu Date: Fri, 19 Jan 2024 14:51:09 +0800 Subject: [PATCH 109/153] fix bugs --- internlm/solver/optimizer/hybrid_zero_optim.py | 2 +- internlm/train/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index 13139938..978a7b4f 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -177,7 +177,7 @@ def __init__( if self._is_moe_group(param_group): grad_reduce_mode = ParallelMode.EXPERT_DATA - if param_group["name"] != "embed_head" and self.use_isp: + elif param_group["name"] != "embed_head" and self.use_isp: grad_reduce_mode = ParallelMode.WEIGHT_DATA else: grad_reduce_mode = ParallelMode.DATA diff --git a/internlm/train/utils.py b/internlm/train/utils.py index 76c375de..4444b30d 100644 --- a/internlm/train/utils.py +++ b/internlm/train/utils.py @@ -42,7 +42,7 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy new_groups["embed_head"] = {"name": "embed_head", "params": [], "optimizer_mode": ParallelMode.DATA} new_groups["fp32"] = {"name": "fp32", "params": [], "optimizer_mode": ParallelMode.ZERO1} - if gpc.config.model.get("num_experts", 0) > 1: + if gpc.config.model.get("num_experts", 1) > 1: for key in gpc.expert_parallel_group_names: new_groups[key] = {"name": key, "moe": True, "params": [], "optimizer_mode": ParallelMode.EXPERT_DATA} From 20f6b36108aed9409fa68cc2fb7b7381d04d22e2 Mon Sep 17 00:00:00 2001 From: Wenwen Qu Date: Fri, 19 Jan 2024 15:01:11 +0800 Subject: [PATCH 110/153] support moe checkpoint --- internlm/utils/model_checkpoint.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/internlm/utils/model_checkpoint.py b/internlm/utils/model_checkpoint.py index 322ddf1e..25455231 100644 --- a/internlm/utils/model_checkpoint.py +++ b/internlm/utils/model_checkpoint.py @@ -354,14 +354,14 @@ def save_model_checkpoint(folder, model): llm_save(topo_fp, saved_obj=topo) # try to save expert parameter to separate files if model have moe layer - # expert_dp_size = gpc.get_world_size(ParallelMode.EXPERT_DATA) - # expert_dp_rank = gpc.get_local_rank(ParallelMode.EXPERT_DATA) - # should_save_rank_pair.clear() - # for i in range(tp_size): - # should_save_rank_pair.add((i, i % expert_dp_size)) - - # if (tp_rank, expert_dp_rank) in should_save_rank_pair: - # try_save_moe_checkpoint(folder, model, tp_rank, pp_rank) + expert_dp_size = gpc.get_world_size(ParallelMode.EXPERT_DATA) + expert_dp_rank = gpc.get_local_rank(ParallelMode.EXPERT_DATA) + should_save_rank_pair.clear() + for i in range(tp_size): + should_save_rank_pair.add((i, i % expert_dp_size)) + + if (tp_rank, expert_dp_rank) in should_save_rank_pair: + try_save_moe_checkpoint(folder, model, tp_rank, pp_rank) torch.distributed.barrier() From 7cdeea870de5bc556dcb8515aad6d9707a20daba Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Fri, 19 Jan 2024 16:58:27 +0800 Subject: [PATCH 111/153] fix(tests): fix ci test error --- internlm/utils/parallel.py | 23 
-------------- tests/test_core/utils.py | 9 ++---- tests/test_training/test_loss.py | 3 +- .../test_swap_nb_loss_and_gradnorm.py | 31 +++++++++---------- tests/test_training/train_CI.py | 3 +- 5 files changed, 19 insertions(+), 50 deletions(-) diff --git a/internlm/utils/parallel.py b/internlm/utils/parallel.py index 52ffc114..2614fe11 100644 --- a/internlm/utils/parallel.py +++ b/internlm/utils/parallel.py @@ -132,26 +132,3 @@ def set_model_params_layer_name(model): layer_param_name = f"{layer_name}-{param_name}" param.__setattr__("layer_name", layer_name) param.__setattr__("param_name", f"{layer_name}-{param_name}") - - -def check_sequence_parallel(model): - """ - check whether the norm module has IS_SEQUENCE_PARALLEL attribute. - when the sequence_parallel is True, the norm module should have the IS_SEQUENCE_PARALLEL attribute - to illustrate the norm should conduct the all-reduce for its grad. - """ - - if not isinstance(model, nn.ModuleList): - model = [model] - - for _chunk in model: - if isinstance(_chunk, NaiveAMPModel): - _chunk = _chunk.model - - for _, module in _chunk.named_modules(): - if isinstance(module, (RMSNorm, nn.LayerNorm)): - for param in module.parameters(): - assert hasattr(param, IS_SEQUENCE_PARALLEL), ( - "when the gpc.config.parallel.sequence parallel is True," - "the params of norm module should have IS_SEQUENCE_PARALLEL attribute" - ) diff --git a/tests/test_core/utils.py b/tests/test_core/utils.py index 6f66a152..3d25667f 100644 --- a/tests/test_core/utils.py +++ b/tests/test_core/utils.py @@ -10,12 +10,8 @@ from internlm.core.context import global_context as gpc from internlm.core.engine import Engine from internlm.core.gradient_handler import PipelineSharedModuleGradientHandler -from internlm.core.scheduler import ( - InterleavedPipelineScheduler, - NonPipelineScheduler, - PipelineScheduler, - SchedulerMetricHook, -) +from internlm.core.scheduler import InterleavedPipelineScheduler, NonPipelineScheduler, PipelineScheduler +from internlm.model.metrics import SchedulerMetricHook from internlm.solver.pipeline_utils import partition_uniform from internlm.train import initialize_optimizer @@ -156,7 +152,6 @@ def build_environment(rank, world_size, config): def loose_close(a, b, dtype: torch.dtype = torch.float32): - if dtype is torch.float32: rtol = 1.3e-6 atol = 1e-5 diff --git a/tests/test_training/test_loss.py b/tests/test_training/test_loss.py index 51f49836..a3b3b442 100644 --- a/tests/test_training/test_loss.py +++ b/tests/test_training/test_loss.py @@ -9,11 +9,10 @@ import internlm from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc -from internlm.core.scheduler import SchedulerMetricHook from internlm.core.trainer import TrainState from internlm.initialize import initialize_distributed_env from internlm.model.loss import FlashGPTLMLoss -from internlm.model.metrics import AccPerplex +from internlm.model.metrics import AccPerplex, SchedulerMetricHook from internlm.train import ( get_train_data_loader, initialize_model, diff --git a/tests/test_training/test_swap_nb_loss_and_gradnorm.py b/tests/test_training/test_swap_nb_loss_and_gradnorm.py index 64ed29dd..d9c6ac81 100644 --- a/tests/test_training/test_swap_nb_loss_and_gradnorm.py +++ b/tests/test_training/test_swap_nb_loss_and_gradnorm.py @@ -13,10 +13,9 @@ from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc from internlm.core.context.parallel_context import Config -from internlm.core.scheduler import 
SchedulerMetricHook from internlm.initialize.launch import args_sanity_check from internlm.model.loss import FlashGPTLMLoss -from internlm.model.metrics import AccPerplex +from internlm.model.metrics import AccPerplex, SchedulerMetricHook from internlm.train import ( get_train_data_loader, get_validation_data_loader, @@ -226,10 +225,10 @@ def compute_trimmed_mean(value_list): def check_grad_norm(grad_norm_list): - standard_grad_norm_list = torch.load(os.path.join( - os.environ["share_path"], "quailty_assurance/small_300step_norm_grad/grad_norm_list.pt" - )) - + standard_grad_norm_list = torch.load( + os.path.join(os.environ["share_path"], "quailty_assurance/small_300step_norm_grad/grad_norm_list.pt") + ) + standard_grad_norm_list = standard_grad_norm_list[-100:] grad_norm_list = grad_norm_list[-100:] standard_grad_norm_list.sort() @@ -239,18 +238,18 @@ def check_grad_norm(grad_norm_list): trimmed_mean2 = compute_trimmed_mean(grad_norm_list) tensor_trimmed_mean1 = torch.tensor(trimmed_mean1) tensor_trimmed_mean2 = torch.tensor(trimmed_mean2) - + logger.info(f"norm_mean: {tensor_trimmed_mean1}, {tensor_trimmed_mean2}") assert torch.allclose(tensor_trimmed_mean1, tensor_trimmed_mean2, rtol=3e-1, atol=3e-1) logger.info(f"grad norm check passed") - + def check_meanLoss_val(all_loss, all_val): loss_values1 = all_loss[0][-100:] loss_values2 = all_loss[1][-100:] loss_values1.sort() loss_values2.sort() - + trimmed_mean1 = compute_trimmed_mean(loss_values1) trimmed_mean2 = compute_trimmed_mean(loss_values2) tensor_trimmed_mean1 = torch.tensor(trimmed_mean1) @@ -261,9 +260,9 @@ def check_meanLoss_val(all_loss, all_val): assert torch.allclose(tensor_trimmed_mean1, tensor_trimmed_mean2, rtol=3e-2, atol=3e-2) assert torch.allclose(torch.tensor(all_val[0]), torch.tensor(all_val[1]), rtol=3e-2, atol=3e-2) - + logger.info(f"loss check passed") - + def exam_loss(args): # init @@ -363,12 +362,12 @@ def exam_loss(args): # update parameters trainer_result = trainer.step() assert trainer_result is not None - + _, grad_norm_groups = trainer_result - + if gpc.is_rank_for_log(): logger.info(f"train_grad_norm_groups: {grad_norm_groups['0_default']}") - grad_norm_list.append(grad_norm_groups['0_default']) + grad_norm_list.append(grad_norm_groups["0_default"]) # evaluate on validation data loaders if valid_every > 0 and batch_count > 0 and (batch_count + 1) % valid_every == 0: @@ -381,10 +380,10 @@ def exam_loss(args): torch.cuda.empty_cache() dist.barrier() - + if gpc.is_rank_for_log(): check_grad_norm(grad_norm_list) - + return rank, loss_list, val_list diff --git a/tests/test_training/train_CI.py b/tests/test_training/train_CI.py index 98a69c9f..507cace1 100644 --- a/tests/test_training/train_CI.py +++ b/tests/test_training/train_CI.py @@ -19,11 +19,10 @@ import internlm # noqa: E402 from internlm.core.context import ParallelMode # noqa: E402 from internlm.core.context import global_context as gpc # noqa: E402 -from internlm.core.scheduler import SchedulerMetricHook # noqa: E402 from internlm.core.trainer import TrainState # noqa: E402 from internlm.initialize import initialize_distributed_env # noqa: E402 from internlm.model.loss import FlashGPTLMLoss # noqa: E402 -from internlm.model.metrics import AccPerplex # noqa: E402 +from internlm.model.metrics import AccPerplex, SchedulerMetricHook # noqa: E402 from internlm.monitor import ( # noqa: E402 initialize_monitor_manager, send_alert_message, From f959781b318e3b8a8829d90605b0f17922cbfbf7 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Fri, 
19 Jan 2024 17:53:09 +0800 Subject: [PATCH 112/153] fix(tests): fix ci test error --- .github/workflows/demo_in_readme.yaml | 1 + configs/13B_sft.py | 180 --------------------- configs/13B_template.py | 180 --------------------- configs/20B_sft.py | 180 --------------------- configs/30B_sft.py | 180 --------------------- configs/30B_template.py | 180 --------------------- configs/7B_sft.py | 10 +- configs/7B_template.py | 181 ---------------------- configs/generate.py | 53 ------- tests/test_core/test_pipeline.py | 6 +- tests/test_data/test_batch_sampler.py | 6 +- tests/test_model/test_model_internlm.py | 6 +- tests/test_training/7B_check_init.py | 2 +- tests/test_utils/common_fixture.py | 6 +- tests/test_utils/test_model_checkpoint.py | 3 - 15 files changed, 19 insertions(+), 1155 deletions(-) delete mode 100644 configs/13B_sft.py delete mode 100644 configs/13B_template.py delete mode 100644 configs/20B_sft.py delete mode 100644 configs/30B_sft.py delete mode 100644 configs/30B_template.py delete mode 100644 configs/7B_template.py delete mode 100644 configs/generate.py diff --git a/.github/workflows/demo_in_readme.yaml b/.github/workflows/demo_in_readme.yaml index 8445ad09..a3840347 100644 --- a/.github/workflows/demo_in_readme.yaml +++ b/.github/workflows/demo_in_readme.yaml @@ -111,6 +111,7 @@ jobs: srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --gpus-per-task=2 python ../ci_scripts/model/loaded_as_transformer.py cd .. rsync -av --remove-source-files $GITHUB_WORKSPACE/hf_ckpt ${{env.WORKSPACE_PREFIX}}/ci_clean_bak + load-chat-model-in-hf: if: ${{ !cancelled() }} needs: check-requirements diff --git a/configs/13B_sft.py b/configs/13B_sft.py deleted file mode 100644 index e3e17ae0..00000000 --- a/configs/13B_sft.py +++ /dev/null @@ -1,180 +0,0 @@ -JOB_NAME = "13b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. 
- # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. 
- For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, mode="origin_tp"), - pipeline=dict(size=1, interleaved_overlap=True), - sequence_parallel=True, -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_template.py b/configs/13B_template.py deleted file mode 100644 index 849c5aa9..00000000 --- a/configs/13B_template.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = {seq_len} -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({intern_overlap}) + "_" + str({checkpoint}) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. 
(only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint={checkpoint}, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. 
mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp={sp}, intern_overlap={intern_overlap}), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/20B_sft.py b/configs/20B_sft.py deleted file mode 100644 index 13e68b22..00000000 --- a/configs/20B_sft.py +++ /dev/null @@ -1,180 +0,0 @@ -JOB_NAME = "20b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, mode="fstp", overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), - sequence_parallel=True, -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_sft.py b/configs/30B_sft.py deleted file mode 100644 index 8bde0571..00000000 --- a/configs/30B_sft.py +++ /dev/null @@ -1,180 +0,0 @@ -JOB_NAME = "30b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=4, fsdp=False), - tensor=dict(size=8, mode="fstp", overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), - sequence_parallel=True, -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_template.py b/configs/30B_template.py deleted file mode 100644 index d19ece6e..00000000 --- a/configs/30B_template.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = {seq_len} -JOB_NAME = "30b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({intern_overlap}) + "_" + str({checkpoint}) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
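# The {seq_len}/{sp}/{intern_overlap}/{checkpoint} placeholders in the
# template above are filled by plain string substitution, mirroring what
# configs/generate.py (deleted a few hunks below) does. Toy reproduction on a
# two-line stand-in template:
template = 'SEQ_LEN = {seq_len}\ntensor = dict(size=8, sp={sp}, intern_overlap={intern_overlap})'
rendered = (
    template.replace("{seq_len}", str(4096))
    .replace("{sp}", '"intern"')
    .replace("{intern_overlap}", str(True))
)
assert '"intern"' in rendered and "4096" in rendered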
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint={checkpoint}, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp={sp}, intern_overlap={intern_overlap}), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index e4028e80..66ffe0d0 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -56,8 +56,8 @@ valid_micro_num=4, # defaults to 0, means disable evaluate valid_every=50, - pack_sample_into_one=True, - total_steps=10, + pack_sample_into_one=False, + total_steps=50000, skip_batches="", # rampup_batch_size (str): A string with three space-separated integers representing the # starting batch size, the increment, and the number of steps between @@ -172,9 +172,9 @@ 3. memory_pool: bool, enable/disable memory pool, defaults to False. """ parallel = dict( - zero1=dict(size=2, fsdp=False), - tensor=dict(size=2, mode="mtp"), - pipeline=dict(size=2, interleaved_overlap=True), + zero1=dict(size=8, fsdp=False), + tensor=dict(size=1, mode="mtp"), + pipeline=dict(size=1, interleaved_overlap=True), weight=dict(size=1, overlap=True, memory_pool=True), ) diff --git a/configs/7B_template.py b/configs/7B_template.py deleted file mode 100644 index d78fc884..00000000 --- a/configs/7B_template.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = {seq_len} -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({intern_overlap}) + "_" + str({checkpoint}) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. 
- # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint={checkpoint}, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. 
-    * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-    * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-    For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-        defaults to 'none', means the sequence parallel will be disabled.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
-        defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp={sp}, intern_overlap={intern_overlap}),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/generate.py b/configs/generate.py
deleted file mode 100644
index 5f044e72..00000000
--- a/configs/generate.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import os
-import copy
-import subprocess
-
-name = "./configs/"
-root_names = ["7B_train_", "13B_train_", "30B_train_"]
-model_size = ["7B", "13B", "30B"]
-seq_length = [4096, 8192, 16384, 32768, 65536, 131072, 262144]
-sp = ["none", "megatron", "flash-attn", "intern", "intern"]
-intern_overlap = [False, False, False, True, False]
-checkpoint = [False, True]
-
-for idx, root_name in enumerate(root_names):
-
-    # Path of the folder to create
-    folder_path = name + root_name[:-1]
-
-    # Create the folder with os.mkdir()
-    if not os.path.exists(folder_path):
-        os.mkdir(folder_path)
-
-    file_name = name + f"{model_size[idx]}_template.py"
-
-    with open(file_name, "r") as f:
-        lines = f.readlines()
-        origin_line = "".join(lines)
-        for seq in seq_length:
-            for i, sp_mode in enumerate(sp):
-                for ckpt in checkpoint:
-                    line = copy.copy(origin_line)
-                    line = line.replace("{seq_len}", str(seq))
-                    line = line.replace("{sp}", f"\"{sp_mode}\"")
-                    line = line.replace("{intern_overlap}", str(intern_overlap[i]))
-                    line = line.replace("{checkpoint}", str(ckpt))
-                    output_file_name = str(seq) + "_" + str(sp_mode) + "_overlap_" + str(intern_overlap[i]) + "_ckpt_" + str(ckpt) + ".py"
-                    write_file = folder_path + "/" + output_file_name
-                    with open(write_file, "w") as file:
-                        file.write(line)
-
-                    log_name = root_name + "_" + output_file_name[:-3]
-
-                    skip = True
-
-                    if sp_mode == "intern" and intern_overlap[i] is True:
-                        skip = False
-
-                    if skip:
-                        continue
-
-                    print(log_name)
-                    command = f"srun -p llm_s -N 8 -n 64 --ntasks-per-node=8 --gpus-per-task=1 --time=30 python train.py --config {write_file} --profiling 2>&1 | tee ./fstp_logs/{log_name}.log"
-                    process = subprocess.Popen(command, shell=True, executable='/bin/bash')
-                    process.wait()
\ No newline at end of file
diff --git a/tests/test_core/test_pipeline.py b/tests/test_core/test_pipeline.py
index 4b37f61b..db7b3ddd 100644
--- a/tests/test_core/test_pipeline.py
+++ b/tests/test_core/test_pipeline.py
@@ -20,9 
+20,9 @@ gradient_handler=[dict(type="PipelineSharedModuleGradientHandler")], parallel=dict( zero1=dict(size=1, fsdp=False), - pipeline=dict(size=8, interleaved_overlap=False), - sequence_parallel=False, - tensor=1, + tensor=dict(size=1, mode="mtp"), + pipeline=dict(size=8, interleaved_overlap=True), + weight=dict(size=1, overlap=True, memory_pool=True), ), model_type="INTERNLM", data=dict(seq_len=8, micro_num=16, micro_bsz=1, pack_sample_into_one=False, min_length=0, total_steps=9999), diff --git a/tests/test_data/test_batch_sampler.py b/tests/test_data/test_batch_sampler.py index eb835b2d..1faf4aee 100644 --- a/tests/test_data/test_batch_sampler.py +++ b/tests/test_data/test_batch_sampler.py @@ -164,9 +164,9 @@ def test_warmup(use_flash_atten_case, group_case, micro_bsz_case): dict( parallel=dict( zero1=dict(size=1, fsdp=False), - pipeline=dict(size=1, interleaved_overlap=False), - sequence_parallel=False, - tensor=1, + tensor=dict(size=1, mode="mtp"), + pipeline=dict(size=1, interleaved_overlap=True), + weight=dict(size=1, overlap=True, memory_pool=True), ), data=dict( train_folder=None, diff --git a/tests/test_model/test_model_internlm.py b/tests/test_model/test_model_internlm.py index 9b6066ec..4c239c0f 100644 --- a/tests/test_model/test_model_internlm.py +++ b/tests/test_model/test_model_internlm.py @@ -18,9 +18,9 @@ dict( parallel=dict( zero1=dict(size=1, fsdp=False), - pipeline=dict(size=1, interleaved_overlap=False), - sequence_parallel=False, - tensor=1, + tensor=dict(size=1, mode="mtp"), + pipeline=dict(size=1, interleaved_overlap=True), + weight=dict(size=1, overlap=True, memory_pool=True), ), model_type="INTERNLM", data=dict(seq_len=2048, micro_num=1, micro_bsz=1, pack_sample_into_one=False, min_length=0, total_steps=9999), diff --git a/tests/test_training/7B_check_init.py b/tests/test_training/7B_check_init.py index de6dcb2a..179892b6 100644 --- a/tests/test_training/7B_check_init.py +++ b/tests/test_training/7B_check_init.py @@ -157,7 +157,7 @@ tensor parallel: tensor parallel size, usually the number of GPUs per node. 
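# The test configs above move from the legacy `tensor=1` form to
# `tensor=dict(size=..., mode="mtp")`. A standalone restatement of the
# normalization that args_sanity_check() in internlm/initialize/launch.py
# applies (visible as context in the launch.py hunk of a later patch); the
# helper name here is ours, not the repo's:
def normalize_tensor_cfg(parallel: dict) -> dict:
    if isinstance(parallel["tensor"], int):
        parallel["tensor"] = dict(size=parallel["tensor"], mode="mtp")
    if parallel["tensor"].get("mode") is None:
        parallel["tensor"]["mode"] = "mtp"
    return parallel

cfg = dict(zero1=dict(size=1, fsdp=False), tensor=4, pipeline=dict(size=2, interleaved_overlap=True))
assert normalize_tensor_cfg(cfg)["tensor"] == dict(size=4, mode="mtp")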
""" parallel = dict( - zero1=dict(size=-1, fsdp=False), + zero1=dict(size=1, fsdp=False), tensor=4, pipeline=dict(size=2, interleaved_overlap=True), sequence_parallel=False, diff --git a/tests/test_utils/common_fixture.py b/tests/test_utils/common_fixture.py index 60961565..96d3188c 100644 --- a/tests/test_utils/common_fixture.py +++ b/tests/test_utils/common_fixture.py @@ -44,9 +44,9 @@ dict( parallel=dict( zero1=dict(size=1, fsdp=False), - pipeline=dict(size=1, interleaved_overlap=False), - sequence_parallel=False, - tensor=1, + tensor=dict(size=1, mode="mtp"), + pipeline=dict(size=1, interleaved_overlap=True), + weight=dict(size=1, overlap=True, memory_pool=True), ), model_type="INTERNLM", adam=dict( diff --git a/tests/test_utils/test_model_checkpoint.py b/tests/test_utils/test_model_checkpoint.py index 2dcabf4e..2063591c 100644 --- a/tests/test_utils/test_model_checkpoint.py +++ b/tests/test_utils/test_model_checkpoint.py @@ -16,8 +16,6 @@ LOCAL_SAVE_PATH, del_tmp_file, init_config, - init_dist_and_model, - reset_singletons, ) # (TOTAL_STEP, CKPT_EVERY, SNPASHOT_EVERY) @@ -164,7 +162,6 @@ def return_prefix_path(save_ckpt_folder): def return_latest_save_path(save_ckpt_folder, total_step, snapshot_freq, ckpt_freq): - snapshot_latest_step, normal_latest_step = 0, 0 snapshot_latest_count, normal_latest_count = 0, 0 From e873668a9d61de03f2d039ff599163f5767d5d70 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Fri, 19 Jan 2024 18:59:14 +0800 Subject: [PATCH 113/153] fix(tests): fix ci test error --- internlm/core/context/parallel_context.py | 9 ++- .../core/context/process_group_initializer.py | 64 ------------------- internlm/utils/evaluation.py | 8 +-- 3 files changed, 10 insertions(+), 71 deletions(-) diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py index 829b2d90..3a688f9b 100644 --- a/internlm/core/context/parallel_context.py +++ b/internlm/core/context/parallel_context.py @@ -148,6 +148,7 @@ def __init__(self): self.data_parallel_size = 1 self.pipeline_parallel_size = 1 self.tensor_parallel_size = 1 + self.weight_parallel_size = 1 self.zero1_parallel_size = -1 self.nettest_parallel_size = 1 self.expert_parallel_size = -1 @@ -483,11 +484,12 @@ def init_parallel_groups(self): # the user should not set the data parallel size manually # instead, it should be calculated based on other parallel config - assert self.zero1_parallel_size >= 1 self.sequence_parallel_size = self.tensor_parallel_size self.data_parallel_size = self.world_size // self.pipeline_parallel_size // self.sequence_parallel_size self.weight_data_parallel_size = self.world_size // self.pipeline_parallel_size // self.weight_parallel_size if parallel_config["tensor"]["mode"] != "isp": + if self.zero1_parallel_size == -1: + self.zero1_parallel_size = self.data_parallel_size assert ( self.zero1_parallel_size <= self.data_parallel_size ), f"zero1_size:{self.zero1_parallel_size} should be less than dp_size:{self.data_parallel_size}" @@ -495,6 +497,8 @@ def init_parallel_groups(self): self.data_parallel_size % self.zero1_parallel_size == 0 ), f"data_parallel_size:{self.data_parallel_size} % zero1_parallel_size: {self.zero1_parallel_size} != 0" else: + if self.zero1_parallel_size == -1: + self.zero1_parallel_size = self.weight_data_parallel_size assert ( self.zero1_parallel_size <= self.weight_data_parallel_size ), f"zero1_size:{self.zero1_parallel_size} should be less than wdp_size:{self.weight_data_parallel_size}" @@ -502,6 +506,7 @@ def 
init_parallel_groups(self): f"weight_data_parallel_size:{self.weight_data_parallel_size} % " f"zero1_parallel_size: {self.zero1_parallel_size} != 0" ) + assert self.zero1_parallel_size >= 1 # the recommended nettest_parallel_size is 32 GPUs self.nettest_parallel_size = 32 @@ -538,8 +543,6 @@ def init_parallel_groups(self): if parallel_config["tensor"]["mode"] == "isp": initializers.append(pgroup_initializer.Initializer_Weight_Data(*initializer_args)) initializers.append(pgroup_initializer.Initializer_Data(*initializer_args)) - # if self.weight_parallel_size <= 1: - # initializers.append(pgroup_initializer.Initializer_Model(*initializer_args)) initializers.append(pgroup_initializer.Initializer_Tensor(*initializer_args)) if parallel_config["tensor"]["mode"] != "isp": initializers.append(pgroup_initializer.Initializer_Zero1(*initializer_args)) diff --git a/internlm/core/context/process_group_initializer.py b/internlm/core/context/process_group_initializer.py index 5e59df22..76d42056 100644 --- a/internlm/core/context/process_group_initializer.py +++ b/internlm/core/context/process_group_initializer.py @@ -112,70 +112,6 @@ def init_dist_group(self, use_cpu: bool = False): pass -# class Initializer_Model(ProcessGroupInitializer): -# """A ProcessGroupInitializer for model parallelism (model parallel group contains pipeline and tensor parallel -# groups). - -# Args: -# rank (int): The rank of current process. -# world_size (int): Size of whole communication world. -# weight_parallel_size (int): Size of model weight parallel. -# weight_data_parallel_size (int): Size of data parallel for common weight. -# sequence_parallel_size (int): Size of data sequence parallel. -# data_parallel_size (int): Size of data parallel. -# pipeline_parallel_size (int): Size of pipeline parallel. -# tensor_parallel_size (int): Size of tensor parallel. -# zero1_parallel_size (int): Size of zero1 parallel. -# nettest_parallel_size (int): Size of net testing parallel. -# expert_parallel_size (int): Size of expert parallel. -# """ - -# def __init__(self, *args, **kwargs): -# super().__init__(*args, **kwargs) - -# # only for msp or fsp -# assert self.weight_parallel_size == 1 -# self.rank_num_per_group = self.tensor_parallel_size * self.pipeline_parallel_size -# self.num_group = self.world_size // self.rank_num_per_group - -# assert self.world_size % self.rank_num_per_group == 0 - -# def init_dist_group(self, use_cpu: bool = False): -# """Initialize model parallel groups, and assign local_ranks and groups to each gpu. - -# Returns: -# Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode): -# A Model parallelism's information tuple. 
-# """ -# local_rank = None -# ranks_in_group = None -# process_group = None -# cpu_group = None -# group_world_size = None -# mode = ParallelMode.MODEL - -# for i in range(self.num_group): -# ranks = [i * self.rank_num_per_group + j for j in range(self.rank_num_per_group)] -# group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT) -# if use_cpu: -# group_cpu = ( -# dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT) -# if dist.get_backend() != "gloo" -# else group -# ) -# else: -# group_cpu = None - -# if self.rank in ranks: -# local_rank = ranks.index(self.rank) -# group_world_size = len(ranks) -# process_group = group -# cpu_group = group_cpu -# ranks_in_group = ranks - -# return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode - - class Initializer_Pipeline(ProcessGroupInitializer): """A ProcessGroupInitializer for pipeline parallelism. diff --git a/internlm/utils/evaluation.py b/internlm/utils/evaluation.py index e85773fd..e52586b8 100644 --- a/internlm/utils/evaluation.py +++ b/internlm/utils/evaluation.py @@ -10,7 +10,7 @@ @contextmanager -def switch_evaluation_no_pipeline_scheduler(trainer, grad_accum_size, grad_accum_batch_size, metric_hook_list): +def switch_evaluation_no_pipeline_scheduler(trainer, grad_accum_size, metric_hook_list): if not gpc.is_using_parallel_mode(ParallelMode.PIPELINE): prev_data_process_func = trainer.schedule.data_process_func prev_grad_accum_size = trainer.schedule._grad_accum_size @@ -50,10 +50,10 @@ def switch_evaluation_pipeline_scheduler(trainer, num_microbatches, tensor_shape def switch_sequence_parallel_mode(): prev_mode = gpc.config.parallel.sequence_parallel try: - if gpc.config.parallel["tensor"]["sp"] == "intern": - gpc.config.parallel.sequence_parallel = True - else: + if gpc.config.parallel["tensor"]["mode"] == "mtp": gpc.config.parallel.sequence_parallel = False + else: + gpc.config.parallel.sequence_parallel = True yield finally: gpc.config.parallel.sequence_parallel = prev_mode From b99a6422f84473ba58db64dd88aa2e916e3998b0 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Fri, 19 Jan 2024 19:19:22 +0800 Subject: [PATCH 114/153] fix(tests): fix ci test error --- internlm/core/context/parallel_context.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py index 3a688f9b..03fd6736 100644 --- a/internlm/core/context/parallel_context.py +++ b/internlm/core/context/parallel_context.py @@ -487,7 +487,9 @@ def init_parallel_groups(self): self.sequence_parallel_size = self.tensor_parallel_size self.data_parallel_size = self.world_size // self.pipeline_parallel_size // self.sequence_parallel_size self.weight_data_parallel_size = self.world_size // self.pipeline_parallel_size // self.weight_parallel_size - if parallel_config["tensor"]["mode"] != "isp": + if isinstance(parallel_config["tensor"], int) or ( + isinstance(parallel_config["tensor"], dict) and parallel_config["tensor"]["mode"] != "isp" + ): if self.zero1_parallel_size == -1: self.zero1_parallel_size = self.data_parallel_size assert ( @@ -508,6 +510,14 @@ def init_parallel_groups(self): ) assert self.zero1_parallel_size >= 1 + # set sequence parallel value + if "sequence_parallel" not in parallel_config: + parallel_config._add_item("sequence_parallel", True) + if isinstance(parallel_config["tensor"], int) or ( + isinstance(parallel_config["tensor"], dict) and parallel_config["tensor"]["mode"] == "mtp" + ): + 
parallel_config["sequence_parallel"] = False + # the recommended nettest_parallel_size is 32 GPUs self.nettest_parallel_size = 32 From 7ac53bf00f3bfb7845f5ab123991b84313879fd5 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Fri, 19 Jan 2024 19:52:08 +0800 Subject: [PATCH 115/153] fix(tests): fix ci test error --- internlm/core/context/parallel_context.py | 35 +++++++++++------------ internlm/initialize/launch.py | 2 ++ 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py index 03fd6736..a7892434 100644 --- a/internlm/core/context/parallel_context.py +++ b/internlm/core/context/parallel_context.py @@ -487,18 +487,7 @@ def init_parallel_groups(self): self.sequence_parallel_size = self.tensor_parallel_size self.data_parallel_size = self.world_size // self.pipeline_parallel_size // self.sequence_parallel_size self.weight_data_parallel_size = self.world_size // self.pipeline_parallel_size // self.weight_parallel_size - if isinstance(parallel_config["tensor"], int) or ( - isinstance(parallel_config["tensor"], dict) and parallel_config["tensor"]["mode"] != "isp" - ): - if self.zero1_parallel_size == -1: - self.zero1_parallel_size = self.data_parallel_size - assert ( - self.zero1_parallel_size <= self.data_parallel_size - ), f"zero1_size:{self.zero1_parallel_size} should be less than dp_size:{self.data_parallel_size}" - assert ( - self.data_parallel_size % self.zero1_parallel_size == 0 - ), f"data_parallel_size:{self.data_parallel_size} % zero1_parallel_size: {self.zero1_parallel_size} != 0" - else: + if isinstance(parallel_config["tensor"], dict) and parallel_config["tensor"]["mode"] == "isp": if self.zero1_parallel_size == -1: self.zero1_parallel_size = self.weight_data_parallel_size assert ( @@ -508,6 +497,15 @@ def init_parallel_groups(self): f"weight_data_parallel_size:{self.weight_data_parallel_size} % " f"zero1_parallel_size: {self.zero1_parallel_size} != 0" ) + else: + if self.zero1_parallel_size == -1: + self.zero1_parallel_size = self.data_parallel_size + assert ( + self.zero1_parallel_size <= self.data_parallel_size + ), f"zero1_size:{self.zero1_parallel_size} should be less than dp_size:{self.data_parallel_size}" + assert ( + self.data_parallel_size % self.zero1_parallel_size == 0 + ), f"data_parallel_size:{self.data_parallel_size} % zero1_parallel_size: {self.zero1_parallel_size} != 0" assert self.zero1_parallel_size >= 1 # set sequence parallel value @@ -550,15 +548,14 @@ def init_parallel_groups(self): # run initialization of different process groups initializers = [] initializers.append(pgroup_initializer.Initializer_Weight(*initializer_args)) - if parallel_config["tensor"]["mode"] == "isp": - initializers.append(pgroup_initializer.Initializer_Weight_Data(*initializer_args)) - initializers.append(pgroup_initializer.Initializer_Data(*initializer_args)) + initializers.append(pgroup_initializer.Initializer_Weight_Data(*initializer_args)) initializers.append(pgroup_initializer.Initializer_Tensor(*initializer_args)) - if parallel_config["tensor"]["mode"] != "isp": - initializers.append(pgroup_initializer.Initializer_Zero1(*initializer_args)) - else: + initializers.append(pgroup_initializer.Initializer_Data(*initializer_args)) + if isinstance(parallel_config["tensor"], dict) and parallel_config["tensor"]["mode"] == "isp": initializers.append(pgroup_initializer.Initializer_Zero1_ISP(*initializer_args)) - if isinstance(self.config.parallel.zero1, dict) and 
self.config.parallel.zero1.get("fsdp", False): + else: + initializers.append(pgroup_initializer.Initializer_Zero1(*initializer_args)) + if isinstance(parallel_config["zero1"], dict) and parallel_config["zero1"].get("fsdp", False): initializers.append(pgroup_initializer.Initializer_Zero3_dp(*initializer_args)) initializers.append(pgroup_initializer.Initializer_Nettest(*initializer_args)) if self.pipeline_parallel_size > 1: diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py index 47b3c11c..ed3dcad5 100644 --- a/internlm/initialize/launch.py +++ b/internlm/initialize/launch.py @@ -324,6 +324,8 @@ def args_sanity_check(): gpc.config.parallel["tensor"] = dict(size=gpc.config.parallel["tensor"], mode="mtp") if gpc.config.parallel["tensor"].get("mode", None) is None: gpc.config.parallel["tensor"]["mode"] = "mtp" + if gpc.config.parallel["tensor"]["mode"] == "isp": + assert not gpc.config.parallel.zero1.fsdp, "FSDP does not support isp" assert gpc.config.parallel["tensor"].get("mode", None) in [ "mtp", "msp", From bb5835e43a9b320c400b57a30f67fc3147086a01 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Fri, 19 Jan 2024 20:05:14 +0800 Subject: [PATCH 116/153] fix(tests): fix ci test error --- internlm/core/context/parallel_context.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py index a7892434..9cc6bcdd 100644 --- a/internlm/core/context/parallel_context.py +++ b/internlm/core/context/parallel_context.py @@ -636,7 +636,7 @@ def set_seed(self, seed: int, dpseed_with_tpoffset: bool = False): # during model construction), this is because the random state will be different in different tensor parallel # device of the same data parallel group. The underlying reason is that the device of tp_rank = 0 will perform # additional random operations during the RowParallelLinear module building process. - # set_mode(ParallelMode.DUMMY) + set_mode(ParallelMode.DUMMY) if self.is_using_parallel_mode(ParallelMode.TENSOR): set_mode(ParallelMode.TENSOR) if self.is_using_parallel_mode(ParallelMode.WEIGHT): From d5872e712fe3be376ca4dc390109ab1f4fbefcc6 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 22 Jan 2024 10:54:11 +0800 Subject: [PATCH 117/153] fix(tests): fix ci test error --- tests/test_data/test_batch_sampler.py | 2 +- tests/test_training/test_loss.py | 2 +- tests/test_training/test_swap_nb_loss_and_gradnorm.py | 4 ++-- tests/test_training/train_CI.py | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_data/test_batch_sampler.py b/tests/test_data/test_batch_sampler.py index 1faf4aee..e756d58a 100644 --- a/tests/test_data/test_batch_sampler.py +++ b/tests/test_data/test_batch_sampler.py @@ -123,7 +123,7 @@ def do_warmup(args): # test no-packed datasets. 
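# Standalone restatement of the bookkeeping the parallel_context.py hunks in
# the CI-fix patches above implement: zero1 == -1 resolves against the
# (weight-)data-parallel size depending on the tensor mode, and
# sequence_parallel defaults to True for every mode except "mtp". Toy
# function for illustration, not the gpc object itself.
def resolve(world_size, pp, tp, wp, mode, zero1=-1):
    dp = world_size // pp // tp           # sequence_parallel_size == tp
    wdp = world_size // pp // wp
    ref = wdp if mode == "isp" else dp
    zero1 = ref if zero1 == -1 else zero1
    assert 1 <= zero1 <= ref and ref % zero1 == 0
    return dict(dp=dp, wdp=wdp, zero1=zero1, sequence_parallel=(mode != "mtp"))

assert resolve(64, pp=1, tp=8, wp=1, mode="msp") == dict(dp=8, wdp=64, zero1=8, sequence_parallel=True)
assert resolve(64, pp=1, tp=1, wp=4, mode="isp") == dict(dp=64, wdp=16, zero1=16, sequence_parallel=True)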
     for _, val_dl in val_dls.items():
         for _, batch in enumerate(val_dl):
-            if gpc.is_using_pp():
+            if gpc.is_using_parallel_mode(ParallelMode.PIPELINE):
                 total_val_bsz = len(batch[1])
                 batch[0]["input_ids"] = batch[0]["input_ids"].to(torch.bfloat16)
                 assert total_val_bsz % micro_bsz == 0
diff --git a/tests/test_training/test_loss.py b/tests/test_training/test_loss.py
index a3b3b442..7e694d57 100644
--- a/tests/test_training/test_loss.py
+++ b/tests/test_training/test_loss.py
@@ -93,7 +93,7 @@ def train(
     current_time = objs[0]
 
     # initialize model
-    model = initialize_model()
+    model, _ = initialize_model()
 
     # initialize loss function
     criterion = FlashGPTLMLoss(parallel_output=True, label_smoothing=label_smoothing)
diff --git a/tests/test_training/test_swap_nb_loss_and_gradnorm.py b/tests/test_training/test_swap_nb_loss_and_gradnorm.py
index d9c6ac81..4d8afa28 100644
--- a/tests/test_training/test_swap_nb_loss_and_gradnorm.py
+++ b/tests/test_training/test_swap_nb_loss_and_gradnorm.py
@@ -278,7 +278,7 @@ def exam_loss(args):
     seed_all(1024)
 
     # initialize model
-    model = initialize_model()
+    model, _ = initialize_model()
 
     # initialize loss function
     criterion = FlashGPTLMLoss(parallel_output=True, label_smoothing=gpc.config.loss.label_smoothing)
@@ -302,7 +302,7 @@ def exam_loss(args):
             SchedulerMetricHook(
                 metric=metric,
                 skip=(
-                    gpc.is_using_pp()
+                    gpc.is_using_parallel_mode(ParallelMode.PIPELINE)
                     and hasattr(gpc.config.model, "num_chunks")
                     and gpc.config.model.num_chunks > 1
                     and gpc.config.parallel["pipeline"].get("interleaved_overlap", False)
diff --git a/tests/test_training/train_CI.py b/tests/test_training/train_CI.py
index 507cace1..a985b985 100644
--- a/tests/test_training/train_CI.py
+++ b/tests/test_training/train_CI.py
@@ -124,7 +124,7 @@ def main(args):
     uniscale_logger = initialize_llm_logger(start_time=current_time)
 
     # initialize model
-    model = initialize_model()
+    model, _ = initialize_model()
 
     with open(args.config, "r") as f:
         config_lines = f.readlines()
@@ -181,7 +181,7 @@ def main(args):
             SchedulerMetricHook(
                 metric=metric,
                 skip=(
-                    gpc.is_using_pp()
+                    gpc.is_using_parallel_mode(ParallelMode.PIPELINE)
                    and hasattr(gpc.config.model, "num_chunks")
                     and gpc.config.model.num_chunks > 1
                     and gpc.config.parallel["pipeline"].get("interleaved_overlap", False)

From 0aebd2c1d95a863f32eaa2b3a21528d6620d9b20 Mon Sep 17 00:00:00 2001
From: Wenwen Qu
Date: Mon, 22 Jan 2024 12:03:29 +0800
Subject: [PATCH 118/153] update moe config file

---
 configs/7B_MoE4_sft.py        | 62 ++++++++++++++++++++++++-----------
 internlm/initialize/launch.py |  5 ++-
 2 files changed, 46 insertions(+), 21 deletions(-)

diff --git a/configs/7B_MoE4_sft.py b/configs/7B_MoE4_sft.py
index 0672422f..f42bcaa1 100644
--- a/configs/7B_MoE4_sft.py
+++ b/configs/7B_MoE4_sft.py
@@ -28,7 +28,7 @@
     # 'load_ckpt_info' setting guide:
     # 1. the 'path' indicates the ckpt path,
    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, support: "internlm", "llama", "hf_llama".
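# A few concrete load_ckpt_info values matching the comment block above
# (the paths are placeholders, not real checkpoints):
load_model_only = dict(path="local:llm_ckpts/xxxx", content=("model",), ckpt_type="internlm")
resume_all_states = dict(path="local:llm_ckpts/49", content=("all",), ckpt_type="internlm")
init_from_hf_llama = dict(path="local:llama_ckpts/7B", content=("model",), ckpt_type="hf_llama")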
load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) @@ -44,8 +44,8 @@ oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. ) -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" +TRAIN_FOLDER = None # "/path/to/dataset" +VALID_FOLDER = None # "/path/to/dataset" data = dict( seq_len=SEQ_LEN, # micro_num means the number of micro_batch contained in one gradient update @@ -59,12 +59,17 @@ pack_sample_into_one=False, total_steps=50000, skip_batches="", + # rampup_batch_size (str): A string with three space-separated integers representing the + # starting batch size, the increment, and the number of steps between + # each increment. For example, "192 24 8" means that the batch size (micro_num) + # starts at 192 and increases by 24 every 8 steps. Defaults to None. + # (IMPORTANT): The interval step size is 'micro_bsz'. rampup_batch_size="", # Datasets with less than 50 rows will be discarded min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, + train_folder=TRAIN_FOLDER, + valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=200, diag_outlier_ratio=1.1, ) @@ -145,23 +150,36 @@ moe_use_residual=False, moe_gate_k=2, ) - -# zero1 parallel: -# 1. if zero1 <= 0, The size of the zero process group is equal to the size of the dp process group, -# so parameters will be divided within the range of dp. -# 2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters. -# 3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size. -# For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. -# pipeline parallel (dict): -# 1. size: int, the size of pipeline parallel. -# 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler. -# tensor parallel: tensor parallel size, usually the number of GPUs per node. - +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'], + defaults to 'mtp', means the pure megatron tensor parallel without sequence parallel. + msp: megatron tensor parallel with sequence parallel, sequence parallel size = tensor parallel size. + fsp: tensor parallel by flash-attn with sequence parallel, sequence parallel size = tensor parallel size. + isp: customed intern sequence parallel without tensor parallel, can be used with weight parallel. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +weight parallel (dict): + 1. size: int, the size of weight parallel. + 2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False. + 3. memory_pool: bool, enable/disable memory pool, defaults to False. +""" parallel = dict( zero1=dict(size=-1, fsdp=False), - tensor=1, + tensor=dict(size=1, mode="mtp"), pipeline=dict(size=1, interleaved_overlap=True), - sequence_parallel=False, + weight=dict(size=1, overlap=True, memory_pool=True), ) cudnn_deterministic = False @@ -173,6 +191,10 @@ enable_feishu_alert=DO_ALERT, feishu_alert_address=None, # feishu webhook to send alert message light_monitor_address=None, # light_monitor address to send heartbeat + alert_file_path=f"llm_alter/{JOB_NAME}_alert.log", + ), + tensorboard=dict( + queue_max_length=10, ), ) diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py index ce43bde8..a6030e5d 100644 --- a/internlm/initialize/launch.py +++ b/internlm/initialize/launch.py @@ -401,7 +401,10 @@ def args_sanity_check(): assert ( not optim_ckpt.overlap_sync_grad & optim_ckpt.overlap_sync_param ), "not support overlap and moe at the same time" - assert gpc.config.parallel.zero1.size == gpc.get_world_size(ParallelMode.DATA), "moe only support zero1" + assert gpc.config.parallel.zero1.size in ( + -1, + gpc.get_world_size(ParallelMode.DATA), + ), "moe only support zero1, set zero1=dict(size=-1,...) can fix this" def launch( From 15610f6bda988c2a4a949dad9d578d558b895c93 Mon Sep 17 00:00:00 2001 From: JiaoPL Date: Mon, 22 Jan 2024 13:27:41 +0800 Subject: [PATCH 119/153] adapt grad profiling --- internlm/solver/optimizer/utils.py | 143 ++++++++++++++++++----------- 1 file changed, 91 insertions(+), 52 deletions(-) diff --git a/internlm/solver/optimizer/utils.py b/internlm/solver/optimizer/utils.py index 22839adb..3dd510ff 100644 --- a/internlm/solver/optimizer/utils.py +++ b/internlm/solver/optimizer/utils.py @@ -237,6 +237,13 @@ def reduce_grads(gradients, parameters, weight_parallel_mode, fine_grained=False parallel_grads = [] if fine_grained: parallel_grads = {} + + if is_tensor_data_parallel_parameter(parameters[0]) or is_tensor_zero_parallel_parameter(parameters[0]): + param_parallel_mode = ParallelMode.TENSOR + elif gpc.is_using_parallel_mode(weight_parallel_mode): + param_parallel_mode = weight_parallel_mode + else: + param_parallel_mode = ParallelMode.TENSOR def append_grad(g, p): if fine_grained: @@ -247,7 +254,7 @@ def append_grad(g, p): elif only_output: param_name = p.param_name if hasattr(p, "param_name") else "unknown-padding" if ( - gpc.config.model["vocab_size"] == g.shape[0] * gpc.get_world_size(ParallelMode.TENSOR) + gpc.config.model["vocab_size"] == g.shape[0] * gpc.get_world_size(param_parallel_mode) and gpc.config.model["hidden_size"] == g.shape[1] and "embedding" not in param_name.lower() ): @@ -325,12 +332,27 @@ def compute_norm( total_norm_cuda = max(total_norm_cuda, previous_norm) # Take max across all model-parallel GPUs. 
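# Single-process sketch of the reduction order the compute_norm hunk below
# switches to: instead of one all-reduce over ParallelMode.MODEL, the max is
# taken over the parameter's tensor/weight group and then over the pipeline
# group. torch.distributed calls are replaced here by plain max() so the
# example runs standalone.
def global_inf_norm(local_norms, tp_groups, pp_groups):
    norms = dict(local_norms)                  # rank -> local inf-norm
    for stage_groups in (tp_groups, pp_groups):
        for group in stage_groups:
            m = max(norms[r] for r in group)
            for r in group:
                norms[r] = m
    return norms

# 4 ranks: tp groups (0,1),(2,3); pp groups (0,2),(1,3) -> all ranks agree on 4.0
assert global_inf_norm({0: 1.0, 1: 4.0, 2: 2.0, 3: 3.0}, [(0, 1), (2, 3)], [(0, 2), (1, 3)]) == {0: 4.0, 1: 4.0, 2: 4.0, 3: 4.0}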
-    if gpc.get_world_size(ParallelMode.MODEL) > 1:
+    if is_tensor_data_parallel_parameter(parameters[0]):
+        if gpc.is_using_parallel_mode(ParallelMode.TENSOR):
+            dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.MAX, group=gpc.get_group(ParallelMode.TENSOR))
+    elif is_tensor_zero_parallel_parameter(parameters[0]):
+        if gpc.is_using_parallel_mode(ParallelMode.TENSOR):
+            dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.MAX, group=gpc.get_group(ParallelMode.TENSOR))
+    else:
+        if gpc.is_using_parallel_mode(weight_parallel_mode):
+            dist.all_reduce(
+                total_norm_cuda,
+                op=dist.ReduceOp.MAX,
+                group=gpc.get_group(weight_parallel_mode),
+            )
+
+    if gpc.is_using_parallel_mode(ParallelMode.PIPELINE):
         dist.all_reduce(
             total_norm_cuda,
             op=dist.ReduceOp.MAX,
-            group=gpc.get_group(ParallelMode.MODEL),
+            group=gpc.get_group(ParallelMode.PIPELINE),
         )
+
         total_norm = total_norm_cuda[0].item()
     else:
         tensor_parallel_grads = reduce_grads(gradients, parameters, weight_parallel_mode)
@@ -417,19 +439,28 @@ def compute_vocab_grad_norm(
     norm_type=2,
     zero_mode=ParallelMode.ZERO1,
 ):
+    weight_parallel_mode = ParallelMode.WEIGHT if gpc.config.parallel.tensor.mode == "isp" else ParallelMode.TENSOR
     enable_cuda_kernels = gradients[0].device.type == "cuda"
     # Norm parameters.
     norm_type = float(norm_type)
     vocab_size = gpc.config.model["vocab_size"]
+
+    if is_tensor_data_parallel_parameter(parameters[0]) or is_tensor_zero_parallel_parameter(parameters[0]):
+        param_parallel_mode = ParallelMode.TENSOR
+    elif gpc.is_using_parallel_mode(weight_parallel_mode):
+        param_parallel_mode = weight_parallel_mode
+    else:
+        param_parallel_mode = ParallelMode.TENSOR
 
-    param_grads = reduce_grads(gradients, parameters, only_output=True)
+    param_grads = reduce_grads(gradients, parameters, weight_parallel_mode, only_output=True)
 
     vocab_grad_norm = torch.zeros((vocab_size,), dtype=torch.float32).to(get_current_device())
     if param_grads:
         for grad in param_grads:
             # get grad norm of each vocab
             vocab_slice_size = grad.shape[0]
-            local_tp_rank = gpc.get_local_rank(ParallelMode.TENSOR)
+            local_tp_rank = gpc.get_local_rank(param_parallel_mode)
             for i in range(vocab_slice_size):
                 cur_vocab_grad_norm = get_norm([grad[i, :]], norm_type, enable_cuda_kernels)[0]
                 vocab_grad_norm[i + vocab_slice_size * local_tp_rank] += get_tensor_norm(
@@ -442,14 +473,18 @@ def compute_vocab_grad_norm(
     if previous_vocab_grad_norm is not None:
         vocab_grad_norm = vocab_grad_norm + previous_vocab_grad_norm
 
-    if gpc.is_initialized(ParallelMode.MODEL):
-        dist.all_reduce(
-            vocab_grad_norm,
-            op=dist.ReduceOp.SUM,
-            group=gpc.get_group(ParallelMode.MODEL),
-        )
+    if is_tensor_data_parallel_parameter(parameters[0]) or is_tensor_zero_parallel_parameter(parameters[0]):
+        if gpc.is_using_parallel_mode(ParallelMode.TENSOR):
+            dist.all_reduce(vocab_grad_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.TENSOR))
+    else:
+        if gpc.is_using_parallel_mode(weight_parallel_mode):
+            dist.all_reduce(vocab_grad_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(weight_parallel_mode))
+
+    if gpc.is_using_parallel_mode(ParallelMode.PIPELINE):
+        dist.all_reduce(vocab_grad_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.PIPELINE))
 
-    dist.all_reduce(vocab_grad_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(zero_mode))
+    if gpc.is_using_parallel_mode(zero_mode):
+        dist.all_reduce(vocab_grad_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(zero_mode))
 
     if zero_mode == ParallelMode.EXPERT_DATA:
         pg = gpc.get_group(ParallelMode.EXPERT)
@@ -479,11 +514,28 @@ def compute_param_metric(
 
     Arguments:
metric_type: (norm | zero_grad) """ + + def reduce_param_metric(input_param_metrics: Dict, parallel_mode): + output_param_metrics = {} + parallel_param_metrics = [None for _ in range(gpc.get_world_size(parallel_mode))] + dist.all_gather_object(parallel_param_metrics, input_param_metrics, group=gpc.get_group(parallel_mode)) + for local_param_metric in parallel_param_metrics: + for param_name, param_metric in local_param_metric.items(): + if param_name not in output_param_metrics: + output_param_metrics[param_name] = 0.0 + if metric_type == "norm" and norm_type == inf: + output_param_metrics[param_name] = max( + output_param_metrics[param_name], param_metric + ) + else: + output_param_metrics[param_name] += param_metric + return output_param_metrics + weight_parallel_mode = ParallelMode.WEIGHT if gpc.config.parallel.tensor.mode == "isp" else ParallelMode.TENSOR enable_cuda_kernels = gradients[0].device.type == "cuda" - total_metrics = {} + param_metrics = {} - param_grads = reduce_grads(gradients, parameters, fine_grained=True) + param_grads = reduce_grads(gradients, parameters, weight_parallel_mode, fine_grained=True) if metric_type == "norm": # Norm parameters. @@ -510,65 +562,52 @@ def compute_param_metric( else: param_metrics[key] += value - # model parallel - model_parallel_param_metrics = {} - if gpc.is_initialized(ParallelMode.MODEL): - parallel_param_metrics = [None for _ in range(gpc.get_world_size(ParallelMode.MODEL))] - dist.all_gather_object(parallel_param_metrics, param_metrics, group=gpc.get_group(ParallelMode.MODEL)) - for local_param_metric in parallel_param_metrics: - for param_name, param_metric in local_param_metric.items(): - if param_name not in model_parallel_param_metrics: - model_parallel_param_metrics[param_name] = 0.0 - if metric_type == "norm" and norm_type == inf: - model_parallel_param_metrics[param_name] = max( - model_parallel_param_metrics[param_name], param_metric - ) - else: - model_parallel_param_metrics[param_name] += param_metric + # tensor parallel / weight parallel + if is_tensor_data_parallel_parameter(parameters[0]) or is_tensor_zero_parallel_parameter(parameters[0]): + if gpc.is_using_parallel_mode(ParallelMode.TENSOR): + param_metrics = reduce_param_metric(param_metrics, ParallelMode.TENSOR) + elif gpc.is_using_parallel_mode(weight_parallel_mode): + param_metrics = reduce_param_metric(param_metrics, weight_parallel_mode) + + # pipeline parallel + if gpc.is_using_parallel_mode(ParallelMode.PIPELINE): + param_metrics = reduce_param_metric(param_metrics, ParallelMode.PIPELINE) # zero parallel - zero_param_metrics = [None for _ in range(gpc.get_world_size(zero_mode))] - dist.all_gather_object(zero_param_metrics, model_parallel_param_metrics, group=gpc.get_group(zero_mode)) - for local_param_metric in zero_param_metrics: - for param_name, param_metric in local_param_metric.items(): - if param_name not in total_metrics: - total_metrics[param_name] = 0.0 - if metric_type == "norm" and norm_type == inf: - total_metrics[param_name] = max(total_metrics[param_name], param_metric) - else: - total_metrics[param_name] += param_metric + if gpc.is_using_parallel_mode(zero_mode): + param_metrics = reduce_param_metric(param_metrics, zero_mode) # moe if zero_mode == ParallelMode.EXPERT_DATA: pg = gpc.get_group(ParallelMode.EXPERT) - total_metric_values = list(total_metrics.values()) - if isinstance(total_metric_values[0], torch.Tensor): - scaled_param_metric = torch.stack(total_metric_values).to(device=get_current_device()) + param_metric_values = 
list(param_metrics.values()) + if isinstance(param_metric_values[0], torch.Tensor): + scaled_param_metric = torch.stack(param_metric_values).to(device=get_current_device()) else: - scaled_param_metric = torch.cuda.FloatTensor(total_metric_values, device=get_current_device()) + scaled_param_metric = torch.cuda.FloatTensor(param_metric_values, device=get_current_device()) scaled_param_metric = scaled_param_metric / float(gpc.get_world_size(ParallelMode.EXPERT)) dist.all_reduce(scaled_param_metric, group=pg) - for i, param_name in enumerate(total_metrics.keys()): - total_metrics[param_name] = scaled_param_metric[i] + for i, param_name in enumerate(param_metrics.keys()): + param_metrics[param_name] = scaled_param_metric[i] # calc zero grad percent if metric_type == "zero_grad": - for param_name, param_metric in total_metrics.items(): - total_metrics[param_name] = (param_metric[0] / param_metric[1]).item() + for param_name, param_metric in param_metrics.items(): + param_metrics[param_name] = (param_metric[0] / param_metric[1]).item() # scale norm if metric_type == "norm": - for param_name, param_metric in total_metrics.items(): + for param_name, param_metric in param_metrics.items(): if torch.is_tensor(param_metric): param_metric = param_metric.item() if param_metric in (inf, -inf): - total_metrics[param_name] = -1 + param_metrics[param_name] = -1 elif math.isnan(param_metric): - total_metrics[param_name] = -2 + param_metrics[param_name] = -2 else: - total_metrics[param_name] = param_metric + param_metrics[param_name] = param_metric - return total_metrics + return param_metrics def compute_param_norm( From c8b100ed6f6dba0c0dcb182e64b8dd9e21be670c Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 22 Jan 2024 14:09:24 +0800 Subject: [PATCH 120/153] fix(communication/isp.py): fix bias switch for mem pool --- internlm/core/communication/isp.py | 2 +- internlm/model/__init__.py | 3 ++- internlm/model/multi_head_attention.py | 7 ++++--- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/internlm/core/communication/isp.py b/internlm/core/communication/isp.py index 0640f981..d0bbe2ed 100644 --- a/internlm/core/communication/isp.py +++ b/internlm/core/communication/isp.py @@ -209,7 +209,7 @@ def __init__( # init memory pool if necessary. 
if self.enable_memory_pool: - self.memory_pool = MemoryPool(model_conf) + self.memory_pool = MemoryPool(model_conf, with_bias=True) else: self.memory_pool = None diff --git a/internlm/model/__init__.py b/internlm/model/__init__.py index a4efc033..c10552c3 100644 --- a/internlm/model/__init__.py +++ b/internlm/model/__init__.py @@ -8,7 +8,7 @@ from .modeling_llama import build_model_with_cfg as build_model_with_llama_cfg from .modeling_moe import build_model_with_moe_cfg from .moe import MoE -from .multi_head_attention import MHA +from .multi_head_attention import MHA, DistributedAttention from .utils import gather_forward_split_backward __all__ = [ @@ -20,6 +20,7 @@ "ScaleColumnParallelLinear", "AccPerplex", "MHA", + "DistributedAttention", "gather_forward_split_backward", "build_model_with_cfg", "build_model_with_moe_cfg", diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index 87e2d42a..825e3f21 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -187,6 +187,7 @@ def __init__( self.num_heads = num_heads assert self.embed_dim % num_heads == 0, "self.kdim must be divisible by num_heads" self.head_dim = self.embed_dim // num_heads + self.tp_mode = tp_mode if self.rotary_emb_dim > 0: if self.use_dynamic_ntk_rope: @@ -204,7 +205,7 @@ def __init__( ) # notice here should change bias=True - Wqkv_cls = get_linear_cls(tp_mode, "column") + Wqkv_cls = get_linear_cls(self.tp_mode, "column") self.Wqkv = Wqkv_cls( embed_dim, 3 * embed_dim, @@ -220,14 +221,14 @@ def __init__( self.inner_cross_attn = inner_cross_attn_cls( causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout ) - if tp_mode == "isp": + if self.tp_mode == "isp": self.inner_attn = DistributedAttention(self.inner_attn, sequence_process_group=sequence_process_group) self.inner_cross_attn = DistributedAttention( self.inner_cross_attn, sequence_process_group=sequence_process_group ) # output projection always have the bias (for now) - out_proj_cls = get_linear_cls(tp_mode, "row") + out_proj_cls = get_linear_cls(self.tp_mode, "row") self.out_proj = out_proj_cls( embed_dim, embed_dim, From c606bb57a86238187ff636957eab4f12a5f1477b Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 22 Jan 2024 14:43:07 +0800 Subject: [PATCH 121/153] fix(model/utils.py): fix boolean value ambiguous error --- internlm/model/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 48eb4b78..c6ae7002 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -603,7 +603,7 @@ def backward(ctx, grad_output, *args): if ctx.needs_input_grad[1]: if grad_weight_sync: grad_weight_sync.wait() - if grad_bias and grad_bias_sync: + if grad_bias is not None and grad_bias_sync is not None: grad_bias_sync.wait() return grad_input, grad_weight, grad_bias, None, None, None, None, None, None From 70a17d62435c19a571df03316da73591630ad28c Mon Sep 17 00:00:00 2001 From: JiaoPL Date: Mon, 22 Jan 2024 17:39:33 +0800 Subject: [PATCH 122/153] test grad profiling with mtp,msp,fsp,isp --- .../solver/optimizer/hybrid_zero_optim.py | 79 ++++++++++++++++++- internlm/solver/optimizer/utils.py | 25 +++--- internlm/train/training_internlm.py | 2 +- 3 files changed, 91 insertions(+), 15 deletions(-) diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index dac0453c..7cdbe3ff 100644 --- 
a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -661,13 +661,26 @@ def _compute_param_norm_stage( ): # compute norm for gradients that have been reduced params, grads = self._param_store.get_reduced_param_for_compute_norm(group_id=group_id, last_bucket=last_bucket) - + params_is_padding = False total_param_norms = {} if len(params) == 0: + params_is_padding = True dtype = self.param_groups[group_id]["dtype"] grads = [self.padding_grad.to(dtype)] params = [self.padding_tensor.to(dtype)] + if group_id == 0: + for param in params: + if self.use_isp: + setattr(param, IS_WEIGHT_ZERO_PARALLEL, True) + else: + setattr(param, IS_TENSOR_ZERO_PARALLEL, True) + elif group_id == 1: + for param in params: + setattr(param, IS_TENSOR_DATA_PARALLEL, True) + else: + raise NotImplementedError("group_id > 1 is not yet implemented.") + if self._clip_grad_norm > 0: total_param_norms = compute_param_norm( grads, @@ -676,17 +689,43 @@ def _compute_param_norm_stage( previous_param_norms=previous_param_norms, zero_mode=self._broadcast_parallel_mode[group_id], ) + + if params_is_padding: + for param in params: + if hasattr(param, IS_REPLICA_ZERO_PARALLEL): + delattr(param, IS_REPLICA_ZERO_PARALLEL) + if hasattr(param, IS_TENSOR_DATA_PARALLEL): + delattr(param, IS_TENSOR_DATA_PARALLEL) + if hasattr(param, IS_TENSOR_ZERO_PARALLEL): + delattr(param, IS_TENSOR_ZERO_PARALLEL) + if hasattr(param, IS_WEIGHT_ZERO_PARALLEL): + delattr(param, IS_WEIGHT_ZERO_PARALLEL) + return total_param_norms def _compute_vocab_grad_norm_stage( self, group_id: int = 0, last_bucket: bool = False, last_stage: bool = False, previous_vocab_grad_norm=None ): params, grads = self._param_store.get_reduced_param_for_compute_norm(group_id=group_id, last_bucket=last_bucket) + params_is_padding = False if len(params) == 0: + params_is_padding = True dtype = self.param_groups[group_id]["dtype"] grads = [self.padding_grad.to(dtype)] params = [self.padding_tensor.to(dtype)] + if group_id == 0: + for param in params: + if self.use_isp: + setattr(param, IS_WEIGHT_ZERO_PARALLEL, True) + else: + setattr(param, IS_TENSOR_ZERO_PARALLEL, True) + elif group_id == 1: + for param in params: + setattr(param, IS_TENSOR_DATA_PARALLEL, True) + else: + raise NotImplementedError("group_id > 1 is not yet implemented.") + vocab_grad_norm = None if self._clip_grad_norm > 0: @@ -698,20 +737,44 @@ def _compute_vocab_grad_norm_stage( zero_mode=self._broadcast_parallel_mode[group_id], ) + if params_is_padding: + for param in params: + if hasattr(param, IS_REPLICA_ZERO_PARALLEL): + delattr(param, IS_REPLICA_ZERO_PARALLEL) + if hasattr(param, IS_TENSOR_DATA_PARALLEL): + delattr(param, IS_TENSOR_DATA_PARALLEL) + if hasattr(param, IS_TENSOR_ZERO_PARALLEL): + delattr(param, IS_TENSOR_ZERO_PARALLEL) + if hasattr(param, IS_WEIGHT_ZERO_PARALLEL): + delattr(param, IS_WEIGHT_ZERO_PARALLEL) + return vocab_grad_norm def _count_zero_grads_stage( self, group_id: int = 0, last_bucket: bool = False, last_stage: bool = False, previous_zero_grad_count=None ): params, grads = self._param_store.get_reduced_param_for_compute_norm(group_id=group_id, last_bucket=last_bucket) - + params_is_padding = False total_zero_grad_count = {} if len(params) == 0: + params_is_padding = True dtype = self.param_groups[group_id]["dtype"] grads = [self.padding_grad.to(dtype)] params = [self.padding_tensor.to(dtype)] + if group_id == 0: + for param in params: + if self.use_isp: + setattr(param, IS_WEIGHT_ZERO_PARALLEL, True) + else: + setattr(param, 
IS_TENSOR_ZERO_PARALLEL, True) + elif group_id == 1: + for param in params: + setattr(param, IS_TENSOR_DATA_PARALLEL, True) + else: + raise NotImplementedError("group_id > 1 is not yet implemented.") + if self._clip_grad_norm > 0: total_zero_grad_count = compute_zero_grad_count( grads, @@ -720,6 +783,18 @@ def _count_zero_grads_stage( previous_zero_grad_count=previous_zero_grad_count, zero_mode=self._broadcast_parallel_mode[group_id], ) + + if params_is_padding: + for param in params: + if hasattr(param, IS_REPLICA_ZERO_PARALLEL): + delattr(param, IS_REPLICA_ZERO_PARALLEL) + if hasattr(param, IS_TENSOR_DATA_PARALLEL): + delattr(param, IS_TENSOR_DATA_PARALLEL) + if hasattr(param, IS_TENSOR_ZERO_PARALLEL): + delattr(param, IS_TENSOR_ZERO_PARALLEL) + if hasattr(param, IS_WEIGHT_ZERO_PARALLEL): + delattr(param, IS_WEIGHT_ZERO_PARALLEL) + return total_zero_grad_count @llm_timeout(func_name="optim_step") diff --git a/internlm/solver/optimizer/utils.py b/internlm/solver/optimizer/utils.py index 3dd510ff..a1964282 100644 --- a/internlm/solver/optimizer/utils.py +++ b/internlm/solver/optimizer/utils.py @@ -237,7 +237,7 @@ def reduce_grads(gradients, parameters, weight_parallel_mode, fine_grained=False parallel_grads = [] if fine_grained: parallel_grads = {} - + if is_tensor_data_parallel_parameter(parameters[0]) or is_tensor_zero_parallel_parameter(parameters[0]): param_parallel_mode = ParallelMode.TENSOR elif gpc.is_using_parallel_mode(weight_parallel_mode): @@ -444,8 +444,7 @@ def compute_vocab_grad_norm( # Norm parameters. norm_type = float(norm_type) vocab_size = gpc.config.model["vocab_size"] - - + if is_tensor_data_parallel_parameter(parameters[0]) or is_tensor_zero_parallel_parameter(parameters[0]): param_parallel_mode = ParallelMode.TENSOR elif gpc.is_using_parallel_mode(weight_parallel_mode): @@ -478,10 +477,10 @@ def compute_vocab_grad_norm( dist.all_reduce(vocab_grad_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.TENSOR)) else: if gpc.is_using_parallel_mode(weight_parallel_mode): - dist.all_reduce(vocab_grad_norm,op=dist.ReduceOp.SUM, group=gpc.get_group(weight_parallel_mode)) + dist.all_reduce(vocab_grad_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(weight_parallel_mode)) if gpc.is_using_parallel_mode(ParallelMode.PIPELINE): - dist.all_reduce(vocab_grad_norm,op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.PIPELINE)) + dist.all_reduce(vocab_grad_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.PIPELINE)) if gpc.is_using_parallel_mode(zero_mode): dist.all_reduce(vocab_grad_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(zero_mode)) @@ -514,7 +513,7 @@ def compute_param_metric( Argumemts: metric_type: (norm | zero_grad) """ - + def reduce_param_metric(input_param_metrics: Dict, parallel_mode): output_param_metrics = {} parallel_param_metrics = [None for _ in range(gpc.get_world_size(parallel_mode))] @@ -524,9 +523,7 @@ def reduce_param_metric(input_param_metrics: Dict, parallel_mode): if param_name not in output_param_metrics: output_param_metrics[param_name] = 0.0 if metric_type == "norm" and norm_type == inf: - output_param_metrics[param_name] = max( - output_param_metrics[param_name], param_metric - ) + output_param_metrics[param_name] = max(output_param_metrics[param_name], param_metric) else: output_param_metrics[param_name] += param_metric return output_param_metrics @@ -563,11 +560,15 @@ def reduce_param_metric(input_param_metrics: Dict, parallel_mode): param_metrics[key] += value # tensor parallel / weight parallel - if 
is_tensor_data_parallel_parameter(parameters[0]) or is_tensor_zero_parallel_parameter(parameters[0]): + if is_tensor_data_parallel_parameter(parameters[0]): if gpc.is_using_parallel_mode(ParallelMode.TENSOR): param_metrics = reduce_param_metric(param_metrics, ParallelMode.TENSOR) - elif gpc.is_using_parallel_mode(weight_parallel_mode): - param_metrics = reduce_param_metric(param_metrics, weight_parallel_mode) + elif is_tensor_zero_parallel_parameter(parameters[0]): + if gpc.is_using_parallel_mode(ParallelMode.TENSOR): + param_metrics = reduce_param_metric(param_metrics, ParallelMode.TENSOR) + else: + if gpc.is_using_parallel_mode(weight_parallel_mode): + param_metrics = reduce_param_metric(param_metrics, weight_parallel_mode) # pipeline parallel if gpc.is_using_parallel_mode(ParallelMode.PIPELINE): diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index d5f124f4..108a918b 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -62,7 +62,7 @@ from internlm.solver.optimizer import FSDPadaptOptimizer, HybridZeroOptimizer from internlm.solver.optimizer.utils import ParamBcastSyncHandler from internlm.train.utils import create_param_groups -from internlm.utils.common import DummyProfile, get_current_device +from internlm.utils.common import DummyProfile, get_current_device, launch_time from internlm.utils.logger import get_logger from internlm.utils.megatron_timers import megatron_timer as timer from internlm.utils.parallel import ( From 4e9b27664fa740df38008eb5f1462129b1341a68 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 22 Jan 2024 18:15:14 +0800 Subject: [PATCH 123/153] feat(training_internlm.py): update initialize_model func to adapt to private repo --- internlm/train/training_internlm.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index d5f124f4..b6db7b22 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -31,7 +31,7 @@ ) from internlm.core.context import global_context as gpc from internlm.core.context.random import set_mode -from internlm.core.naive_amp import NaiveAMPModel +from internlm.core.naive_amp import NaiveAMPModel, set_fp32_attr_to_module from internlm.core.trainer import TrainState from internlm.data.batch_sampler import StaticBatchSampler, get_dpsampler_dataloader from internlm.data.collaters import jsonl_ds_collate_fn, packed_collate_fn @@ -81,7 +81,17 @@ logger = get_logger(__file__) -def set_attr_for_param_groups(model: Union[nn.Module, nn.ModuleList]): +def set_fp32_attr_for_model(model: Union[nn.Module, nn.ModuleList]): + if not isinstance(model, nn.ModuleList): + model = [model] + + for _chunk in model: + for _, module in _chunk.named_modules(): + if isinstance(module, (RMSNorm, nn.LayerNorm)) and gpc.config.model.get("use_fp32_norm", False): + set_fp32_attr_to_module(module) + + +def set_parallel_attr_for_param_groups(model: Union[nn.Module, nn.ModuleList]): def _check_module(module): # layer_norm if isinstance(module, (RMSNorm, nn.LayerNorm)): @@ -111,6 +121,7 @@ def _check_module(module): if isinstance(_chunk, NaiveAMPModel): _chunk = _chunk.model + # set param parallel attribute for name, module in _chunk.named_modules(): _check_module(module) @@ -124,7 +135,7 @@ def _check_module(module): @llm_timeout(func_name="initialize_model") -def initialize_model(): +def initialize_model(pre_process_func: 
Optional[Callable] = None, post_process_func: Optional[Callable] = None): """ Initialize model with Automatic Mixed Precision. @@ -132,8 +143,15 @@ def initialize_model(): torch.nn.Module: The neural network model to be trained or evaluated. """ - + if pre_process_func: + pre_process_output = pre_process_func() model = MODEL_INITIALIZER.get_module(module_name=gpc.config.model_type)(**(gpc.config.model)) + if post_process_func: + post_process_func(pre_process_output) + + # should be set before NaiveAMPModel + set_fp32_attr_for_model(model) + if isinstance(model, nn.ModuleList): model = nn.ModuleList( [ @@ -154,7 +172,7 @@ def initialize_model(): sync_buffer=False, ) - set_attr_for_param_groups(model) + set_parallel_attr_for_param_groups(model) # This sync is very important, cause the model weights kept in optimizer are copied # from the origin parameters in the memory, so we should make sure the dp sync From 32df5ad2cdb2e30a33fc6b10b4e57f9ed8f775af Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 22 Jan 2024 19:42:19 +0800 Subject: [PATCH 124/153] feat(training_internlm.py): move get_scheduler_hooks from train.py to training_internlm.py --- internlm/model/modeling_internlm.py | 1 + internlm/train/__init__.py | 2 ++ internlm/train/training_internlm.py | 28 ++++++++++++++++++++++++++-- train.py | 23 +---------------------- 4 files changed, 30 insertions(+), 24 deletions(-) diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index 01f647e1..6a237641 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -507,6 +507,7 @@ def build_model_with_cfg( dropout_selective_checkpoint=True, use_scaled_init: bool = True, use_swiglu: bool = True, + use_fp32_norm: bool = True, use_flash_attn: bool = True, rope_base: int = 10000, ): diff --git a/internlm/train/__init__.py b/internlm/train/__init__.py index 1fd08028..90bb5d86 100644 --- a/internlm/train/__init__.py +++ b/internlm/train/__init__.py @@ -7,6 +7,7 @@ load_new_batch, record_current_batch_training_metrics, wrap_FSDP_model, + get_scheduler_hooks, ) __all__ = [ @@ -18,4 +19,5 @@ "load_new_batch", "record_current_batch_training_metrics", "wrap_FSDP_model", + "get_scheduler_hooks", ] diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index b6db7b22..88d404a8 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -6,7 +6,7 @@ import pickle import time from functools import partial -from typing import Callable, Iterable, Optional, Union +from typing import Callable, Iterable, Optional, Union, List import torch import torch.distributed as dist @@ -62,7 +62,7 @@ from internlm.solver.optimizer import FSDPadaptOptimizer, HybridZeroOptimizer from internlm.solver.optimizer.utils import ParamBcastSyncHandler from internlm.train.utils import create_param_groups -from internlm.utils.common import DummyProfile, get_current_device +from internlm.utils.common import DummyProfile, get_current_device, SchedulerHook from internlm.utils.logger import get_logger from internlm.utils.megatron_timers import megatron_timer as timer from internlm.utils.parallel import ( @@ -76,6 +76,8 @@ ) from internlm.utils.registry import MODEL_INITIALIZER from internlm.utils.timeout import llm_timeout +from internlm.model.metrics import SchedulerMetricHook +from internlm.core.communication.isp import ISPCommunicatorSchedulerHook RMSNorm = try_import_RMSNorm() logger = get_logger(__file__) @@ -303,6 +305,28 @@ def 
initialize_optimizer(model: Union[nn.Module, nn.ModuleList], isp_communicato return optimizer, beta2_scheduler, lr_scheduler +def get_scheduler_hooks(metric, zero_optim, isp_communicator) -> List[SchedulerHook]: + scheduler_hooks: List[SchedulerHook] = [] + + if metric is not None: + scheduler_hooks.append( + SchedulerMetricHook( + metric=metric, + skip=( + gpc.is_using_parallel_mode(ParallelMode.PIPELINE) + and hasattr(gpc.config.model, "num_chunks") + and gpc.config.model.num_chunks > 1 + and gpc.config.parallel["pipeline"].get("interleaved_overlap", False) + ), + ), + ) + + if isp_communicator is not None: + scheduler_hooks.append(ISPCommunicatorSchedulerHook(isp_communicator, zero_optim)) + + return scheduler_hooks + + @llm_timeout(func_name="get_train_data_loader") def get_train_data_loader(num_worker: int = 0, dataset_generate_func: Optional[Callable] = None): """ diff --git a/train.py b/train.py index 46775ac9..bd931890 100644 --- a/train.py +++ b/train.py @@ -29,6 +29,7 @@ initialize_optimizer, load_new_batch, record_current_batch_training_metrics, + get_scheduler_hooks, ) from internlm.utils.common import ( BatchSkipper, @@ -71,28 +72,6 @@ def initialize_llm_logger(start_time: str): return uniscale_logger -def get_scheduler_hooks(metric, zero_optim, isp_communicator) -> List[SchedulerHook]: - scheduler_hooks: List[SchedulerHook] = [] - - if metric is not None: - scheduler_hooks.append( - SchedulerMetricHook( - metric=metric, - skip=( - gpc.is_using_parallel_mode(ParallelMode.PIPELINE) - and hasattr(gpc.config.model, "num_chunks") - and gpc.config.model.num_chunks > 1 - and gpc.config.parallel["pipeline"].get("interleaved_overlap", False) - ), - ), - ) - - if isp_communicator is not None: - scheduler_hooks.append(ISPCommunicatorSchedulerHook(isp_communicator, zero_optim)) - - return scheduler_hooks - - def main(args): # init setting skip_batches = gpc.config.data.skip_batches From d388ddc2ecf5ad633d73fe6f58536ccf2bf1d96f Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Tue, 23 Jan 2024 11:05:06 +0800 Subject: [PATCH 125/153] feat(model): fix dict has no attri mode error --- internlm/model/modeling_internlm.py | 4 +++- internlm/solver/optimizer/hybrid_zero_optim.py | 5 ++++- internlm/solver/optimizer/utils.py | 4 +++- internlm/train/training_internlm.py | 16 ++++++++++------ internlm/train/utils.py | 2 +- internlm/utils/model_checkpoint.py | 16 ++++++++-------- internlm/utils/parallel.py | 14 +++++++++----- 7 files changed, 38 insertions(+), 23 deletions(-) diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index 6a237641..934981f2 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -307,7 +307,9 @@ def __init__( super().__init__() checkpoint_layer_num = int(num_layers * checkpoint) - self.tp_mode = gpc.config.parallel.tensor.mode + self.tp_mode = "mtp" + if isinstance(gpc.config.parallel.tensor, dict): + self.tp_mode = gpc.config.parallel.tensor.get("mode", "mtp") if is_reward: head_cls = RewardModelLinear diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index dac0453c..056bb6de 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -84,7 +84,10 @@ def __init__( clip_grad_norm = zero_cfg.clip_grad_norm self._overlap_sync_grad = zero_cfg.overlap_sync_grad self._overlap_sync_param = zero_cfg.overlap_sync_param - self.use_isp = gpc.config.parallel.tensor.mode == "isp" 
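        # (editorial sketch of the failure this hunk guards against: user configs may
        # set `tensor` to a plain int, e.g. tensor=2 as in configs/7B_sft.py, or to a
        # raw dict without a "mode" key, so attribute access like
        # gpc.config.parallel.tensor.mode raises AttributeError; the dict-style lookup
        # with an "mtp" default, as in the replacement lines below, is the safe form:)
        #     tensor_cfg = gpc.config.parallel["tensor"]
        #     tp_mode = tensor_cfg.get("mode", "mtp") if isinstance(tensor_cfg, dict) else "mtp"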
+ self.use_isp = ( + isinstance(gpc.config.parallel["tensor"], dict) + and gpc.config.parallel["tensor"].get("mode", "mtp") == "isp" + ) super().__init__(optim=optimizer) diff --git a/internlm/solver/optimizer/utils.py b/internlm/solver/optimizer/utils.py index 22839adb..31073336 100644 --- a/internlm/solver/optimizer/utils.py +++ b/internlm/solver/optimizer/utils.py @@ -308,7 +308,9 @@ def compute_norm( Total norm of the parameters, need total_norm**(1/norm) before using. """ - weight_parallel_mode = ParallelMode.WEIGHT if gpc.config.parallel.tensor.mode == "isp" else ParallelMode.TENSOR + weight_parallel_mode = ( + ParallelMode.WEIGHT if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp" else ParallelMode.TENSOR + ) enable_cuda_kernels = gradients[0].device.type == "cuda" # Norm parameters. norm_type = float(norm_type) diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 88d404a8..17e42418 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -94,6 +94,8 @@ def set_fp32_attr_for_model(model: Union[nn.Module, nn.ModuleList]): def set_parallel_attr_for_param_groups(model: Union[nn.Module, nn.ModuleList]): + tp_mode = gpc.config.parallel["tensor"].get("mode", "mtp") + def _check_module(module): # layer_norm if isinstance(module, (RMSNorm, nn.LayerNorm)): @@ -103,17 +105,17 @@ def _check_module(module): # embedding and head if isinstance(module, (Embedding1D, ParallelGPT2Embeddings, BaseScaleColumnParallelLinear)): for param in module.parameters(): - if gpc.is_initialized(ParallelMode.TENSOR) and gpc.config.parallel.tensor.mode == "isp": + if gpc.is_initialized(ParallelMode.TENSOR) and tp_mode == "isp": setattr(param, IS_TENSOR_DATA_PARALLEL, True) - elif gpc.is_initialized(ParallelMode.TENSOR) and gpc.config.parallel.tensor.mode != "isp": + elif gpc.is_initialized(ParallelMode.TENSOR) and tp_mode != "isp": setattr(param, IS_TENSOR_ZERO_PARALLEL, True) # for linear module if isinstance(module, (ColumnParallelLinear, RowParallelLinear)): for param in module.parameters(): - if gpc.is_initialized(ParallelMode.TENSOR) and gpc.config.parallel.tensor.mode != "isp": + if gpc.is_initialized(ParallelMode.TENSOR) and tp_mode != "isp": setattr(param, IS_TENSOR_ZERO_PARALLEL, True) - elif gpc.is_initialized(ParallelMode.WEIGHT) and gpc.config.parallel.tensor.mode == "isp": + elif gpc.is_initialized(ParallelMode.WEIGHT) and tp_mode == "isp": setattr(param, IS_WEIGHT_ZERO_PARALLEL, True) if not isinstance(model, nn.ModuleList): @@ -187,13 +189,15 @@ def initialize_model(pre_process_func: Optional[Callable] = None, post_process_f # Change random state mode to ParallelMode.DATA after model is built, guaranteeing the random # state in the same dp group are all the same. 
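    # (editorial note, mirroring the replacement lines below: under isp the weights
    # are sharded over ParallelMode.WEIGHT, so ranks holding the same weight shard
    # form the WEIGHT_DATA group and must share RNG state, while the other
    # tensor-parallel modes keep the plain DATA group:)
    #     mode = gpc.config.parallel["tensor"].get("mode", "mtp")
    #     set_mode(ParallelMode.WEIGHT_DATA if mode == "isp" else ParallelMode.DATA)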
- random_mode = ParallelMode.WEIGHT_DATA if gpc.config.parallel.tensor["mode"] == "isp" else ParallelMode.DATA + random_mode = ( + ParallelMode.WEIGHT_DATA if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp" else ParallelMode.DATA + ) set_mode(random_mode) # if fsdp enabled, wrap the model model = wrap_FSDP_model(model) - if gpc.config.parallel.tensor.mode != "isp": + if gpc.config.parallel["tensor"].get("mode", "mtp") != "isp": isp_communicator = None else: isp_communicator = ISPCommunicator( diff --git a/internlm/train/utils.py b/internlm/train/utils.py index ed4b7415..97f49f74 100644 --- a/internlm/train/utils.py +++ b/internlm/train/utils.py @@ -126,7 +126,7 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy pgroup["optimizer_mode"] = ParallelMode.ZERO1 # param groups may contain empty groups, such as embed_head - if gpc.config.parallel.tensor.mode == "isp": + if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp": param_groups.extend(new_groups.values()) else: assert len(new_groups["embed_head"]["params"]) <= 0 diff --git a/internlm/utils/model_checkpoint.py b/internlm/utils/model_checkpoint.py index 322ddf1e..62ee66aa 100644 --- a/internlm/utils/model_checkpoint.py +++ b/internlm/utils/model_checkpoint.py @@ -325,7 +325,7 @@ def save_model_checkpoint(folder, model): # even if pp is not considered, it will definitely not be written on the same machine. # for tensor parallel mode with isp - if gpc.config.parallel.tensor.mode == "isp": + if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp": if wdp_rank == 0 or dp_rank == 0: fn = f"model_tp{tp_rank}_wp{wp_rank}_pp{pp_rank}.pt" fp = os.path.join(folder, fn) @@ -564,7 +564,7 @@ def load_model_checkpoint(folder, model): for fn in fns: if fn.startswith("model_t") and not fn.endswith(".md5"): segements = os.path.splitext(fn)[0].split("_") - if gpc.config.parallel.tensor.mode == "isp": + if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp": max_pp = max(max_pp, int(segements[-1][2:])) max_wp = max(max_wp, int(segements[-2][2:])) max_tp = max(max_tp, int(segements[-3][2:])) @@ -590,7 +590,7 @@ def load_model_checkpoint(folder, model): dp_size == max_zo + 1 ), f"The weights are save for {max_zo+1} FSDP shards , while current has {dp_size} FSDP shards" - if gpc.config.parallel.tensor.mode == "isp": + if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp": should_load_name = f"model_tp{tp_rank}_wp{wp_rank}_pp{pp_rank}.pt" elif gpc.config.parallel.zero1.fsdp: should_load_name = f"model_tp{tp_rank}_pp{pp_rank}_dp{dp_rank}.pt" @@ -702,7 +702,7 @@ def save_optimizer_checkpoint(optim, state_path): states = optim.state_dict() if isinstance(optim, HybridZeroOptimizer): - if gpc.config.parallel.tensor.mode == "isp": + if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp": fp = f"optimizer_tp{tp_rank}_wp{wp_rank}_pp{pp_rank}_dp{dp_rank}.pt" llm_save(os.path.join(state_path, fp), states) else: @@ -752,7 +752,7 @@ def load_optimizer_checkpoint(folder, optim): max_tp, max_wp, max_pp, max_zero, max_dp = 0, 0, 0, 0, 0 for fn in fns: if fn.startswith("optimizer_") and not fn.endswith(".md5"): - if gpc.config.parallel.tensor.mode == "isp": + if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp": _, tp, wp, pp, dp = os.path.splitext(fn)[0].split("_") max_dp = max(max_dp, int(dp[2:])) max_tp = max(max_tp, int(tp[2:])) @@ -770,12 +770,12 @@ def load_optimizer_checkpoint(folder, optim): pp_size = gpc.get_world_size(ParallelMode.PIPELINE) dp_size = gpc.get_world_size(ParallelMode.DATA) 
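    # (editorial sketch of the on-disk naming this loader parses, taken from
    # save_optimizer_checkpoint earlier in this diff:
    #     isp mode:   optimizer_tp{tp}_wp{wp}_pp{pp}_dp{dp}.pt
    #     otherwise:  optimizer_tp{tp}_pp{pp}_zo{zero}.pt
    # so the max_dp / max_zero values recovered from the filenames must match the
    # current data-parallel / zero world sizes asserted below.)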
- if gpc.config.parallel.tensor.mode == "isp": + if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp": assert dp_size == max_dp + 1, ( f"The optimizer states are save for {max_dp+1} data parallelism, " f"while current has {dp_size} data parallelism" ) - if gpc.config.parallel.tensor.mode != "isp": + if gpc.config.parallel["tensor"].get("mode", "mtp") != "isp": assert zero_size == max_zero + 1, ( f"The optimizer states are save for {max_zero+1} zero parallel, " f"while current has {zero_size} zero broadcast range." @@ -795,7 +795,7 @@ def load_optimizer_checkpoint(folder, optim): wp_rank = gpc.get_local_rank(ParallelMode.WEIGHT) pp_rank = gpc.get_local_rank(ParallelMode.PIPELINE) dp_rank = gpc.get_local_rank(ParallelMode.DATA) - if gpc.config.parallel.tensor.mode == "isp": + if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp": fp = f"optimizer_tp{tp_rank}_wp{wp_rank}_pp{pp_rank}_dp{dp_rank}.pt" else: fp = f"optimizer_tp{tp_rank}_pp{pp_rank}_zo{zero_rank}.pt" diff --git a/internlm/utils/parallel.py b/internlm/utils/parallel.py index 2614fe11..703c5dd7 100644 --- a/internlm/utils/parallel.py +++ b/internlm/utils/parallel.py @@ -26,7 +26,7 @@ def is_replica_zero_parallel_parameter(p): def is_tensor_data_parallel_parameter(p): return ( gpc.is_initialized(ParallelMode.TENSOR) - and gpc.config.parallel.tensor.mode == "isp" + and gpc.config.parallel["tensor"].get("mode", "mtp") == "isp" and hasattr(p, IS_TENSOR_DATA_PARALLEL) and getattr(p, IS_TENSOR_DATA_PARALLEL) ) @@ -35,7 +35,7 @@ def is_tensor_data_parallel_parameter(p): def is_tensor_zero_parallel_parameter(p): return ( gpc.is_initialized(ParallelMode.TENSOR) - and gpc.config.parallel.tensor.mode != "isp" + and gpc.config.parallel["tensor"].get("mode", "mtp") != "isp" and hasattr(p, IS_TENSOR_ZERO_PARALLEL) and getattr(p, IS_TENSOR_ZERO_PARALLEL) ) @@ -44,7 +44,7 @@ def is_tensor_zero_parallel_parameter(p): def is_weight_zero_parallel_parameter(p): return ( gpc.is_initialized(ParallelMode.WEIGHT) - and gpc.config.parallel.tensor.mode == "isp" + and gpc.config.parallel["tensor"].get("mode", "mtp") == "isp" and hasattr(p, IS_WEIGHT_ZERO_PARALLEL) and getattr(p, IS_WEIGHT_ZERO_PARALLEL) ) @@ -58,7 +58,9 @@ def sync_model_param(model): """ sync_moe_param = gpc.is_using_parallel_mode(ParallelMode.EXPERT_DATA) - sync_parallel_mode = ParallelMode.WEIGHT_DATA if gpc.config.parallel.tensor["mode"] == "isp" else ParallelMode.DATA + sync_parallel_mode = ( + ParallelMode.WEIGHT_DATA if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp" else ParallelMode.DATA + ) for param in model.parameters(): if sync_moe_param and getattr(param, "is_expert", False): ranks = gpc.get_ranks_in_group(ParallelMode.EXPERT_DATA) @@ -79,7 +81,9 @@ def sync_model_replica_param_group(model): model (:class:`torch.nn.Module`): A pyTorch model on whose parameters you check the consistency. 
""" - parallel_mode = ParallelMode.WEIGHT if gpc.config.parallel.tensor["mode"] == "isp" else ParallelMode.TENSOR + parallel_mode = ( + ParallelMode.WEIGHT if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp" else ParallelMode.TENSOR + ) if gpc.is_using_parallel_mode(parallel_mode): for param in model.parameters(): if is_replica_zero_parallel_parameter(param): From 8e1b6199386b7da97714e1e1633a282cad4b2fa7 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Tue, 23 Jan 2024 15:28:41 +0800 Subject: [PATCH 126/153] feat(training_internlm.py): move use_fp32_norm config to gpc.config --- configs/7B_sft.py | 1 + internlm/model/modeling_internlm.py | 1 - internlm/train/training_internlm.py | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 66ffe0d0..615cd6c3 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -129,6 +129,7 @@ cur_iter=-1, ) +use_fp32_norm = False model = dict( checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] num_attention_heads=NUM_ATTENTION_HEAD, diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index 934981f2..8ba10d0e 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -509,7 +509,6 @@ def build_model_with_cfg( dropout_selective_checkpoint=True, use_scaled_init: bool = True, use_swiglu: bool = True, - use_fp32_norm: bool = True, use_flash_attn: bool = True, rope_base: int = 10000, ): diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 17e42418..f5e2712b 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -89,7 +89,7 @@ def set_fp32_attr_for_model(model: Union[nn.Module, nn.ModuleList]): for _chunk in model: for _, module in _chunk.named_modules(): - if isinstance(module, (RMSNorm, nn.LayerNorm)) and gpc.config.model.get("use_fp32_norm", False): + if isinstance(module, (RMSNorm, nn.LayerNorm)) and gpc.config.get("use_fp32_norm", False): set_fp32_attr_to_module(module) From 978cea8b6dacb0db2cdc8f6c5bb4aa1f8c827e4c Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Wed, 24 Jan 2024 12:38:46 +0800 Subject: [PATCH 127/153] feat(version): update internevo version and torch verion --- requirements/runtime.txt | 2 +- requirements/torch.txt | 8 ++++---- version.txt | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/requirements/runtime.txt b/requirements/runtime.txt index 814f69bb..d5922304 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -14,4 +14,4 @@ botocore torch-scatter pyecharts py-libnuma --f https://data.pyg.org/whl/torch-1.13.1+cu117.html +-f https://data.pyg.org/whl/torch-2.1.0+cu118.html diff --git a/requirements/torch.txt b/requirements/torch.txt index 4b1efcb7..c9a04b3d 100644 --- a/requirements/torch.txt +++ b/requirements/torch.txt @@ -1,4 +1,4 @@ ---extra-index-url https://download.pytorch.org/whl/cu117 -torch==1.13.1+cu117 -torchvision==0.14.1+cu117 -torchaudio==0.13.1 +--extra-index-url https://download.pytorch.org/whl/cu118 +torch==2.1.0+cu118 +torchvision==0.16.0+cu118 +torchaudio==2.1.0+cu118 diff --git a/version.txt b/version.txt index 0ea3a944..0d91a54c 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.2.0 +0.3.0 From d5fe8fe6873e5515722e574d791f0ecc369b2181 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Wed, 24 Jan 2024 13:09:21 +0800 Subject: [PATCH 
128/153] feat(context/parallel_context.py): set default parallel size in parallel context to fix e2e tests --- internlm/core/context/parallel_context.py | 11 +++++++++++ internlm/initialize/launch.py | 10 +--------- internlm/model/modeling_internlm.py | 4 ++-- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py index 9cc6bcdd..faa49dff 100644 --- a/internlm/core/context/parallel_context.py +++ b/internlm/core/context/parallel_context.py @@ -477,6 +477,17 @@ def init_parallel_groups(self): # set parallel size as attributes for global context parallel_config = self.config.get("parallel", None) if parallel_config is not None: + # set default value for parallel size + if "zero1" not in parallel_config: + parallel_config._add_item("zero1", dict(size=-1, fsdp=False)) + if "pipeline" not in parallel_config: + parallel_config._add_item("pipeline", dict(size=1, interleaved_overlap=False)) + if "tensor" not in parallel_config: + parallel_config._add_item("tensor", dict(size=1, mode="mtp")) + if "weight" not in parallel_config: + parallel_config._add_item("weight", dict(size=1, overlap=False, memory_pool=False)) + + # get value from config self._set_parallel_size_from_config(parallel_config, "weight", "weight_parallel_size") self._set_parallel_size_from_config(parallel_config, "tensor", "tensor_parallel_size") self._set_parallel_size_from_config(parallel_config, "pipeline", "pipeline_parallel_size") diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py index ed3dcad5..041445ca 100644 --- a/internlm/initialize/launch.py +++ b/internlm/initialize/launch.py @@ -466,7 +466,7 @@ def launch( logger.info( f"Distributed environment is initialized, " f"data parallel size: {gpc.data_parallel_size}, pipeline parallel size: {gpc.pipeline_parallel_size}, " - f"tensor parallel size: {gpc.tensor_parallel_size}", + f"tensor parallel size: {gpc.tensor_parallel_size}, weight parallel size: {gpc.weight_parallel_size}", ) if gpc.config.model.get("num_experts", 1) > 1: logger.info( @@ -475,14 +475,6 @@ def launch( f"number of local experts: {gpc.config.model.num_experts//gpc.expert_parallel_size}" ) - print( - f"global_rank:{gpc.get_global_rank()} wp_rank:{gpc.get_local_rank(ParallelMode.WEIGHT)} " - f"sp_rank:{gpc.get_local_rank(ParallelMode.SEQUENCE)} pp_rank:{gpc.get_local_rank(ParallelMode.PIPELINE)} " - f"zo1_rank:{gpc.get_local_rank(ParallelMode.ZERO1)} dp_rank:{gpc.get_local_rank(ParallelMode.DATA)} " - f"wdp_rank:{gpc.get_local_rank(ParallelMode.WEIGHT_DATA)}", - flush=True, - ) - def launch_from_slurm( config: Union[str, Path, Config, Dict], diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index 8ba10d0e..f06d6532 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -308,8 +308,8 @@ def __init__( checkpoint_layer_num = int(num_layers * checkpoint) self.tp_mode = "mtp" - if isinstance(gpc.config.parallel.tensor, dict): - self.tp_mode = gpc.config.parallel.tensor.get("mode", "mtp") + if isinstance(gpc.config.parallel["tensor"], dict): + self.tp_mode = gpc.config.parallel["tensor"].get("mode", "mtp") if is_reward: head_cls = RewardModelLinear From 1d64a22b52fe21435dab780365f1a71b6d068716 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Wed, 24 Jan 2024 14:49:58 +0800 Subject: [PATCH 129/153] feat(format): fix ci lint check error --- internlm/initialize/initialize_trainer.py | 2 +- 
internlm/model/metrics.py | 2 +- internlm/model/modeling_llama.py | 23 +--------------- internlm/train/__init__.py | 2 +- internlm/train/training_internlm.py | 33 +++++------------------ internlm/utils/common.py | 3 ++- internlm/utils/gputest.py | 9 +++---- train.py | 6 +---- 8 files changed, 18 insertions(+), 62 deletions(-) diff --git a/internlm/initialize/initialize_trainer.py b/internlm/initialize/initialize_trainer.py index 4827fbcf..91fddebe 100644 --- a/internlm/initialize/initialize_trainer.py +++ b/internlm/initialize/initialize_trainer.py @@ -25,7 +25,7 @@ from internlm.data.utils import unpack_data from internlm.solver.beta2_scheduler import Beta2Scheduler from internlm.solver.optimizer.hybrid_zero_optim import BaseOptimizer -from internlm.utils.common import get_current_device, SchedulerHook +from internlm.utils.common import SchedulerHook, get_current_device def initialize_trainer( diff --git a/internlm/model/metrics.py b/internlm/model/metrics.py index a19bbee0..aebdc13d 100644 --- a/internlm/model/metrics.py +++ b/internlm/model/metrics.py @@ -6,8 +6,8 @@ from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc -from internlm.utils.megatron_timers import megatron_timer as timer from internlm.utils.common import SchedulerHook +from internlm.utils.megatron_timers import megatron_timer as timer class AccPerplex: diff --git a/internlm/model/modeling_llama.py b/internlm/model/modeling_llama.py index 2638ce27..5f999c8f 100644 --- a/internlm/model/modeling_llama.py +++ b/internlm/model/modeling_llama.py @@ -168,11 +168,6 @@ def __init__( sequence_parallel=sequence_parallel, **factory_kwargs, ) - # need to assign tp attribute so that internlm know it is tensor parallel module - if gpc.get_world_size(ParallelMode.TENSOR) > 1: - for name in ["wo", "wq", "wk", "wv"]: - for param in getattr(self, name).parameters(): - setattr(param, IS_TENSOR_PARALLEL, True) def forward(self, x, seqlen=None, inference_params=None, **kwargs): if kwargs.get("indexes", None) is not None: @@ -594,16 +589,6 @@ def __init__( dtype=dtype, ) - for _, param in self.feed_forward.named_parameters(): - if gpc.get_world_size(ParallelMode.TENSOR) > 1: - setattr(param, IS_TENSOR_PARALLEL, True) - for param in self.attention_norm.parameters(): - if gpc.config.parallel.sequence_parallel is True: - setattr(param, IS_SEQUENCE_PARALLEL, True) - for param in self.ffn_norm.parameters(): - if gpc.config.parallel.sequence_parallel is True: - setattr(param, IS_SEQUENCE_PARALLEL, True) - self.dropout2 = nn.Dropout(drop_rate) self.use_swiglu = use_swiglu self.use_scaled_init = use_scaled_init @@ -857,9 +842,8 @@ def __init__( normal_(std=embedding_init_std)(param) else: uniform_(std=embedding_init_std)(param) - if gpc.get_world_size(ParallelMode.TENSOR) > 1: - setattr(param, IS_TENSOR_PARALLEL, True) self.embed_grad_scale = embed_grad_scale + self.layers = nn.ModuleList( [ PackedFlashLlamaLayer1D( @@ -901,9 +885,6 @@ def __init__( self.norm = RMSNorm(hidden_size, eps=layer_norm_epsilon) else: self.norm = nn.LayerNorm(hidden_size, eps=layer_norm_epsilon) - for param in self.norm.parameters(): - if gpc.config.parallel.sequence_parallel is True: - setattr(param, IS_SEQUENCE_PARALLEL, True) self.output = head_cls( in_features=hidden_size, @@ -920,8 +901,6 @@ def __init__( normal_(std=out_head_init_std)(param) else: uniform_(std=out_head_init_std)(param) - if gpc.get_world_size(ParallelMode.TENSOR) > 1: - setattr(param, IS_TENSOR_PARALLEL, True) self.parallel_output = parallel_output diff 
--git a/internlm/train/__init__.py b/internlm/train/__init__.py index 90bb5d86..e4f049d7 100644 --- a/internlm/train/__init__.py +++ b/internlm/train/__init__.py @@ -1,4 +1,5 @@ from .training_internlm import ( + get_scheduler_hooks, get_train_data_loader, get_validation_data_loader, initialize_llm_profile, @@ -7,7 +8,6 @@ load_new_batch, record_current_batch_training_metrics, wrap_FSDP_model, - get_scheduler_hooks, ) __all__ = [ diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 9c0474e8..6120974c 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -69,7 +69,12 @@ from internlm.solver.optimizer import FSDPadaptOptimizer, HybridZeroOptimizer from internlm.solver.optimizer.utils import ParamBcastSyncHandler from internlm.train.utils import create_param_groups -from internlm.utils.common import DummyProfile, SchedulerHook, get_current_device +from internlm.utils.common import ( + DummyProfile, + SchedulerHook, + get_current_device, + launch_time, +) from internlm.utils.logger import get_logger from internlm.utils.megatron_timers import megatron_timer as timer from internlm.utils.parallel import ( @@ -509,30 +514,6 @@ def load_new_batch(train_dl: DataLoader, train_iter: Iterable, train_state: Trai return batch, train_iter -# def initialize_llm_profile(profiling: bool = False, start_time: str = None): -# """Initialize and return the profiler context manager instance.""" - -# if profiling and gpc.get_local_rank(ParallelMode.DATA) == 0 and gpc.get_local_rank(ParallelMode.TENSOR) == 0: -# llm_profile = torch.profiler.profile -# logger.info(f"Do profiling in rank {gpc.get_global_rank()}!") -# else: -# llm_profile = DummyProfile - -# return llm_profile( -# activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA], -# schedule=torch.profiler.schedule(skip_first=5, wait=1, warmup=1, active=1, repeat=1), -# on_trace_ready=torch.profiler.tensorboard_trace_handler( -# f"{gpc.config.JOB_NAME}/{start_time}/traces/rank{gpc.get_global_rank()}_" -# + f"dp{gpc.get_local_rank(ParallelMode.DATA)}_" -# + f"tp{gpc.get_local_rank(ParallelMode.TENSOR)}_" -# + f"pp{gpc.get_local_rank(ParallelMode.PIPELINE)}", -# ), -# with_stack=True, -# with_modules=True, -# profile_memory=True, -# ) - - def initialize_llm_profile(profiling: bool = False, start_time: str = None): """Initialize and return the profiler context manager instance.""" @@ -549,7 +530,7 @@ def initialize_llm_profile(profiling: bool = False, start_time: str = None): f"RUN/{gpc.config.JOB_NAME}/{start_time}/traces/rank{gpc.get_global_rank()}_" + f"dp{gpc.get_local_rank(ParallelMode.DATA)}_" + f"wp{gpc.get_local_rank(ParallelMode.WEIGHT)}_" - + f"sp{gpc.get_local_rank(ParallelMode.SEQUENCE)}", + + f"tp{gpc.get_local_rank(ParallelMode.TENSOR)}", ), with_stack=True, with_modules=True, diff --git a/internlm/utils/common.py b/internlm/utils/common.py index 9dc4efea..7ef57278 100644 --- a/internlm/utils/common.py +++ b/internlm/utils/common.py @@ -1,14 +1,15 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- -from abc import ABC, abstractmethod import bisect import inspect import os import random +from abc import ABC, abstractmethod from contextlib import contextmanager from datetime import datetime from typing import Union + import numpy as np import torch diff --git a/internlm/utils/gputest.py b/internlm/utils/gputest.py index becc9d85..39c7341e 100644 --- a/internlm/utils/gputest.py +++ b/internlm/utils/gputest.py @@ -273,11 +273,6 @@ def 
bench_gpu(use_flash_attn=True): ) -""" -Useful utility functions migrated from deepseped. -""" - - def warmup_process_group(): # Prevent OOM from nccl communication. if dist.is_initialized(): @@ -305,6 +300,10 @@ def warmup_process_group(): def cuda_memory_analyze(step=0, print_mm_suage=False): + """ + Useful utility functions migrated from deepseped. + """ + global n_caching_allocator_flushes torch.cuda.synchronize() diff --git a/train.py b/train.py index bd931890..e0e99aff 100644 --- a/train.py +++ b/train.py @@ -1,12 +1,10 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- -from pickle import FALSE import socket import time import traceback from functools import partial -from typing import List import torch import torch.distributed as dist @@ -17,8 +15,7 @@ from internlm.core.trainer import TrainState from internlm.initialize import initialize_distributed_env from internlm.model.loss import FlashGPTLMLoss -from internlm.model.metrics import AccPerplex, SchedulerMetricHook -from internlm.core.communication.isp import ISPCommunicatorSchedulerHook +from internlm.model.metrics import AccPerplex from internlm.monitor import initialize_monitor_manager, send_alert_message from internlm.monitor.monitor import monitor_manager as mm from internlm.train import ( @@ -37,7 +34,6 @@ get_megatron_flops_2, launch_time, parse_args, - SchedulerHook, ) from internlm.utils.evaluation import evaluate_on_val_dls from internlm.utils.gputest import empty_cache_and_diag From b0c6a20101908379baa7b3e10d5d188a908dc5ea Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Wed, 24 Jan 2024 14:53:45 +0800 Subject: [PATCH 130/153] feat(format): fix ci lint check error --- internlm/core/scheduler/no_pipeline_scheduler.py | 2 +- internlm/core/scheduler/pipeline_scheduler.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/internlm/core/scheduler/no_pipeline_scheduler.py b/internlm/core/scheduler/no_pipeline_scheduler.py index cb8ff780..0cd8c103 100644 --- a/internlm/core/scheduler/no_pipeline_scheduler.py +++ b/internlm/core/scheduler/no_pipeline_scheduler.py @@ -11,7 +11,7 @@ from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc from internlm.core.engine import Engine -from internlm.utils.common import conditional_context, SchedulerHook +from internlm.utils.common import SchedulerHook, conditional_context from internlm.utils.logger import get_logger from internlm.utils.timeout import llm_timeout diff --git a/internlm/core/scheduler/pipeline_scheduler.py b/internlm/core/scheduler/pipeline_scheduler.py index d29c54dc..03daca29 100644 --- a/internlm/core/scheduler/pipeline_scheduler.py +++ b/internlm/core/scheduler/pipeline_scheduler.py @@ -14,7 +14,12 @@ from internlm.core.context import global_context as gpc from internlm.core.engine import Engine from internlm.core.naive_amp import NaiveAMPModel -from internlm.utils.common import check_data_is_packed, get_current_device, move_to_device, SchedulerHook +from internlm.utils.common import ( + SchedulerHook, + check_data_is_packed, + get_current_device, + move_to_device, +) from internlm.utils.logger import get_logger from internlm.utils.timeout import llm_timeout From 571d83c0542fcda802bc90d888f0ed5ce7d504ed Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Wed, 24 Jan 2024 14:56:39 +0800 Subject: [PATCH 131/153] feat(format): fix ci lint check error --- train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train.py b/train.py index 
e0e99aff..720a88f9 100644 --- a/train.py +++ b/train.py @@ -19,6 +19,7 @@ from internlm.monitor import initialize_monitor_manager, send_alert_message from internlm.monitor.monitor import monitor_manager as mm from internlm.train import ( + get_scheduler_hooks, get_train_data_loader, get_validation_data_loader, initialize_llm_profile, @@ -26,7 +27,6 @@ initialize_optimizer, load_new_batch, record_current_batch_training_metrics, - get_scheduler_hooks, ) from internlm.utils.common import ( BatchSkipper, From 83517ca37e0d7fcc69478ee97617248ac642b164 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Wed, 24 Jan 2024 19:30:09 +0800 Subject: [PATCH 132/153] feat(evaluation.py): fix evaluation error when msp/fsp with pp --- internlm/model/modeling_moe.py | 4 ++-- internlm/solver/optimizer/hybrid_zero_optim.py | 8 -------- internlm/train/training_internlm.py | 2 +- internlm/train/utils.py | 15 ++++++++------- internlm/utils/evaluation.py | 10 ++++++---- 5 files changed, 17 insertions(+), 22 deletions(-) diff --git a/internlm/model/modeling_moe.py b/internlm/model/modeling_moe.py index 2db6f727..5b6e85bf 100644 --- a/internlm/model/modeling_moe.py +++ b/internlm/model/modeling_moe.py @@ -373,8 +373,8 @@ def __init__( checkpoint_layer_num = int(num_layers * checkpoint) self.tp_mode = "mtp" - if isinstance(gpc.config.parallel.tensor, dict): - self.tp_mode = gpc.config.parallel.tensor.get("mode", "mtp") + if isinstance(gpc.config.parallel["tensor"], dict): + self.tp_mode = gpc.config.parallel["tensor"].get("mode", "mtp") if is_reward: head_cls = RewardModelLinear diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index b5ec92b0..21e93af2 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -164,15 +164,7 @@ def __init__( # add the fp16 params to fp16_param_groups for bookkeeping self._fp16_param_groups[group_id] = group_params - # to find real zero mode. 
if zero is not used, set all param group as ParallelMode.ZERO1 - # if zero is used, expert dp group will use ParallelMode.EXPERT_DATA as the real zero mode - # zero_mode = ( - # ParallelMode.ZERO1 - # if gpc.get_world_size(ParallelMode.ZERO1) == 1 or param_group["dp_mode"] == ParallelMode.DATA - # else ParallelMode.EXPERT_DATA - # ) zero_mode = param_group["optimizer_mode"] - self._zero_local_rank.append(gpc.get_local_rank(zero_mode)) self._zero_world_size.append(gpc.get_world_size(zero_mode)) # TODO _broadcast_parallel_mode is not only used in broadcast, maybe can change its name diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 6120974c..c13abfc3 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -344,7 +344,7 @@ def get_scheduler_hooks(metric, zero_optim, isp_communicator) -> List[SchedulerH ), ) - if isp_communicator is not None: + if isp_communicator is not None and gpc.config.parallel["weight"].get("overlap", False): scheduler_hooks.append(ISPCommunicatorSchedulerHook(isp_communicator, zero_optim)) return scheduler_hooks diff --git a/internlm/train/utils.py b/internlm/train/utils.py index 4444b30d..2f57f11a 100644 --- a/internlm/train/utils.py +++ b/internlm/train/utils.py @@ -8,7 +8,7 @@ from internlm.utils.parallel import is_tensor_data_parallel_parameter -def split_params_into_different_groups_for_optimizer_with_new_partition_strategy( +def split_params_into_different_groups_for_optimizer( param_groups: Tuple[Dict], ) -> Tuple[Dict]: """Split parameters into different groups for optimizer @@ -24,8 +24,9 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy Tuple[Dict]: list of params groups for optimizer Output Example: >>> ( - >>> {'name': 'default','params': [tensor],'weight_decay' :xxx}, - >>> {'name': 'embed_head', 'params': [tensor],'weight_decay' :xxx}, + >>> {'name': 'default', 'params': [tensor], 'weight_decay' :xxx}, + >>> {'name': 'embed_head', 'params': [tensor], 'weight_decay' :xxx}, + >>> {'name': 'fp32', 'params': [tensor], 'weight_decay' :xxx}, >>> ) """ @@ -38,8 +39,9 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy # create new groups for IS_TENSOR_DATA_PARALLEL parameter group new_groups = {} - if gpc.config.parallel.tensor.mode == "isp": + if isinstance(gpc.config.parallel["tensor"], dict) and gpc.config.parallel["tensor"].get("mode", "mtp") == "isp": new_groups["embed_head"] = {"name": "embed_head", "params": [], "optimizer_mode": ParallelMode.DATA} + # create new groups for fp32 parameter group new_groups["fp32"] = {"name": "fp32", "params": [], "optimizer_mode": ParallelMode.ZERO1} if gpc.config.model.get("num_experts", 1) > 1: @@ -71,7 +73,7 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy pgroup["params"] = origin_params pgroup["optimizer_mode"] = ParallelMode.ZERO1 - # param groups may contain empty groups, such as embed_head + # param groups may contain empty groups, such as fp32 param_groups.extend(new_groups.values()) return tuple(param_groups) @@ -79,5 +81,4 @@ def split_params_into_different_groups_for_optimizer_with_new_partition_strategy def create_param_groups(model, weight_decay): parameters = {"params": list(model.parameters()), "name": "default", "weight_decay": weight_decay} - # return split_params_into_different_groups_for_optimizer(parameters) - return split_params_into_different_groups_for_optimizer_with_new_partition_strategy(parameters) + return 
split_params_into_different_groups_for_optimizer(parameters) diff --git a/internlm/utils/evaluation.py b/internlm/utils/evaluation.py index e52586b8..1c1515b4 100644 --- a/internlm/utils/evaluation.py +++ b/internlm/utils/evaluation.py @@ -50,10 +50,12 @@ def switch_evaluation_pipeline_scheduler(trainer, num_microbatches, tensor_shape def switch_sequence_parallel_mode(): prev_mode = gpc.config.parallel.sequence_parallel try: - if gpc.config.parallel["tensor"]["mode"] == "mtp": - gpc.config.parallel.sequence_parallel = False - else: + # when training x.shape is torch.Size([1024, 4096]), linear all gather in dim=0(sequence dim) + # but evaluation x.shape is torch.Size([1, 1024, 4096]), gather in dim=0 is error. + if gpc.config.parallel["tensor"]["mode"] == "isp": gpc.config.parallel.sequence_parallel = True + else: + gpc.config.parallel.sequence_parallel = False yield finally: gpc.config.parallel.sequence_parallel = prev_mode @@ -102,7 +104,7 @@ def evaluate_on_val_dls( total_val_bsz = len(batch[1]) assert total_val_bsz % data_cfg.micro_bsz == 0 num_microbatches = total_val_bsz // data_cfg.micro_bsz - if gpc.config.parallel["tensor"]["mode"] == "isp": + if gpc.config.parallel.sequence_parallel: sequence_world_size = gpc.get_world_size(ParallelMode.TENSOR) tensor_shape = torch.Size( [ From 0ec9b67aa629def980149d1a5a8ac9ff3d9231f1 Mon Sep 17 00:00:00 2001 From: JiaoPL Date: Thu, 25 Jan 2024 14:23:44 +0800 Subject: [PATCH 133/153] fix moe param groups --- .../solver/optimizer/hybrid_zero_optim.py | 36 ++++++++++++++----- internlm/solver/optimizer/utils.py | 18 ++-------- 2 files changed, 29 insertions(+), 25 deletions(-) diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index d7ffb1c0..44111cb9 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -673,17 +673,21 @@ def _compute_param_norm_stage( grads = [self.padding_grad.to(dtype)] params = [self.padding_tensor.to(dtype)] - if group_id == 0: + if self.optim.param_groups[group_id]["name"] in ("default", "fp32"): for param in params: if self.use_isp: setattr(param, IS_WEIGHT_ZERO_PARALLEL, True) else: setattr(param, IS_TENSOR_ZERO_PARALLEL, True) - elif group_id == 1: + elif self.optim.param_groups[group_id]["name"] == "embed_head": + # should be isp mode for param in params: setattr(param, IS_TENSOR_DATA_PARALLEL, True) + elif self._is_moe_group(self.optim.param_groups[group_id]): + for param in params: + setattr(param, IS_TENSOR_EXPERT_DATA_PARALLEL, True) else: - raise NotImplementedError("group_id > 1 is not yet implemented.") + raise NotImplementedError("unrecognized parameter group.") if self._clip_grad_norm > 0: total_param_norms = compute_param_norm( @@ -704,6 +708,8 @@ def _compute_param_norm_stage( delattr(param, IS_TENSOR_ZERO_PARALLEL) if hasattr(param, IS_WEIGHT_ZERO_PARALLEL): delattr(param, IS_WEIGHT_ZERO_PARALLEL) + if hasattr(param, IS_TENSOR_EXPERT_DATA_PARALLEL): + delattr(param, IS_TENSOR_EXPERT_DATA_PARALLEL) return total_param_norms @@ -718,17 +724,21 @@ def _compute_vocab_grad_norm_stage( grads = [self.padding_grad.to(dtype)] params = [self.padding_tensor.to(dtype)] - if group_id == 0: + if self.optim.param_groups[group_id]["name"] in ("default", "fp32"): for param in params: if self.use_isp: setattr(param, IS_WEIGHT_ZERO_PARALLEL, True) else: setattr(param, IS_TENSOR_ZERO_PARALLEL, True) - elif group_id == 1: + elif self.optim.param_groups[group_id]["name"] == "embed_head": + # should be isp mode for 
param in params: setattr(param, IS_TENSOR_DATA_PARALLEL, True) + elif self._is_moe_group(self.optim.param_groups[group_id]): + for param in params: + setattr(param, IS_TENSOR_EXPERT_DATA_PARALLEL, True) else: - raise NotImplementedError("group_id > 1 is not yet implemented.") + raise NotImplementedError("unrecognized parameter group.") vocab_grad_norm = None @@ -751,6 +761,8 @@ def _compute_vocab_grad_norm_stage( delattr(param, IS_TENSOR_ZERO_PARALLEL) if hasattr(param, IS_WEIGHT_ZERO_PARALLEL): delattr(param, IS_WEIGHT_ZERO_PARALLEL) + if hasattr(param, IS_TENSOR_EXPERT_DATA_PARALLEL): + delattr(param, IS_TENSOR_EXPERT_DATA_PARALLEL) return vocab_grad_norm @@ -767,17 +779,21 @@ def _count_zero_grads_stage( grads = [self.padding_grad.to(dtype)] params = [self.padding_tensor.to(dtype)] - if group_id == 0: + if self.optim.param_groups[group_id]["name"] in ("default", "fp32"): for param in params: if self.use_isp: setattr(param, IS_WEIGHT_ZERO_PARALLEL, True) else: setattr(param, IS_TENSOR_ZERO_PARALLEL, True) - elif group_id == 1: + elif self.optim.param_groups[group_id]["name"] == "embed_head": + # should be isp mode for param in params: setattr(param, IS_TENSOR_DATA_PARALLEL, True) + elif self._is_moe_group(self.optim.param_groups[group_id]): + for param in params: + setattr(param, IS_TENSOR_EXPERT_DATA_PARALLEL, True) else: - raise NotImplementedError("group_id > 1 is not yet implemented.") + raise NotImplementedError("unrecognized parameter group.") if self._clip_grad_norm > 0: total_zero_grad_count = compute_zero_grad_count( @@ -798,6 +814,8 @@ def _count_zero_grads_stage( delattr(param, IS_TENSOR_ZERO_PARALLEL) if hasattr(param, IS_WEIGHT_ZERO_PARALLEL): delattr(param, IS_WEIGHT_ZERO_PARALLEL) + if hasattr(param, IS_TENSOR_EXPERT_DATA_PARALLEL): + delattr(param, IS_TENSOR_EXPERT_DATA_PARALLEL) return total_zero_grad_count diff --git a/internlm/solver/optimizer/utils.py b/internlm/solver/optimizer/utils.py index d2769474..ff707a42 100644 --- a/internlm/solver/optimizer/utils.py +++ b/internlm/solver/optimizer/utils.py @@ -239,13 +239,6 @@ def reduce_grads(gradients, parameters, weight_parallel_mode, fine_grained=False if fine_grained: parallel_grads = {} - if is_tensor_data_parallel_parameter(parameters[0]) or is_tensor_zero_parallel_parameter(parameters[0]): - param_parallel_mode = ParallelMode.TENSOR - elif gpc.is_using_parallel_mode(weight_parallel_mode): - param_parallel_mode = weight_parallel_mode - else: - param_parallel_mode = ParallelMode.TENSOR - def append_grad(g, p): if fine_grained: param_name = p.param_name if hasattr(p, "param_name") else "unknown-padding" @@ -255,7 +248,7 @@ def append_grad(g, p): elif only_output: param_name = p.param_name if hasattr(p, "param_name") else "unknown-padding" if ( - gpc.config.model["vocab_size"] == g.shape[0] * gpc.get_world_size(param_parallel_mode) + gpc.config.model["vocab_size"] == g.shape[0] * gpc.get_world_size(ParallelMode.TENSOR) and gpc.config.model["hidden_size"] == g.shape[1] and "embedding" not in param_name.lower() ): @@ -451,13 +444,6 @@ def compute_vocab_grad_norm( norm_type = float(norm_type) vocab_size = gpc.config.model["vocab_size"] - if is_tensor_data_parallel_parameter(parameters[0]) or is_tensor_zero_parallel_parameter(parameters[0]): - param_parallel_mode = ParallelMode.TENSOR - elif gpc.is_using_parallel_mode(weight_parallel_mode): - param_parallel_mode = weight_parallel_mode - else: - param_parallel_mode = ParallelMode.TENSOR - param_grads = reduce_grads(gradients, parameters, weight_parallel_mode, 
only_output=True) vocab_grad_norm = torch.zeros((vocab_size,), dtype=torch.float32).to(get_current_device()) @@ -465,7 +451,7 @@ def compute_vocab_grad_norm( for grad in param_grads: # get grad norm of each vocab vocab_slice_size = grad.shape[0] - local_tp_rank = gpc.get_local_rank(param_parallel_mode) + local_tp_rank = gpc.get_local_rank(ParallelMode.TENSOR) for i in range(vocab_slice_size): cur_vocab_grad_norm = get_norm([grad[i, :]], norm_type, enable_cuda_kernels)[0] vocab_grad_norm[i + vocab_slice_size * local_tp_rank] += get_tensor_norm( From aa388b54d01edc9046b1e18f8b20b9debdecb72b Mon Sep 17 00:00:00 2001 From: yingtongxiong <974106207@qq.com> Date: Thu, 25 Jan 2024 14:32:48 +0800 Subject: [PATCH 134/153] modify the distributedAttention for different data pack mode --- internlm/core/context/parallel_context.py | 1 + internlm/model/multi_head_attention.py | 65 +++++++++++++---------- internlm/utils/evaluation.py | 7 ++- 3 files changed, 43 insertions(+), 30 deletions(-) diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py index fd53c4be..e1bdb601 100644 --- a/internlm/core/context/parallel_context.py +++ b/internlm/core/context/parallel_context.py @@ -157,6 +157,7 @@ def __init__(self): self.virtual_pipeline_parallel_size = None self.virtual_pipeline_parallel_rank = None self._expert_parallel_group_names = [] + self.evaluation = False @property def config(self): diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index 825e3f21..01a88034 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -80,48 +80,57 @@ def __init__( self, local_attention: Module, sequence_process_group: dist.ProcessGroup, - first_scatter_idx: int = 2, - first_gather_idx: int = 0, - second_scatter_idx: int = 0, - second_gather_idx: int = 1, ) -> None: super().__init__() self.local_attn = local_attention self.spg = sequence_process_group - self.first_scatter_idx = first_scatter_idx - self.first_gather_idx = first_gather_idx - self.second_scatter_idx = second_scatter_idx - self.second_gather_idx = second_gather_idx + self._scatter_gather_idx = {} + + # scatter_gather_idx contains the scatter and gather index for different data packed mode + # key is the data packed mode, which should be in ['qkv', 'kv', 'q', 'output'] + # value is the scatter and gather index in all2all + self._scatter_gather_idx['qkv'] = [2, 0] # qkv shape:[sequence, 3, head, head_dim] + self._scatter_gather_idx['kv'] = [2, 0] # kv shape: [sequence, 2, head, head_dim] + self._scatter_gather_idx['q'] = [1, 0] # q/k/v shape: [sequence, head, head_dim] + self._scatter_gather_idx['output'] = [0, 1] # output shape: [sequence, head, head_dim] + + + def forward(self, qkv: Tensor = None, kv: Tensor = None, q: Tensor = None, k: Tensor = None, v: Tensor = None, **kwargs: Any) -> Tensor: + if gpc.evaluation is True: + # when conducting evaluation, the scatter and gather index should add 1. 
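For intuition, the index shift that the evaluation branch computes below can be checked on concrete shapes; a minimal sketch (illustrative values only, independent of the classes in this patch):

    # training inputs carry no batch dim, evaluation inputs do, so every
    # all2all dim index moves right by one
    train_idx = {"qkv": [2, 0], "kv": [2, 0], "q": [1, 0], "output": [0, 1]}
    eval_idx = {key: [x + 1 for x in value] for key, value in train_idx.items()}
    assert eval_idx["qkv"] == [3, 1]  # [batch, seq, 3, head, head_dim]: scatter dim 3, gather dim 1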
+ eval_scatter_gather_idx = {key: [x + 1 for x in value] for key, value in self._scatter_gather_idx.items()} + self._forward(qkv=qkv, kv=kv, q=q, k=k, v=v, scatter_gather=eval_scatter_gather_idx, **kwargs) + else: + self._forward(qkv=qkv, kv=kv, q=q, k=k, v=v, scatter_gather=self._scatter_gather_idx, **kwargs) - def forward(self, qkv: Tensor, **kwargs: Any) -> Tensor: + def _forward(self, qkv: Tensor = None, kv: Tensor = None, q: Tensor = None, k: Tensor = None, v: Tensor = None, scatter_gather: dict = None, **kwargs: Any) -> Tensor: """forward Arguments: - query (Tensor): query input to the layer - key (Tensor): key input to the layer - value (Tensor): value input to the layer + qkv (Tensor): packed qkv input to the layer + kv (Tensor): packed kv input to the layer + q (Tensor): q input to the layer + k (Tensor): k input to the layer + v (Tensor): v input to the layer args: other args Returns: * output (Tensor): context output """ - # Evaluation - if qkv.ndim == 5: - # in shape: [batch, seq/tp_size, 3, head, head_dim] - qkv = _SeqAllToAll.apply(self.spg, qkv, self.first_scatter_idx + 1, self.first_gather_idx + 1) - # out shape : [batch, seq, head/tp_size, head_dim] - context_layer = self.local_attn(qkv, **kwargs) - # in shape: [batch, seq, head/tp_size, head_dim] - output = _SeqAllToAll.apply( - self.spg, context_layer, self.second_scatter_idx + 1, self.second_gather_idx + 1 - ) - else: # training - # in shape: [seq/tp_size, 3, head, head_dim] - qkv = _SeqAllToAll.apply(self.spg, qkv, self.first_scatter_idx, self.first_gather_idx) - # out shape : [seq, head/tp_size, head_dim] + + if qkv is not None: + qkv = _SeqAllToAll.apply(self.spg, qkv, scatter_gather['qkv'][0], scatter_gather['qkv'][1]) context_layer = self.local_attn(qkv, **kwargs) - # in shape: [seq, head/tp_size, head_dim] - output = _SeqAllToAll.apply(self.spg, context_layer, self.second_scatter_idx, self.second_gather_idx) + elif kv is not None: + q = _SeqAllToAll.apply(self.spg, q, scatter_gather['q'][0], scatter_gather['q'][1]) + kv = _SeqAllToAll.apply(self.spg, kv, scatter_gather['kv'][0], scatter_gather['kv'][1]) + context_layer = self.local_attn(q, kv, **kwargs) + else: + q = _SeqAllToAll.apply(self.spg, q, scatter_gather['q'][0], scatter_gather['q'][1]) + k = _SeqAllToAll.apply(self.spg, k, scatter_gather['q'][0], scatter_gather['q'][1]) + v = _SeqAllToAll.apply(self.spg, v, scatter_gather['q'][0], scatter_gather['q'][1]) + context_layer = self.local_attn(q, k, v, **kwargs) + output = _SeqAllToAll.apply(self.spg, context_layer, scatter_gather['output'][0], scatter_gather['output'][1]) # out e.g., [s/p::h] return output diff --git a/internlm/utils/evaluation.py b/internlm/utils/evaluation.py index 1c1515b4..1d840ac4 100644 --- a/internlm/utils/evaluation.py +++ b/internlm/utils/evaluation.py @@ -47,8 +47,9 @@ def switch_evaluation_pipeline_scheduler(trainer, num_microbatches, tensor_shape @contextmanager -def switch_sequence_parallel_mode(): +def switch_evaluation_mode(): prev_mode = gpc.config.parallel.sequence_parallel + prev_evaluation = gpc.evaluation try: # when training x.shape is torch.Size([1024, 4096]), linear all gather in dim=0(sequence dim) # but evaluation x.shape is torch.Size([1, 1024, 4096]), gather in dim=0 is error. 
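The scatter/gather indices above feed _SeqAllToAll. A forward-only sketch of that primitive, using a hypothetical helper (the real _SeqAllToAll is an autograd.Function with a matching backward pass):

    import torch
    import torch.distributed as dist

    def seq_all_to_all(x: torch.Tensor, group, scatter_idx: int, gather_idx: int) -> torch.Tensor:
        # shard x along scatter_idx, exchange shards across the group, then
        # stitch the received shards back together along gather_idx; assumes
        # both dims divide evenly by the group size
        world_size = dist.get_world_size(group)
        inputs = [t.contiguous() for t in torch.tensor_split(x, world_size, dim=scatter_idx)]
        outputs = [torch.empty_like(t) for t in inputs]
        dist.all_to_all(outputs, inputs, group=group)
        return torch.cat(outputs, dim=gather_idx)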
@@ -56,9 +57,11 @@ def switch_sequence_parallel_mode(): gpc.config.parallel.sequence_parallel = True else: gpc.config.parallel.sequence_parallel = False + gpc.evaluation = True yield finally: gpc.config.parallel.sequence_parallel = prev_mode + gpc.evaluation = prev_evaluation def evaluate_on_val_dls( @@ -70,7 +73,7 @@ def evaluate_on_val_dls( update_panel: bool = False, streaming: bool = False, ): - with switch_sequence_parallel_mode(): + with switch_evaluation_mode(): torch.cuda.empty_cache() trainer.eval() verbose = gpc.is_rank_for_log() From 34b94790b08ca8e1260a398366cf44bfbb891318 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Thu, 25 Jan 2024 14:48:54 +0800 Subject: [PATCH 135/153] feat(model/multi_head_attention.py): fix return output --- internlm/model/multi_head_attention.py | 46 ++++++++++++++++---------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index 01a88034..200d4a9f 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -85,25 +85,35 @@ def __init__( self.local_attn = local_attention self.spg = sequence_process_group self._scatter_gather_idx = {} - + # scatter_gather_idx contains the scatter and gather index for different data packed mode # key is the data packed mode, which should be in ['qkv', 'kv', 'q', 'output'] # value is the scatter and gather index in all2all - self._scatter_gather_idx['qkv'] = [2, 0] # qkv shape:[sequence, 3, head, head_dim] - self._scatter_gather_idx['kv'] = [2, 0] # kv shape: [sequence, 2, head, head_dim] - self._scatter_gather_idx['q'] = [1, 0] # q/k/v shape: [sequence, head, head_dim] - self._scatter_gather_idx['output'] = [0, 1] # output shape: [sequence, head, head_dim] - - - def forward(self, qkv: Tensor = None, kv: Tensor = None, q: Tensor = None, k: Tensor = None, v: Tensor = None, **kwargs: Any) -> Tensor: + self._scatter_gather_idx["qkv"] = [2, 0] # qkv shape:[sequence, 3, head, head_dim] + self._scatter_gather_idx["kv"] = [2, 0] # kv shape: [sequence, 2, head, head_dim] + self._scatter_gather_idx["q"] = [1, 0] # q/k/v shape: [sequence, head, head_dim] + self._scatter_gather_idx["output"] = [0, 1] # output shape: [sequence, head, head_dim] + + def forward( + self, qkv: Tensor = None, kv: Tensor = None, q: Tensor = None, k: Tensor = None, v: Tensor = None, **kwargs: Any + ) -> Tensor: if gpc.evaluation is True: # when conducting evaluation, the scatter and gather index should add 1. 
eval_scatter_gather_idx = {key: [x + 1 for x in value] for key, value in self._scatter_gather_idx.items()} - self._forward(qkv=qkv, kv=kv, q=q, k=k, v=v, scatter_gather=eval_scatter_gather_idx, **kwargs) + return self._forward(qkv=qkv, kv=kv, q=q, k=k, v=v, scatter_gather=eval_scatter_gather_idx, **kwargs) else: - self._forward(qkv=qkv, kv=kv, q=q, k=k, v=v, scatter_gather=self._scatter_gather_idx, **kwargs) + return self._forward(qkv=qkv, kv=kv, q=q, k=k, v=v, scatter_gather=self._scatter_gather_idx, **kwargs) - def _forward(self, qkv: Tensor = None, kv: Tensor = None, q: Tensor = None, k: Tensor = None, v: Tensor = None, scatter_gather: dict = None, **kwargs: Any) -> Tensor: + def _forward( + self, + qkv: Tensor = None, + kv: Tensor = None, + q: Tensor = None, + k: Tensor = None, + v: Tensor = None, + scatter_gather: dict = None, + **kwargs: Any, + ) -> Tensor: """forward Arguments: @@ -119,18 +129,18 @@ def _forward(self, qkv: Tensor = None, kv: Tensor = None, q: Tensor = None, k: T """ if qkv is not None: - qkv = _SeqAllToAll.apply(self.spg, qkv, scatter_gather['qkv'][0], scatter_gather['qkv'][1]) + qkv = _SeqAllToAll.apply(self.spg, qkv, scatter_gather["qkv"][0], scatter_gather["qkv"][1]) context_layer = self.local_attn(qkv, **kwargs) elif kv is not None: - q = _SeqAllToAll.apply(self.spg, q, scatter_gather['q'][0], scatter_gather['q'][1]) - kv = _SeqAllToAll.apply(self.spg, kv, scatter_gather['kv'][0], scatter_gather['kv'][1]) + q = _SeqAllToAll.apply(self.spg, q, scatter_gather["q"][0], scatter_gather["q"][1]) + kv = _SeqAllToAll.apply(self.spg, kv, scatter_gather["kv"][0], scatter_gather["kv"][1]) context_layer = self.local_attn(q, kv, **kwargs) else: - q = _SeqAllToAll.apply(self.spg, q, scatter_gather['q'][0], scatter_gather['q'][1]) - k = _SeqAllToAll.apply(self.spg, k, scatter_gather['q'][0], scatter_gather['q'][1]) - v = _SeqAllToAll.apply(self.spg, v, scatter_gather['q'][0], scatter_gather['q'][1]) + q = _SeqAllToAll.apply(self.spg, q, scatter_gather["q"][0], scatter_gather["q"][1]) + k = _SeqAllToAll.apply(self.spg, k, scatter_gather["q"][0], scatter_gather["q"][1]) + v = _SeqAllToAll.apply(self.spg, v, scatter_gather["q"][0], scatter_gather["q"][1]) context_layer = self.local_attn(q, k, v, **kwargs) - output = _SeqAllToAll.apply(self.spg, context_layer, scatter_gather['output'][0], scatter_gather['output'][1]) + output = _SeqAllToAll.apply(self.spg, context_layer, scatter_gather["output"][0], scatter_gather["output"][1]) # out e.g., [s/p::h] return output From 10309b8b8b358d6c77320d4e9fe603d41ea8e66b Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Thu, 25 Jan 2024 15:46:49 +0800 Subject: [PATCH 136/153] feat(utils/evaluation.py): rename gpc.evaluation to gpc.is_evaluating --- internlm/core/context/parallel_context.py | 2 +- internlm/utils/evaluation.py | 12 +++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py index e1bdb601..d597575c 100644 --- a/internlm/core/context/parallel_context.py +++ b/internlm/core/context/parallel_context.py @@ -157,7 +157,7 @@ def __init__(self): self.virtual_pipeline_parallel_size = None self.virtual_pipeline_parallel_rank = None self._expert_parallel_group_names = [] - self.evaluation = False + self.is_evaluating = False @property def config(self): diff --git a/internlm/utils/evaluation.py b/internlm/utils/evaluation.py index 1d840ac4..a10cf243 100644 --- a/internlm/utils/evaluation.py +++ 
b/internlm/utils/evaluation.py @@ -48,20 +48,22 @@ def switch_evaluation_pipeline_scheduler(trainer, num_microbatches, tensor_shape @contextmanager def switch_evaluation_mode(): - prev_mode = gpc.config.parallel.sequence_parallel - prev_evaluation = gpc.evaluation + prev_seq = gpc.config.parallel.sequence_parallel + prev_eval = gpc.is_evaluating try: + gpc.is_evaluating = True + # when training x.shape is torch.Size([1024, 4096]), linear all gather in dim=0(sequence dim) # but evaluation x.shape is torch.Size([1, 1024, 4096]), gather in dim=0 is error. if gpc.config.parallel["tensor"]["mode"] == "isp": gpc.config.parallel.sequence_parallel = True else: gpc.config.parallel.sequence_parallel = False - gpc.evaluation = True + yield finally: - gpc.config.parallel.sequence_parallel = prev_mode - gpc.evaluation = prev_evaluation + gpc.config.parallel.sequence_parallel = prev_seq + gpc.is_evaluating = prev_eval def evaluate_on_val_dls( From 4c8324a5b95a517d90505926289b91677c208273 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Thu, 25 Jan 2024 15:55:39 +0800 Subject: [PATCH 137/153] feat(multi_head_attention.py): rename gpc.evaluation to gpc.is_evaluating --- internlm/model/multi_head_attention.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py index 200d4a9f..2d3a5959 100644 --- a/internlm/model/multi_head_attention.py +++ b/internlm/model/multi_head_attention.py @@ -97,7 +97,7 @@ def __init__( def forward( self, qkv: Tensor = None, kv: Tensor = None, q: Tensor = None, k: Tensor = None, v: Tensor = None, **kwargs: Any ) -> Tensor: - if gpc.evaluation is True: + if gpc.is_evaluating is True: # when conducting evaluation, the scatter and gather index should add 1. eval_scatter_gather_idx = {key: [x + 1 for x in value] for key, value in self._scatter_gather_idx.items()} return self._forward(qkv=qkv, kv=kv, q=q, k=k, v=v, scatter_gather=eval_scatter_gather_idx, **kwargs) From f186a7548ae9e2919d0c620331dc67c336ef43cb Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Thu, 25 Jan 2024 18:32:46 +0800 Subject: [PATCH 138/153] feat(communication/isp.py): refactor isp communicator to adapt to different model structures --- internlm/core/communication/isp.py | 46 +++++++++++------------------ internlm/train/training_internlm.py | 3 -- 2 files changed, 17 insertions(+), 32 deletions(-) diff --git a/internlm/core/communication/isp.py b/internlm/core/communication/isp.py index d0bbe2ed..fd917b77 100644 --- a/internlm/core/communication/isp.py +++ b/internlm/core/communication/isp.py @@ -24,11 +24,9 @@ class ISPCommModelConfig: model config for isp communicator. """ - hidden_size: int = 0 - mlp_ratio: float = 0 dtype: torch.dtype = torch.half device: torch.device = torch.device("cuda") - modules: List[str] = None + module_shapes: Dict[str, torch.Size] = None class MemoryPool: @@ -41,11 +39,9 @@ def __init__( model_conf: ISPCommModelConfig, with_bias: bool = False, ) -> None: - self._hidden_size = model_conf.hidden_size - self._mlp_ratio = model_conf.mlp_ratio self._dtype = model_conf.dtype self._device = model_conf.device - self._module_shapes = self._init_module_shape(model_conf.modules) + self._module_shapes = model_conf.module_shapes # due to intern sequence parallel communication overlap, we need # **two** memory pools for current block weights and the next block weights. @@ -75,21 +71,6 @@ def __init__( # memory pool for constant zero tensors, allocated lazily. 
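As noted above, the weight pools come in pairs so that compute on block i can overlap the all-gather for block i + 1. A toy sketch of the slot alternation (illustrative names, not the class's real fields):

    def weight_pool_slots(block_idx: int) -> tuple:
        # block i computes out of one slot while the prefetch for block i + 1
        # fills the other; the two slots simply alternate
        return block_idx % 2, (block_idx + 1) % 2

    assert weight_pool_slots(3) == (1, 0)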
self._zero_const_pool = {} - def _init_module_shape(self, modules: List[str]) -> Dict[str, torch.Size]: - mlp_hidden_size = 256 * ((int(self._hidden_size * self._mlp_ratio) + 256 - 1) // 256) - - # TODO: the memory pool should be more generic. - # Currently, it only supports llama-class models with specific naming structure. - static_shapes = { - "Wqkv": torch.Size((3 * self._hidden_size, self._hidden_size)), - "out_proj": torch.Size((self._hidden_size, self._hidden_size)), - "w1": torch.Size((mlp_hidden_size, self._hidden_size)), - "w2": torch.Size((mlp_hidden_size, self._hidden_size)), - "w3": torch.Size((self._hidden_size, mlp_hidden_size)), - } - - return {name: static_shapes[name] for name in modules} - def allocate_constant_zero(self, size: tuple) -> torch.Tensor: if size not in self._zero_const_pool: self._zero_const_pool[size] = torch.zeros(*size, dtype=self._dtype, device=self._device).contiguous() @@ -180,9 +161,9 @@ def __init__( self.overlap = overlap self.enable_memory_pool = overlap and enable_memory_pool self.model_conf = model_conf - self.module_name = model_conf.modules.copy() self.is_forward = True self.reduce_scatter_handlers = {} + self._module_shapes = {} # real overlap state for each chunk. self._overlap_states: Dict[int, ISPOverlapState] = {} @@ -207,12 +188,6 @@ def __init__( # key: transformer block index; value: isp modules self._index_to_isp_module = None - # init memory pool if necessary. - if self.enable_memory_pool: - self.memory_pool = MemoryPool(model_conf, with_bias=True) - else: - self.memory_pool = None - # init overlap states if necessary. if self.overlap: # just want to share same for loop for modulelist and module. @@ -228,6 +203,13 @@ def __init__( self._register_sync_parameters_hook() # switch to chunk 0 at first. self.switch_current_model_chunk(0) + self.model_conf.module_shapes = self._module_shapes + + # init memory pool if necessary. 
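Pool construction now has to wait until _parse_model_structure (shown further below) has recorded module_shapes, since shapes are discovered from the sharded weights instead of being hard-coded. A worked example of that recovery, assuming hidden_size=4096 and a weight-parallel size of 4:

    import torch

    weight_parallel_size = 4                  # assumed wp world size
    local_weight = torch.empty(1024, 4096)    # e.g. an out_proj shard with hidden_size=4096
    origin_shape = torch.Size([local_weight.shape[0] * weight_parallel_size, *local_weight.shape[1:]])
    assert origin_shape == torch.Size((4096, 4096))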
+ if self.enable_memory_pool: + self.memory_pool = MemoryPool(self.model_conf, with_bias=True) + else: + self.memory_pool = None def _parse_model_structure(self, cid: int, model: nn.Module) -> None: self._overlap_states[cid] = ISPOverlapState() @@ -246,10 +228,16 @@ def _parse_model_structure(self, cid: int, model: nn.Module) -> None: self._overlap_states[cid].index_to_isp_module[idx] = [] for sub_name, sub in block.named_children(): for name, child in sub.named_children(): - if name == "out_proj": + if name in ["out_proj", "wo"]: self._overlap_states[cid].isp_outs.append(child) self._overlap_states[cid].module_to_index[child] = idx if isinstance(child, ISPLinear): + if name not in self._module_shapes: + origin_shape = tuple( + [child.weight.shape[0] * gpc.weight_parallel_size] + + list(child.weight.shape[1:]) + ) + self._module_shapes[name] = torch.Size(origin_shape) self._overlap_states[cid].module_to_index[child] = idx self._overlap_states[cid].isp_modules.append(child) self._overlap_states[cid].index_to_isp_module[idx].append(child) diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index c13abfc3..b43cfcb3 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -222,11 +222,8 @@ def initialize_model(pre_process_func: Optional[Callable] = None, post_process_f isp_communicator = ISPCommunicator( model, ISPCommModelConfig( - gpc.config.model.hidden_size, - gpc.config.model.mlp_ratio, gpc.config.model.dtype, get_current_device(), - ["Wqkv", "out_proj", "w1", "w2", "w3"], ), gpc.config.parallel.weight.overlap, gpc.config.model.checkpoint, From 3d7402d59ddc3eeb451fadb8cab751c87a953436 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Fri, 26 Jan 2024 11:56:47 +0800 Subject: [PATCH 139/153] fix(tests): fix ci test error --- internlm/train/__init__.py | 4 ++++ tests/test_core/utils.py | 4 +++- tests/test_training/test_load_ckpt_loss.py | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/internlm/train/__init__.py b/internlm/train/__init__.py index e4f049d7..9a70e1e2 100644 --- a/internlm/train/__init__.py +++ b/internlm/train/__init__.py @@ -7,6 +7,8 @@ initialize_optimizer, load_new_batch, record_current_batch_training_metrics, + set_fp32_attr_for_model, + set_parallel_attr_for_param_groups, wrap_FSDP_model, ) @@ -20,4 +22,6 @@ "record_current_batch_training_metrics", "wrap_FSDP_model", "get_scheduler_hooks", + "set_parallel_attr_for_param_groups", + "set_fp32_attr_for_model", ] diff --git a/tests/test_core/utils.py b/tests/test_core/utils.py index 3d25667f..f7d562e2 100644 --- a/tests/test_core/utils.py +++ b/tests/test_core/utils.py @@ -13,7 +13,7 @@ from internlm.core.scheduler import InterleavedPipelineScheduler, NonPipelineScheduler, PipelineScheduler from internlm.model.metrics import SchedulerMetricHook from internlm.solver.pipeline_utils import partition_uniform -from internlm.train import initialize_optimizer +from internlm.train import initialize_optimizer, set_parallel_attr_for_param_groups class MlpModel(nn.Module): @@ -67,6 +67,8 @@ def init_model_and_optim( pp_model = _build_generic_model_1d(num_layers=num_layers, num_chunks=num_chunks, embedding=embedding) pp_model = pp_model.to(dtype) + set_parallel_attr_for_param_groups(pp_model) + # pp scheduler scheduler_hooks = [ SchedulerMetricHook(skip=True), diff --git a/tests/test_training/test_load_ckpt_loss.py b/tests/test_training/test_load_ckpt_loss.py index a5156870..01a11299 100644 --- 
a/tests/test_training/test_load_ckpt_loss.py +++ b/tests/test_training/test_load_ckpt_loss.py @@ -12,7 +12,7 @@ from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc from internlm.core.context.parallel_context import Config -from internlm.core.scheduler import SchedulerMetricHook +from internlm.model.metrics import SchedulerMetricHook from internlm.core.trainer import TrainState from internlm.initialize.launch import args_sanity_check from internlm.model.loss import FlashGPTLMLoss From 8170641936ecb791179f33c34d2a5a50b1cfc71a Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Fri, 26 Jan 2024 15:28:05 +0800 Subject: [PATCH 140/153] fix(tests): fix ci pipeline test error --- tests/test_core/test_pipeline.py | 2 +- tests/test_core/utils.py | 4 +--- tests/test_utils/test_model_checkpoint.py | 2 ++ 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_core/test_pipeline.py b/tests/test_core/test_pipeline.py index db7b3ddd..f5b4ebb3 100644 --- a/tests/test_core/test_pipeline.py +++ b/tests/test_core/test_pipeline.py @@ -118,7 +118,7 @@ def exam_pipeline_parallel(args): ) output_list.append(output) - engine.step() + # engine.step() # torch related if gpc.is_last_rank(ParallelMode.PIPELINE): diff --git a/tests/test_core/utils.py b/tests/test_core/utils.py index f7d562e2..3d25667f 100644 --- a/tests/test_core/utils.py +++ b/tests/test_core/utils.py @@ -13,7 +13,7 @@ from internlm.core.scheduler import InterleavedPipelineScheduler, NonPipelineScheduler, PipelineScheduler from internlm.model.metrics import SchedulerMetricHook from internlm.solver.pipeline_utils import partition_uniform -from internlm.train import initialize_optimizer, set_parallel_attr_for_param_groups +from internlm.train import initialize_optimizer class MlpModel(nn.Module): @@ -67,8 +67,6 @@ def init_model_and_optim( pp_model = _build_generic_model_1d(num_layers=num_layers, num_chunks=num_chunks, embedding=embedding) pp_model = pp_model.to(dtype) - set_parallel_attr_for_param_groups(pp_model) - # pp scheduler scheduler_hooks = [ SchedulerMetricHook(skip=True), diff --git a/tests/test_utils/test_model_checkpoint.py b/tests/test_utils/test_model_checkpoint.py index 2063591c..c649d251 100644 --- a/tests/test_utils/test_model_checkpoint.py +++ b/tests/test_utils/test_model_checkpoint.py @@ -16,6 +16,8 @@ LOCAL_SAVE_PATH, del_tmp_file, init_config, + init_dist_and_model, + reset_singletons, ) # (TOTAL_STEP, CKPT_EVERY, SNPASHOT_EVERY) From 85dd51fb73173369f3715b8b615cf1fc6b14a042 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Fri, 26 Jan 2024 17:55:09 +0800 Subject: [PATCH 141/153] feat(utils/common.py): remove func get_megatron_flops_2 --- internlm/train/training_internlm.py | 3 --- internlm/utils/common.py | 37 ----------------------------- internlm/utils/gputest.py | 17 +++++-------- train.py | 14 ----------- 4 files changed, 6 insertions(+), 65 deletions(-) diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 60f9f821..2ca66be5 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -538,7 +538,6 @@ def initialize_llm_profile(profiling: bool = False, start_time: str = None): @llm_timeout(func_name="record_current_batch_training_metrics") def record_current_batch_training_metrics( get_tflops_func, - get_tflops_func_2, logger, writer, success_update, @@ -623,7 +622,6 @@ def record_current_batch_training_metrics( tgs_SMA = 
round(tgs_statistic["SMA_tg_50"] / tgs_statistic["SMA_time_50"], 2) tflops = get_tflops_func((time.time() - start_time)) - tflops_2 = get_tflops_func_2((time.time() - start_time)) tgs_origin = round( num_tokens_in_batch @@ -635,7 +633,6 @@ def record_current_batch_training_metrics( infos = { "tflops": tflops, - "tflops2": tflops_2, "step": batch_count, "loss": loss.item() - moe_loss.item() if moe_loss is not None else loss.item(), "tgs (tokens/gpu/second)": tgs_origin, diff --git a/internlm/utils/common.py b/internlm/utils/common.py index 7ef57278..39e2d902 100644 --- a/internlm/utils/common.py +++ b/internlm/utils/common.py @@ -232,43 +232,6 @@ def get_megatron_flops( return tflops -def get_megatron_flops_2( - elapsed_time_per_iter, - checkpoint=False, - seq_len=2048, - hidden_size=12, - num_layers=32, - vocab_size=12, - global_batch_size=4, - global_world_size=1, - mlp_ratio=4, - use_swiglu=True, -): - """ - Calc flops based on the paper of Megatron https://deepakn94.github.io/assets/papers/megatron-sc21.pdf - """ - - checkpoint_activations_factor = 4 if checkpoint else 3 - flashattn_activations_factor = 4.5 if checkpoint else 3.5 - - if use_swiglu: - mlp_ratio = mlp_ratio * 3 / 2 - - flops_per_iteration = ( - checkpoint_activations_factor - * (8 + mlp_ratio * 4) - * global_batch_size - * seq_len - * hidden_size**2 - * num_layers - + 4 * global_batch_size * seq_len**2 * hidden_size * num_layers * flashattn_activations_factor - + 6 * global_batch_size * seq_len * hidden_size * vocab_size - ) - - tflops = flops_per_iteration / (elapsed_time_per_iter * global_world_size * (10**12)) - return tflops - - class DummyProfile: """ Dummy Profile. diff --git a/internlm/utils/gputest.py b/internlm/utils/gputest.py index 39c7341e..9224118a 100644 --- a/internlm/utils/gputest.py +++ b/internlm/utils/gputest.py @@ -43,17 +43,12 @@ def empty_cache_and_diag(batch_count, interval=50): if batch_count > 0: if gpc.is_rank_for_log(): logger.info("Empty Cache and Diagnosis GPU/NCCL/Timer ...") - # with torch.no_grad(): - # try: - # timer_diagnosis() - # bench_gpu() - # bench_net() - # except torch.distributed.DistBackendError as e: - # # import time - # # time.sleep(10) - # print(e, "rank = ", gpc.get_global_rank(), flush=True) - # torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle") - + with torch.no_grad(): + timer_diagnosis() + bench_gpu() + # FIXME: Runtime benchmark diagnosis can easily cause the training process + # to exit due to NCCL errors. + # bench_net() # do empty_cache after the bench torch.cuda.empty_cache() # do garbage collection diff --git a/train.py b/train.py index 720a88f9..9620268d 100644 --- a/train.py +++ b/train.py @@ -31,7 +31,6 @@ from internlm.utils.common import ( BatchSkipper, get_megatron_flops, - get_megatron_flops_2, launch_time, parse_args, ) @@ -87,18 +86,6 @@ def main(args): mlp_ratio=gpc.config.model["mlp_ratio"], ) - get_tflops_func_2 = partial( - get_megatron_flops_2, - checkpoint=gpc.config.model.checkpoint, - seq_len=gpc.config.SEQ_LEN, - hidden_size=gpc.config.model.hidden_size, - num_layers=gpc.config.model.num_layers, - vocab_size=gpc.config.model.vocab_size, - global_batch_size=gpc.config.data.micro_bsz * gpc.config.data.micro_num * gpc.get_world_size(ParallelMode.DATA), - global_world_size=gpc.get_world_size(ParallelMode.GLOBAL), - mlp_ratio=gpc.config.MLP_RATIO, - ) - # get and broadcast current time current_time = launch_time() objs = [current_time] @@ -265,7 +252,6 @@ def main(args): # calculate and record the training metrics, eg. 
loss, accuracy and so on. record_current_batch_training_metrics( get_tflops_func=get_tflops_func, - get_tflops_func_2=get_tflops_func_2, logger=logger, writer=writer, success_update=success_update, From 971c8eb7f19a4b7378d9b72c314732031212289a Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 29 Jan 2024 15:22:46 +0800 Subject: [PATCH 142/153] feat(communication/isp.py): isp communicator support 0.x activation ckpt --- internlm/core/communication/isp.py | 46 ++++++++++++++++------------- internlm/train/training_internlm.py | 2 +- 2 files changed, 27 insertions(+), 21 deletions(-) diff --git a/internlm/core/communication/isp.py b/internlm/core/communication/isp.py index fd917b77..47e37842 100644 --- a/internlm/core/communication/isp.py +++ b/internlm/core/communication/isp.py @@ -26,6 +26,7 @@ class ISPCommModelConfig: dtype: torch.dtype = torch.half device: torch.device = torch.device("cuda") + activation_checkpointing: float = 0.0 module_shapes: Dict[str, torch.Size] = None @@ -131,7 +132,8 @@ def __init__(self) -> None: self.num_blocks: int = 0 self.embedding: List[nn.Module] = [] self.head: List[nn.Module] = [] - self.last_block: nn.Moudle = None + self.ckpt_block_num: int = 0 + self.last_ckpt_block: nn.Module = None self.isp_outs: List[nn.Module] = [] self.isp_modules: List[nn.Module] = [] self.index_to_isp_module: Dict[int, nn.Module] = {} @@ -152,12 +154,10 @@ def __init__( model: Union[nn.Module, nn.ModuleList], model_conf: ISPCommModelConfig, overlap: bool = False, - activation_checkpointing: bool = False, enable_memory_pool: bool = False, process_group: dist.ProcessGroup = None, ) -> None: self.process_group = process_group - self.model_checkpoint = activation_checkpointing self.overlap = overlap self.enable_memory_pool = overlap and enable_memory_pool self.model_conf = model_conf @@ -172,7 +172,8 @@ def __init__( self._num_blocks = None self._head = None self._embedding = None - self._last_block = None + self._ckpt_block_num = None + self._last_ckpt_block = None self._isp_outs = None self._isp_modules = None # key: isp module; value: module global all-gather op handle @@ -222,7 +223,10 @@ def _parse_model_structure(self, cid: int, model: nn.Module) -> None: elif isinstance(children, Embedding1D): self._overlap_states[cid].embedding.append(children) elif isinstance(children, nn.ModuleList): - self._overlap_states[cid].last_block = children[-1] + self._overlap_states[cid].ckpt_block_num = int(self.model_conf.activation_checkpointing * len(children)) + self._overlap_states[cid].last_ckpt_block = children[ + max(0, self._overlap_states[cid].ckpt_block_num - 1) + ] for idx, block in enumerate(children): self._overlap_states[cid].index_to_isp_module[idx] = [] @@ -335,7 +339,7 @@ def _post_forward_hook_for_embedding(self, *args): # pylint: disable=W0613 def _pre_forward_hook_for_out_proj(self, module: nn.Module, *args): # pylint: disable=W0613 block_index = self._module_to_index[module] - if self.model_checkpoint and self.is_forward is False: + if (block_index - 1 < self._ckpt_block_num) and self.is_forward is False: if block_index - 1 >= 0: self._all_gather_block_weight(block_index - 1) else: @@ -350,13 +354,13 @@ def _pre_forward_hook_for_module(self, module: nn.Module, *args): # pylint: dis self._wait_handle(module) def _pre_forward_hook_for_block(self, *args): # pylint: disable=W0613 - for module in self._index_to_isp_module[self._num_blocks - 1]: + for module in self._index_to_isp_module[self._ckpt_block_num - 1]: self._all_gather_module_weight(module) - 
self._wait_handle(module) + # self._wait_handle(module) def _post_forward_hook_for_module(self, module: nn.Module, *args): # pylint: disable=W0613 self._clear_handle(module) - if not (self.model_checkpoint and self.is_forward is False): + if not ((self._module_to_index[module] < self._ckpt_block_num) and self.is_forward is False): self._clear_weight(module) def _post_backward_hook_for_head(self, *args): # pylint: disable=W0613 @@ -377,7 +381,8 @@ def _pre_backward_hook_for_module(self, module: nn.Module, *args): # pylint: di module_index = self._isp_modules.index(module) if module_index - 1 >= 0: next_module = self._isp_modules[module_index - 1] - self._all_gather_module_weight(next_module) + if self._module_to_index[next_module] >= self._ckpt_block_num: + self._all_gather_module_weight(next_module) def _post_backward_hook_for_module(self, module, *args): # pylint: disable=W0613 self._clear_handle(module) @@ -396,12 +401,12 @@ def _register_sync_parameters_hook(self) -> None: for embedding in self._embedding: embedding.register_forward_hook(self._post_forward_hook_for_embedding) - if self.model_checkpoint: - if gpc.is_last_rank(parallel_mode=ParallelMode.PIPELINE): - for head in self._head: - head.register_full_backward_pre_hook(self._pre_backward_hook_for_head) - else: - self._last_block.register_forward_pre_hook(self._pre_forward_hook_for_block) + if self._ckpt_block_num >= 1: + # if gpc.is_last_rank(parallel_mode=ParallelMode.PIPELINE): + # for head in self._head: + # head.register_full_backward_pre_hook(self._pre_backward_hook_for_head) + # else: + self._last_ckpt_block.register_forward_pre_hook(self._pre_forward_hook_for_block) for out_proj in self._isp_outs: out_proj.register_forward_pre_hook(self._pre_forward_hook_for_out_proj) @@ -414,7 +419,7 @@ def _register_sync_parameters_hook(self) -> None: # 1. register post_backward_hook @head module to prefetch for the last block's last module # 2. register pre_backward_hook @isp_module to wait handle for current module and to prefetch for next module # 3. 
register post_backward_hook @isp_module to release resource - if not self.model_checkpoint: + if self._ckpt_block_num < self._num_blocks: for head in self._head: head.register_full_backward_hook(self._post_backward_hook_for_head) @@ -443,7 +448,8 @@ def switch_current_model_chunk(self, chunk_id: int) -> None: self._bias_global_output = self._overlap_states[chunk_id].bias_global_output self._module_to_index = self._overlap_states[chunk_id].module_to_index self._index_to_isp_module = self._overlap_states[chunk_id].index_to_isp_module - self._last_block = self._overlap_states[chunk_id].last_block + self._ckpt_block_num = self._overlap_states[chunk_id].ckpt_block_num + self._last_ckpt_block = self._overlap_states[chunk_id].last_ckpt_block self._head = self._overlap_states[chunk_id].head self._embedding = self._overlap_states[chunk_id].embedding self._num_blocks = self._overlap_states[chunk_id].num_blocks @@ -514,7 +520,7 @@ def __init__(self, overlap_handler: ISPCommunicator, zero_optim) -> None: self._zero_optim = zero_optim def before_forward(self, scheduler, inputs) -> None: - if self._isp_communicator.model_checkpoint: + if self._isp_communicator._ckpt_block_num > 0: self._isp_communicator.is_forward = True # switch model chunk before forward chunk_id = 0 if gpc.virtual_pipeline_parallel_rank is None else gpc.virtual_pipeline_parallel_rank @@ -530,7 +536,7 @@ def after_criterion(self, scheduler, loss) -> None: pass def before_backward(self, scheduler, outputs, outputs_grad) -> None: - if self._isp_communicator.model_checkpoint: + if self._isp_communicator._ckpt_block_num > 0: self._isp_communicator.is_forward = False # switch model chunk before backward chunk_id = 0 if gpc.virtual_pipeline_parallel_rank is None else gpc.virtual_pipeline_parallel_rank diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 2ca66be5..7924da69 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -224,9 +224,9 @@ def initialize_model(pre_process_func: Optional[Callable] = None, post_process_f ISPCommModelConfig( gpc.config.model.dtype, get_current_device(), + gpc.config.model.checkpoint, ), gpc.config.parallel.weight.overlap, - gpc.config.model.checkpoint, gpc.config.parallel.weight.memory_pool, gpc.get_group(ParallelMode.WEIGHT), ) From 6853babe1e1bf8eaf93cd5a11c8d75ce92cb464c Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 29 Jan 2024 15:55:58 +0800 Subject: [PATCH 143/153] feat(train/training_internlm.py): move isp init to func initialize_isp_communicator --- internlm/train/__init__.py | 2 ++ internlm/train/training_internlm.py | 49 ++++++++++++++++++----------- train.py | 6 +++- 3 files changed, 38 insertions(+), 19 deletions(-) diff --git a/internlm/train/__init__.py b/internlm/train/__init__.py index 9a70e1e2..d44eaec9 100644 --- a/internlm/train/__init__.py +++ b/internlm/train/__init__.py @@ -2,6 +2,7 @@ get_scheduler_hooks, get_train_data_loader, get_validation_data_loader, + initialize_isp_communicator, initialize_llm_profile, initialize_model, initialize_optimizer, @@ -17,6 +18,7 @@ "get_validation_data_loader", "initialize_llm_profile", "initialize_model", + "initialize_isp_communicator", "initialize_optimizer", "load_new_batch", "record_current_batch_training_metrics", diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 7924da69..62a9d060 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -216,24 +216,7 @@ def 
initialize_model(pre_process_func: Optional[Callable] = None, post_process_f # if fsdp enabled, wrap the model model = wrap_FSDP_model(model) - if gpc.config.parallel["tensor"].get("mode", "mtp") != "isp": - isp_communicator = None - else: - isp_communicator = ISPCommunicator( - model, - ISPCommModelConfig( - gpc.config.model.dtype, - get_current_device(), - gpc.config.model.checkpoint, - ), - gpc.config.parallel.weight.overlap, - gpc.config.parallel.weight.memory_pool, - gpc.get_group(ParallelMode.WEIGHT), - ) - # register communicator for isp linear. - ISPLinear.register_communicator(isp_communicator) - - return model, isp_communicator + return model def wrap_FSDP_model(model: Union[nn.Module, nn.ModuleList]): @@ -269,6 +252,36 @@ def wrap_FSDP_model(model: Union[nn.Module, nn.ModuleList]): return model +def initialize_isp_communicator(model: Union[nn.Module, nn.ModuleList]): + """ + Initialize communicator for isp tensor parallel mode. + + Args: + model (:class:`torch.nn.Module`): Your model instance to be trained or evaluated. + + Returns: + An isp communicator for managing comp/comm overlap and memory pool. + """ + if gpc.config.parallel["tensor"].get("mode", "mtp") != "isp": + isp_communicator = None + else: + isp_communicator = ISPCommunicator( + model, + ISPCommModelConfig( + gpc.config.model.dtype, + get_current_device(), + gpc.config.model.checkpoint, + ), + gpc.config.parallel.weight.overlap, + gpc.config.parallel.weight.memory_pool, + gpc.get_group(ParallelMode.WEIGHT), + ) + # register communicator for isp linear. + ISPLinear.register_communicator(isp_communicator) + + return isp_communicator + + @llm_timeout(func_name="initialize_optimizer") def initialize_optimizer(model: Union[nn.Module, nn.ModuleList], isp_communicator: ISPCommunicator = None): """ diff --git a/train.py b/train.py index 9620268d..150f5463 100644 --- a/train.py +++ b/train.py @@ -22,6 +22,7 @@ get_scheduler_hooks, get_train_data_loader, get_validation_data_loader, + initialize_isp_communicator, initialize_llm_profile, initialize_model, initialize_optimizer, @@ -96,7 +97,10 @@ def main(args): uniscale_logger = initialize_llm_logger(start_time=current_time) # initialize model - model, isp_communicator = initialize_model() + model = initialize_model() + + # initialize isp communicator + isp_communicator = initialize_isp_communicator(model) with open(args.config, "r") as f: config_lines = f.readlines() From 8c45118c1c8d6217f759f61743e4475204819101 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 29 Jan 2024 16:07:28 +0800 Subject: [PATCH 144/153] feat(communication/isp.py): fix prefetch last ckpt block wait handle --- internlm/core/communication/isp.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/internlm/core/communication/isp.py b/internlm/core/communication/isp.py index 47e37842..bd744d10 100644 --- a/internlm/core/communication/isp.py +++ b/internlm/core/communication/isp.py @@ -9,7 +9,6 @@ from torch import distributed as dist from torch import nn -from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc from internlm.core.naive_amp import NaiveAMPModel from internlm.model.embedding import Embedding1D @@ -356,7 +355,6 @@ def _pre_forward_hook_for_module(self, module: nn.Module, *args): # pylint: dis def _pre_forward_hook_for_block(self, *args): # pylint: disable=W0613 for module in self._index_to_isp_module[self._ckpt_block_num - 1]: self._all_gather_module_weight(module) - # self._wait_handle(module) def 
_post_forward_hook_for_module(self, module: nn.Module, *args): # pylint: disable=W0613 self._clear_handle(module) @@ -402,10 +400,6 @@ def _register_sync_parameters_hook(self) -> None: embedding.register_forward_hook(self._post_forward_hook_for_embedding) if self._ckpt_block_num >= 1: - # if gpc.is_last_rank(parallel_mode=ParallelMode.PIPELINE): - # for head in self._head: - # head.register_full_backward_pre_hook(self._pre_backward_hook_for_head) - # else: self._last_ckpt_block.register_forward_pre_hook(self._pre_forward_hook_for_block) for out_proj in self._isp_outs: From 011edcf27e3a5a020d9295e914e0542a8047debf Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 29 Jan 2024 16:50:33 +0800 Subject: [PATCH 145/153] feat(utils/parallel.py): add func is_using_isp --- .../solver/optimizer/hybrid_zero_optim.py | 6 ++---- internlm/solver/optimizer/utils.py | 5 ++--- internlm/train/training_internlm.py | 20 ++++++++----------- internlm/train/utils.py | 4 ++-- internlm/utils/model_checkpoint.py | 17 ++++++++-------- internlm/utils/parallel.py | 18 ++++++++--------- 6 files changed, 32 insertions(+), 38 deletions(-) diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index 44111cb9..d603539b 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -38,6 +38,7 @@ from internlm.utils.common import get_current_device from internlm.utils.logger import get_logger from internlm.utils.megatron_timers import megatron_timer as timer +from internlm.utils.parallel import is_using_isp from internlm.utils.timeout import llm_timeout from .base_optimizer import BaseOptimizer @@ -85,10 +86,7 @@ def __init__( clip_grad_norm = zero_cfg.clip_grad_norm self._overlap_sync_grad = zero_cfg.overlap_sync_grad self._overlap_sync_param = zero_cfg.overlap_sync_param - self.use_isp = ( - isinstance(gpc.config.parallel["tensor"], dict) - and gpc.config.parallel["tensor"].get("mode", "mtp") == "isp" - ) + self.use_isp = is_using_isp() super().__init__(optim=optimizer) diff --git a/internlm/solver/optimizer/utils.py b/internlm/solver/optimizer/utils.py index ff707a42..ffa06477 100644 --- a/internlm/solver/optimizer/utils.py +++ b/internlm/solver/optimizer/utils.py @@ -22,6 +22,7 @@ is_tensor_data_parallel_parameter, is_tensor_expert_data_parallel_parameter, is_tensor_zero_parallel_parameter, + is_using_isp, is_weight_zero_parallel_parameter, ) @@ -312,9 +313,7 @@ def compute_norm( Total norm of the parameters, need total_norm**(1/norm) before using. """ - weight_parallel_mode = ( - ParallelMode.WEIGHT if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp" else ParallelMode.TENSOR - ) + weight_parallel_mode = ParallelMode.WEIGHT if is_using_isp() else ParallelMode.TENSOR enable_cuda_kernels = gradients[0].device.type == "cuda" # Norm parameters. 
norm_type = float(norm_type) diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 62a9d060..2fe61b7d 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -78,6 +78,7 @@ from internlm.utils.logger import get_logger from internlm.utils.megatron_timers import megatron_timer as timer from internlm.utils.parallel import ( + is_using_isp, is_replica_zero_parallel_parameter, is_tensor_data_parallel_parameter, is_tensor_expert_data_parallel_parameter, @@ -105,8 +106,6 @@ def set_fp32_attr_for_model(model: Union[nn.Module, nn.ModuleList]): def set_parallel_attr_for_param_groups(model: Union[nn.Module, nn.ModuleList]): - tp_mode = gpc.config.parallel["tensor"].get("mode", "mtp") - def _check_module(module): # layer_norm if isinstance(module, (RMSNorm, nn.LayerNorm)): @@ -120,9 +119,9 @@ def _check_module(module): # embedding and head if isinstance(module, (Embedding1D, ParallelGPT2Embeddings, BaseScaleColumnParallelLinear)): for param in module.parameters(): - if gpc.is_initialized(ParallelMode.TENSOR) and tp_mode == "isp": + if gpc.is_initialized(ParallelMode.TENSOR) and is_using_isp(): setattr(param, IS_TENSOR_DATA_PARALLEL, True) - elif gpc.is_initialized(ParallelMode.TENSOR) and tp_mode != "isp": + elif gpc.is_initialized(ParallelMode.TENSOR) and not is_using_isp(): setattr(param, IS_TENSOR_ZERO_PARALLEL, True) # for linear module @@ -131,9 +130,9 @@ def _check_module(module): if gpc.is_initialized(ParallelMode.EXPERT_DATA) and is_moe_param(param): # module should be MoE experts's linear setattr(param, IS_TENSOR_EXPERT_DATA_PARALLEL, True) - elif not is_moe_param(param) and gpc.is_initialized(ParallelMode.TENSOR) and tp_mode != "isp": + elif not is_moe_param(param) and gpc.is_initialized(ParallelMode.TENSOR) and not is_using_isp(): setattr(param, IS_TENSOR_ZERO_PARALLEL, True) - elif not is_moe_param(param) and gpc.is_initialized(ParallelMode.WEIGHT) and tp_mode == "isp": + elif not is_moe_param(param) and gpc.is_initialized(ParallelMode.WEIGHT) and is_using_isp(): setattr(param, IS_WEIGHT_ZERO_PARALLEL, True) if not isinstance(model, nn.ModuleList): @@ -208,9 +207,7 @@ def initialize_model(pre_process_func: Optional[Callable] = None, post_process_f # Change random state mode to ParallelMode.DATA after model is built, guaranteeing the random # state in the same dp group are all the same. - random_mode = ( - ParallelMode.WEIGHT_DATA if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp" else ParallelMode.DATA - ) + random_mode = ParallelMode.WEIGHT_DATA if is_using_isp() else ParallelMode.DATA set_mode(random_mode) # if fsdp enabled, wrap the model @@ -262,9 +259,8 @@ def initialize_isp_communicator(model: Union[nn.Module, nn.ModuleList]): Returns: An isp communicator for managing comp/comm overlap and memory pool. 
""" - if gpc.config.parallel["tensor"].get("mode", "mtp") != "isp": - isp_communicator = None - else: + isp_communicator = None + if is_using_isp(): isp_communicator = ISPCommunicator( model, ISPCommModelConfig( diff --git a/internlm/train/utils.py b/internlm/train/utils.py index 2f57f11a..4980255a 100644 --- a/internlm/train/utils.py +++ b/internlm/train/utils.py @@ -5,7 +5,7 @@ from internlm.core.context.parallel_context import ParallelMode from internlm.core.context.parallel_context import global_context as gpc from internlm.model.utils import is_moe_param -from internlm.utils.parallel import is_tensor_data_parallel_parameter +from internlm.utils.parallel import is_tensor_data_parallel_parameter, is_using_isp def split_params_into_different_groups_for_optimizer( @@ -39,7 +39,7 @@ def split_params_into_different_groups_for_optimizer( # create new groups for IS_TENSOR_DATA_PARALLEL parameter group new_groups = {} - if isinstance(gpc.config.parallel["tensor"], dict) and gpc.config.parallel["tensor"].get("mode", "mtp") == "isp": + if is_using_isp(): new_groups["embed_head"] = {"name": "embed_head", "params": [], "optimizer_mode": ParallelMode.DATA} # create new groups for fp32 parameter group new_groups["fp32"] = {"name": "fp32", "params": [], "optimizer_mode": ParallelMode.ZERO1} diff --git a/internlm/utils/model_checkpoint.py b/internlm/utils/model_checkpoint.py index 962f6415..9ace19ab 100644 --- a/internlm/utils/model_checkpoint.py +++ b/internlm/utils/model_checkpoint.py @@ -30,6 +30,7 @@ from internlm.utils.common import get_current_device from internlm.utils.logger import get_logger from internlm.utils.megatron_timers import megatron_timer as timer +from internlm.utils.parallel import is_using_isp from internlm.utils.storage_manager import ( get_fns, get_storage_manager, @@ -325,7 +326,7 @@ def save_model_checkpoint(folder, model): # even if pp is not considered, it will definitely not be written on the same machine. 
# for tensor parallel mode with isp - if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp": + if is_using_isp(): if wdp_rank == 0 or dp_rank == 0: fn = f"model_tp{tp_rank}_wp{wp_rank}_pp{pp_rank}.pt" fp = os.path.join(folder, fn) @@ -564,7 +565,7 @@ def load_model_checkpoint(folder, model): for fn in fns: if fn.startswith("model_t") and not fn.endswith(".md5"): segements = os.path.splitext(fn)[0].split("_") - if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp": + if is_using_isp(): max_pp = max(max_pp, int(segements[-1][2:])) max_wp = max(max_wp, int(segements[-2][2:])) max_tp = max(max_tp, int(segements[-3][2:])) @@ -590,7 +591,7 @@ def load_model_checkpoint(folder, model): dp_size == max_zo + 1 ), f"The weights are save for {max_zo+1} FSDP shards , while current has {dp_size} FSDP shards" - if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp": + if is_using_isp(): should_load_name = f"model_tp{tp_rank}_wp{wp_rank}_pp{pp_rank}.pt" elif gpc.config.parallel.zero1.fsdp: should_load_name = f"model_tp{tp_rank}_pp{pp_rank}_dp{dp_rank}.pt" @@ -702,7 +703,7 @@ def save_optimizer_checkpoint(optim, state_path): states = optim.state_dict() if isinstance(optim, HybridZeroOptimizer): - if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp": + if is_using_isp(): fp = f"optimizer_tp{tp_rank}_wp{wp_rank}_pp{pp_rank}_dp{dp_rank}.pt" llm_save(os.path.join(state_path, fp), states) else: @@ -752,7 +753,7 @@ def load_optimizer_checkpoint(folder, optim): max_tp, max_wp, max_pp, max_zero, max_dp = 0, 0, 0, 0, 0 for fn in fns: if fn.startswith("optimizer_") and not fn.endswith(".md5"): - if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp": + if is_using_isp(): _, tp, wp, pp, dp = os.path.splitext(fn)[0].split("_") max_dp = max(max_dp, int(dp[2:])) max_tp = max(max_tp, int(tp[2:])) @@ -770,12 +771,12 @@ def load_optimizer_checkpoint(folder, optim): pp_size = gpc.get_world_size(ParallelMode.PIPELINE) dp_size = gpc.get_world_size(ParallelMode.DATA) - if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp": + if is_using_isp(): assert dp_size == max_dp + 1, ( f"The optimizer states are save for {max_dp+1} data parallelism, " f"while current has {dp_size} data parallelism" ) - if gpc.config.parallel["tensor"].get("mode", "mtp") != "isp": + if not is_using_isp(): assert zero_size == max_zero + 1, ( f"The optimizer states are save for {max_zero+1} zero parallel, " f"while current has {zero_size} zero broadcast range." 
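The rank layout is recovered purely from the file stems, mirroring the split("_") parsing above; for example, with a made-up isp-layout filename:

    import os

    fn = "optimizer_tp0_wp1_pp2_dp3.pt"
    _, tp, wp, pp, dp = os.path.splitext(fn)[0].split("_")
    assert (int(tp[2:]), int(wp[2:]), int(pp[2:]), int(dp[2:])) == (0, 1, 2, 3)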
@@ -795,7 +796,7 @@ def load_optimizer_checkpoint(folder, optim): wp_rank = gpc.get_local_rank(ParallelMode.WEIGHT) pp_rank = gpc.get_local_rank(ParallelMode.PIPELINE) dp_rank = gpc.get_local_rank(ParallelMode.DATA) - if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp": + if is_using_isp(): fp = f"optimizer_tp{tp_rank}_wp{wp_rank}_pp{pp_rank}_dp{dp_rank}.pt" else: fp = f"optimizer_tp{tp_rank}_pp{pp_rank}_zo{zero_rank}.pt" diff --git a/internlm/utils/parallel.py b/internlm/utils/parallel.py index 5a491d33..76cd8d95 100644 --- a/internlm/utils/parallel.py +++ b/internlm/utils/parallel.py @@ -20,6 +20,10 @@ RMSNorm = try_import_RMSNorm() +def is_using_isp(): + return isinstance(gpc.config.parallel["tensor"], dict) and gpc.config.parallel["tensor"].get("mode", "mtp") == "isp" + + def is_replica_zero_parallel_parameter(p): return hasattr(p, IS_REPLICA_ZERO_PARALLEL) and getattr(p, IS_REPLICA_ZERO_PARALLEL) @@ -27,7 +31,7 @@ def is_replica_zero_parallel_parameter(p): def is_tensor_data_parallel_parameter(p): return ( gpc.is_initialized(ParallelMode.TENSOR) - and gpc.config.parallel["tensor"].get("mode", "mtp") == "isp" + and is_using_isp() and hasattr(p, IS_TENSOR_DATA_PARALLEL) and getattr(p, IS_TENSOR_DATA_PARALLEL) ) @@ -36,7 +40,7 @@ def is_tensor_data_parallel_parameter(p): def is_tensor_zero_parallel_parameter(p): return ( gpc.is_initialized(ParallelMode.TENSOR) - and gpc.config.parallel["tensor"].get("mode", "mtp") != "isp" + and not is_using_isp() and hasattr(p, IS_TENSOR_ZERO_PARALLEL) and getattr(p, IS_TENSOR_ZERO_PARALLEL) ) @@ -45,7 +49,7 @@ def is_tensor_zero_parallel_parameter(p): def is_weight_zero_parallel_parameter(p): return ( gpc.is_initialized(ParallelMode.WEIGHT) - and gpc.config.parallel["tensor"].get("mode", "mtp") == "isp" + and is_using_isp() and hasattr(p, IS_WEIGHT_ZERO_PARALLEL) and getattr(p, IS_WEIGHT_ZERO_PARALLEL) ) @@ -67,9 +71,7 @@ def sync_model_param(model): """ sync_moe_param = gpc.is_using_parallel_mode(ParallelMode.EXPERT_DATA) - sync_parallel_mode = ( - ParallelMode.WEIGHT_DATA if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp" else ParallelMode.DATA - ) + sync_parallel_mode = ParallelMode.WEIGHT_DATA if is_using_isp() else ParallelMode.DATA for param in model.parameters(): if sync_moe_param and getattr(param, "is_expert", False): ranks = gpc.get_ranks_in_group(ParallelMode.EXPERT_DATA) @@ -90,9 +92,7 @@ def sync_model_replica_param_group(model): model (:class:`torch.nn.Module`): A pyTorch model on whose parameters you check the consistency. 
""" - parallel_mode = ( - ParallelMode.WEIGHT if gpc.config.parallel["tensor"].get("mode", "mtp") == "isp" else ParallelMode.TENSOR - ) + parallel_mode = ParallelMode.WEIGHT if is_using_isp() else ParallelMode.TENSOR if gpc.is_using_parallel_mode(parallel_mode): for param in model.parameters(): if is_replica_zero_parallel_parameter(param): From f02523edd5f510ba6916c690639c10c7683a54e9 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Mon, 29 Jan 2024 17:05:44 +0800 Subject: [PATCH 146/153] fix(tests): fix ci tests error --- internlm/train/training_internlm.py | 2 +- tests/test_training/test_loss.py | 2 +- tests/test_training/test_swap_nb_loss_and_gradnorm.py | 2 +- tests/test_training/train_CI.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 2fe61b7d..4bcf2e9c 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -78,11 +78,11 @@ from internlm.utils.logger import get_logger from internlm.utils.megatron_timers import megatron_timer as timer from internlm.utils.parallel import ( - is_using_isp, is_replica_zero_parallel_parameter, is_tensor_data_parallel_parameter, is_tensor_expert_data_parallel_parameter, is_tensor_zero_parallel_parameter, + is_using_isp, is_weight_zero_parallel_parameter, set_model_params_layer_name, sync_model_param, diff --git a/tests/test_training/test_loss.py b/tests/test_training/test_loss.py index 7e694d57..a3b3b442 100644 --- a/tests/test_training/test_loss.py +++ b/tests/test_training/test_loss.py @@ -93,7 +93,7 @@ def train( current_time = objs[0] # initialize model - model, _ = initialize_model() + model = initialize_model() # initialize loss function criterion = FlashGPTLMLoss(parallel_output=True, label_smoothing=label_smoothing) diff --git a/tests/test_training/test_swap_nb_loss_and_gradnorm.py b/tests/test_training/test_swap_nb_loss_and_gradnorm.py index 4d8afa28..873d2ff6 100644 --- a/tests/test_training/test_swap_nb_loss_and_gradnorm.py +++ b/tests/test_training/test_swap_nb_loss_and_gradnorm.py @@ -278,7 +278,7 @@ def exam_loss(args): seed_all(1024) # initialize model - model, _ = initialize_model() + model = initialize_model() # initialize loss function criterion = FlashGPTLMLoss(parallel_output=True, label_smoothing=gpc.config.loss.label_smoothing) diff --git a/tests/test_training/train_CI.py b/tests/test_training/train_CI.py index a985b985..39c98781 100644 --- a/tests/test_training/train_CI.py +++ b/tests/test_training/train_CI.py @@ -124,7 +124,7 @@ def main(args): uniscale_logger = initialize_llm_logger(start_time=current_time) # initialize model - model, _ = initialize_model() + model = initialize_model() with open(args.config, "r") as f: config_lines = f.readlines() From 23ab67f0860d4567b469698902864226066f8f12 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Tue, 30 Jan 2024 11:07:02 +0800 Subject: [PATCH 147/153] feat(model/modeling_llama.py): update model llama --- internlm/model/modeling_llama.py | 143 +++++++++++++++++++++---------- 1 file changed, 97 insertions(+), 46 deletions(-) diff --git a/internlm/model/modeling_llama.py b/internlm/model/modeling_llama.py index 5f999c8f..00529796 100644 --- a/internlm/model/modeling_llama.py +++ b/internlm/model/modeling_llama.py @@ -9,6 +9,7 @@ from internlm.core.context import ParallelMode from internlm.core.context.parallel_context import global_context as gpc +from internlm.core.naive_amp import 
set_output_attr_to_module from internlm.initialize.initialize_tensor import ( normal_, scaled_init_method_normal, @@ -17,13 +18,18 @@ ) from internlm.model.embedding import Embedding1D, RotaryEmbedding from internlm.model.linear import ( - ColumnParallelLinearTorch, - FeedForward, + MegatronScaleColumnParallelLinear, RewardModelLinear, - RowParallelLinearTorch, ScaleColumnParallelLinear, + get_linear_cls, + get_mlp_cls, +) +from internlm.model.multi_head_attention import DistributedAttention +from internlm.model.utils import ( + gather_forward_split_backward, + split_forward_gather_backward, + try_import_RMSNorm, ) -from internlm.model.utils import gather_forward_split_backward, try_import_RMSNorm from internlm.solver.pipeline_utils import partition_uniform from internlm.utils.checkpoint import activation_checkpoint from internlm.utils.common import filter_kwargs @@ -59,24 +65,25 @@ class MHA(nn.Module): Args: embed_dim (int): The dimension of hidden state. num_heads (int): The number of attention heads. - num_kv_heads (int): The number of kv attention heads. process_group (torch.distributed.ProcessGroup): The group of the current device for `parallel_mode`. + sequence_process_group (torch.distributed.ProcessGroup): The process group for attention calculation. bias (boolean): Whether the bias is needed for linears. Will be used when initializing QKV matrix and output projection. True by default. dropout (float): The dropout rate for cross attention and self attention. 0.0 by default. softmax_scale (float): The temperature to use for the softmax attention. causal (boolean): Whether to apply causal attention mask. False by default. layer_idx (int): The index of current layer. None by default. - rope_base (int): The value of `base` for rotary position embeddings. 10000 by default. rotary_emb_dim (int): The dimension of Rotary Embedding. 0 by default. rotary_emb_scale_base (int): The scaling factor of Rotary Embedding. If scale_base > 0, this implements XPos(Sun et al., https://arxiv.org/abs/2212.10554). 0 by default. use_flash_attn (boolean): Whether to use flash attention or not. If False, vanilla attention module will be used. - True by default. + False by default. device (Optional[Union[str, torch.device]]): The device will be used. dtype (Optional[torch.dtype]): The type of data. - rot_embed_HF_impl: rotary embedding hf implementation. False by default. - + use_flash_attn (bool): Whether to use flash-attn. True by default. + rope_base (int): The value of `base` for rotary position embeddings. 10000 by default. + tp_mode (str): The string value of tensor parallel mode, should be in ["mtp", "msp", "fsp", "isp"], + "mtp" by default.
""" @@ -86,6 +93,7 @@ def __init__( num_heads: int, num_kv_heads: int, process_group: Optional[torch.distributed.ProcessGroup], + sequence_process_group: Optional[torch.distributed.ProcessGroup], bias: bool = True, dropout: float = 0.0, softmax_scale: float = None, @@ -98,6 +106,7 @@ def __init__( device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, rot_embed_HF_impl: Optional[bool] = False, + tp_mode: str = "mtp", ) -> None: factory_kwargs = {"device": device, "dtype": dtype} super().__init__() @@ -113,6 +122,7 @@ def __init__( self.rotary_emb_dim = rotary_emb_dim self.use_flash_attn = use_flash_attn self.dtype = dtype + self.tp_mode = tp_mode self.rot_embed_HF_impl = rot_embed_HF_impl sequence_parallel = gpc.config.parallel.get("sequence_parallel", False) @@ -122,8 +132,9 @@ def __init__( self.rotary_emb_dim, base=rope_base, scale_base=rotary_emb_scale_base, device=device ) + Wqkv_cls = get_linear_cls(self.tp_mode, "column") # notice here should change bias=True - self.wq = ColumnParallelLinearTorch( + self.wq = Wqkv_cls( embed_dim, embed_dim, process_group, @@ -131,7 +142,7 @@ def __init__( sequence_parallel=sequence_parallel, **factory_kwargs, ) - self.wk = ColumnParallelLinearTorch( + self.wk = Wqkv_cls( embed_dim, self.kv_dim, process_group, @@ -139,7 +150,7 @@ def __init__( sequence_parallel=sequence_parallel, **factory_kwargs, ) - self.wv = ColumnParallelLinearTorch( + self.wv = Wqkv_cls( embed_dim, self.kv_dim, process_group, @@ -159,8 +170,13 @@ def __init__( self.inner_cross_attn_softmax_scale = softmax_scale self.inner_cross_attn_dropout = dropout + self.attn = flash_attn_varlen_kvpacked_func + if self.tp_mode == "isp": + self.attn = DistributedAttention(self.attn, sequence_process_group=sequence_process_group) + # output projection always have the bias (for now) - self.wo = RowParallelLinearTorch( + out_proj_cls = get_linear_cls(self.tp_mode, "row") + self.wo = out_proj_cls( embed_dim, embed_dim, process_group, @@ -421,7 +437,7 @@ def _packed_forward(self, x, inference_params=None, **kwargs): if kv.dtype not in [torch.float16, torch.bfloat16]: kv = kv.to(torch.bfloat16) with torch.cuda.amp.autocast(dtype=torch.bfloat16): - context = flash_attn_varlen_kvpacked_func( + context = self.attn( q=q, kv=kv, cu_seqlens_q=kwargs["cu_seqlens"], @@ -433,7 +449,7 @@ def _packed_forward(self, x, inference_params=None, **kwargs): causal=self.inner_cross_attn_causal, ).to(self.dtype) else: - context = flash_attn_varlen_kvpacked_func( + context = self.attn( q=q, kv=kv, cu_seqlens_q=kwargs["cu_seqlens"], @@ -446,7 +462,6 @@ def _packed_forward(self, x, inference_params=None, **kwargs): ) else: raise RuntimeError("Not support this right now") - context = rearrange(context, "b h d -> b (h d)") # recover shape out = self.wo(context) return out @@ -459,7 +474,6 @@ class PackedFlashLlamaLayer1D(nn.Module): Args: hidden_size (int): The hidden size of model. 768 by default. num_attention_heads (int): The number of attention heads. 12 by default. - num_kv_attention_heads (int): The number of kv attention heads. 12 by default. mlp_ratio (int): The ratio of MLP layers. 4 by default. attn_drop_rate (float): The dropout rate of attention module. 0 by default. drop_rate (float): The dropout rate of the input hidden state. 0.0 by default. @@ -469,14 +483,7 @@ class PackedFlashLlamaLayer1D(nn.Module): layer_idx (int): The index of current layer. 0 by default. residual_in_fp32 (bool): Whether to use residual in fp32. False by default. 
device (Optional[Union[str, torch.device]]): The device will be used. - apply_post_layer_norm (bool): Whether use post layer norm. False by default. - fused_dropout_add_ln (bool): Whether use fused dropout add ln. True by default. - no_bias (bool): Whether remove bias. False by default. norm_type (str): Use RMS norm or layernorm."rmsnorm" by default. - adapt_hf (bool): Whether adapt hf. False by default. - dropout_selective_checkpoint (bool): Whether use dropout selective checkpoint. True by default. - use_scaled_init (bool): Whether use scaled init. True by default. - use_swiglu (bool): Whether use swiglu. True by default. use_flash_attn (bool): Whether use flash-attn. True by default. attn_wqkv_init_std (float): std used to init attn_wqkv weight. 0.02 by default, attn_other_init_std (float): std used to init attn_other weight. 0.02 by default, @@ -485,6 +492,8 @@ class PackedFlashLlamaLayer1D(nn.Module): ffn_other_init_std (float): std used to init ffn_other weight. 0.02 by default, init_type (str): Initialization type. Use uniform or normal. "normal" by default, rope_base (int): The value of `base` for rotary position embeddings. 10000 by default. + tp_mode (str): The string value of tensor parallel mode, should be in ["mtp", "msp", "fsp", "isp"], + "mtp" by default. """ def __init__( @@ -516,6 +525,7 @@ def __init__( ffn_other_init_std: float = 0.02, init_type: str = "normal", rope_base: int = 10000, + tp_mode: str = "mtp", ): super().__init__() self.checkpoint = checkpoint @@ -532,11 +542,15 @@ def __init__( self.ffn_other_init_std = ffn_other_init_std head_dim = hidden_size // num_attention_heads + self.tp_mode = tp_mode + parallel_mode = ParallelMode.WEIGHT if self.tp_mode == "isp" else ParallelMode.TENSOR + self.attention = MHA( embed_dim=hidden_size, num_heads=num_attention_heads, num_kv_heads=num_kv_attention_heads, - process_group=gpc.get_group(ParallelMode.TENSOR), + process_group=gpc.get_group(parallel_mode), + sequence_process_group=gpc.get_group(ParallelMode.TENSOR), dropout=attn_drop_rate, softmax_scale=1 / math.sqrt(head_dim), causal=True, @@ -549,6 +563,7 @@ def __init__( rot_embed_HF_impl=adapt_hf, bias=not no_bias, rope_base=rope_base, + tp_mode=self.tp_mode, ) self.dropout1 = nn.Dropout(drop_rate) @@ -564,11 +579,12 @@ def __init__( sequence_parallel = gpc.config.parallel.get("sequence_parallel", False) if use_swiglu: - self.feed_forward = FeedForward( + mlp_cls = get_mlp_cls(self.tp_mode) + self.feed_forward = mlp_cls( hidden_size, int(hidden_size * mlp_ratio), out_features=hidden_size, - process_group=gpc.get_group(ParallelMode.TENSOR), + process_group=gpc.get_group(parallel_mode), bias=False, device=device, dtype=dtype, @@ -579,7 +595,7 @@ def __init__( int(hidden_size * mlp_ratio), out_features=hidden_size, activation="gelu_approx", - process_group=gpc.get_group(ParallelMode.TENSOR), + process_group=gpc.get_group(parallel_mode), bias1=False, bias2=False, sequence_parallel=sequence_parallel, @@ -731,7 +747,6 @@ class PackedFlashLlama1D(nn.Module): num_layers (int): The number of layer. 12 by default. hidden_size (int): The size of hidden state. 768 by default. num_attention_heads (int): The number of attention head. 12 by default. - num_kv_attention_heads (int): The number of kv attention head. 12 by default. vocab_size (int): The size of vocabulary. 50304 by default. mlp_ratio (int): The ratio of MLP layers. 4 by default. attn_drop_rate (float): The dropout rate of attention module. 0.0 by default. 
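The DistributedAttention wrapper that now guards flash_attn_varlen_kvpacked_func in isp mode is defined in internlm/model/multi_head_attention.py, which this series does not show. The sketch below is a minimal illustration of the underlying idea, assuming an Ulysses-style all-to-all: each rank starts with a sequence shard and all heads, swaps to the full sequence with a head subset for the local attention call, then swaps back. DistributedAttentionSketch, its shape handling, and the divisibility assumptions are illustrative (cu_seqlens bookkeeping and uneven splits are omitted); it is not the actual implementation.

import torch
import torch.distributed as dist

class DistributedAttentionSketch(torch.nn.Module):
    """All-to-all wrapper around a local attention function (illustrative)."""

    def __init__(self, local_attn, sequence_process_group):
        super().__init__()
        self.local_attn = local_attn
        self.spg = sequence_process_group

    def _all_to_all(self, x, scatter_dim, gather_dim):
        # assumes both dims are divisible by the sequence parallel world size
        world_size = dist.get_world_size(self.spg)
        inputs = [t.contiguous() for t in x.chunk(world_size, dim=scatter_dim)]
        outputs = [torch.empty_like(t) for t in inputs]
        dist.all_to_all(outputs, inputs, group=self.spg)
        return torch.cat(outputs, dim=gather_dim)

    def forward(self, q, kv, **kwargs):
        # q: (tokens/sp, heads, d) -> (tokens, heads/sp, d); kv scatters at
        # dim 2 because of the packed (tokens, 2, heads_k, d) layout
        q = self._all_to_all(q, scatter_dim=1, gather_dim=0)
        kv = self._all_to_all(kv, scatter_dim=2, gather_dim=0)
        context = self.local_attn(q=q, kv=kv, **kwargs)
        # bring the output back to the sequence-sharded layout
        return self._all_to_all(context, scatter_dim=0, gather_dim=1)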
@@ -749,15 +764,8 @@ class PackedFlashLlama1D(nn.Module): parallel_output (bool): If it is necessary to collect the output of parallel computing. True by default. start_layer_idx (int): The index of start layer in the pipeline. 0 by default. device (Optional[Union[str, torch.device]]): The device will be used. None by default. - apply_post_layer_norm (bool): Whether use post layer norm. False by default. - no_bias (bool): Whether remove bias. False by default. residual_in_fp32 (bool): Whether to use residual in fp32. False by default. norm_type (str): Normalization type. Use RMSNorm or LayerNorm. "rmsnorm" by default. - adapt_hf (bool): Whether adapt hf. False by default. - is_reward (bool): Whether use is_reward. False by default. - dropout_selective_checkpoint (bool): Whether dropout selective checkpoint. True by default. - use_scaled_init (bool): Whether use scaled init. True by default. - use_swiglu (bool): Whether use swiglu. True by default. use_flash_attn (bool): Whether to use flash-attn. True by default. embedding_init_std (float): std used to init embedding weight. 0.02 by default, attn_wqkv_init_std (float): std used to init attn_wqkv weight. 0.02 by default, @@ -767,6 +775,7 @@ class PackedFlashLlama1D(nn.Module): ffn_other_init_std (float): std used to init ffn_other weight. 0.02 by default, out_head_init_std (float): std used to init output lmhead weight. 0.02 by default, init_type (str): Initialization type. Use uniform or normal. "normal" by default, + extra_pred_tokens (int): The number of extra output head for multi-token-prediction. 0 by default. rope_base (int): The value of `base` for rotary position embeddings. 10000 by default. """ @@ -808,6 +817,7 @@ def __init__( ffn_other_init_std: float = 0.02, out_head_init_std: float = 0.02, init_type: str = "normal", + extra_pred_tokens: int = 0, rope_base: int = 10000, ): super().__init__() @@ -819,10 +829,18 @@ def __init__( checkpoint_fraction = 0 checkpoint_layer_num = num_layers * checkpoint_fraction sequence_parallel = gpc.config.parallel.get("sequence_parallel", False) + self.tp_mode = "mtp" + if isinstance(gpc.config.parallel["tensor"], dict): + self.tp_mode = gpc.config.parallel["tensor"].get("mode", "mtp") + if is_reward: head_cls = RewardModelLinear else: - head_cls = ScaleColumnParallelLinear + head_cls = ( + ScaleColumnParallelLinear + if self.tp_mode in ["mtp", "fsp", "isp"] + else MegatronScaleColumnParallelLinear + ) if first: if embed_split_hidden: self.tok_embeddings = Embedding1D(num_embeddings=vocab_size, embedding_dim=hidden_size) @@ -874,6 +892,7 @@ def __init__( ffn_other_init_std=ffn_other_init_std, init_type=init_type, rope_base=rope_base, + tp_mode=self.tp_mode, ) for lid in range(num_layers) ] @@ -895,13 +914,36 @@ def __init__( dtype=dtype, weight_scale=embed_grad_scale, ) - + set_output_attr_to_module(self.output) for _, param in self.output.named_parameters(): if init_type == "normal": normal_(std=out_head_init_std)(param) else: uniform_(std=out_head_init_std)(param) + if extra_pred_tokens > 0: + self.extra_pred_tokens = extra_pred_tokens + assert not is_reward, "extra_pred_tokens > 0 means using multi token prediction, not implement for RLHF" + self.extra_outputs = nn.ModuleList( + [ + head_cls( + in_features=hidden_size, + out_features=vocab_size, + process_group=gpc.get_group(ParallelMode.TENSOR), + bias=False, + device=device, + dtype=dtype, + weight_scale=embed_grad_scale, + ) + for _ in range(self.extra_pred_tokens) + ] + ) + for _, param in self.extra_outputs.named_parameters(): + if 
init_type == "normal": + normal_(std=out_head_init_std)(param) + else: + uniform_(std=out_head_init_std)(param) + self.parallel_output = parallel_output def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=None, inference_params=None): @@ -925,6 +967,10 @@ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N assert len(indexes) == 1 # The indexes are used to indicate the actual position IDs of each token in the packed input. indexes = indexes[0] + # if the sequence parallel mode is 'isp', the indexes should also be split in sequence dimension. + if gpc.config.parallel.sequence_parallel and self.tp_mode == "isp": + indexes = split_forward_gather_backward(indexes, ParallelMode.TENSOR, dim=0) + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() if cu_seqlens is not None else None for _, block in enumerate(self.layers): @@ -939,10 +985,16 @@ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N if hasattr(self, "norm"): hidden_states = self.norm(hidden_states.float()) - - extra_hidden_states_list = None + if hasattr(self, "extra_pred_tokens") and self.extra_pred_tokens > 0: + extra_hidden_states_list = [self.extra_outputs[i](hidden_states) for i in range(self.extra_pred_tokens)] + else: + extra_hidden_states_list = None if hasattr(self, "output"): - hidden_states = self.output(hidden_states) + # Evaluation + if gpc.is_evaluating is True: + hidden_states = self.output(hidden_states, gather_dim=1) + else: # Training + hidden_states = self.output(hidden_states, gather_dim=0) if not self.parallel_output: hidden_states = gather_forward_split_backward(hidden_states, ParallelMode.TENSOR, dim=-1) @@ -977,7 +1029,7 @@ def _build_generic_model_1d(num_layers, num_chunks, device=torch.device("cuda"), logger.info(f"The layer sharding is {all_parts}.") models = [] - kwargs["checkpoint_fraction"] = 1.0 + kwargs["checkpoint_fraction"] = float(kwargs.get("checkpoint", False)) start_idx, end_idx = 0, 0 for start, end in parts: start_idx, end_idx = start, end @@ -1035,6 +1087,7 @@ def build_model_with_cfg( ffn_other_init_std: float = 0.02, out_head_init_std: float = 0.02, init_type: str = "normal", + extra_pred_tokens: int = 0, rope_base: int = 10000, ): """ @@ -1052,18 +1105,14 @@ def build_model_with_cfg( embed_grad_scale (float): Refer to GLM-130B, for training stability. 0.1 by default. parallel_output (bool): If it is necessary to collect the output of parallel computing. True by default. num_attention_heads (int): The number of attention head. 32 by default. - num_kv_attention_heads (int): The number of kv attention head. None by default. mlp_ratio (int): The ratio of MLP layers. 4.0 by default. residual_in_fp32 (bool): Whether to use residual in fp32. False by default. It cannot be used temporarily because this parameter requires inconsistent data types to be passed between pipelines, which requires significant modifications to internlm. norm_type (str): Normalization type. Use RMSNorm or LayerNorm. "rmsnorm" by default. - adapt_hf (bool): Whether adapt hf. False by default. drop_rate (float): The dropout rate of input hidden state. 0 by default. attn_drop_rate (float): The dropout rate of attention module. 0 by default. apply_post_layer_norm (bool): Whether to apply post layer norm. False by default. - no_bias (bool): Whether remove bias. False by default. - deepnorm (bool): Whether us deepnorm. False by default. layer_norm_epsilon (float): A value added to the denominator for numerical stability. 1e-5 by default. 
is_reward (bool): Whether to use reward model. False by default. dropout_selective_checkpoint (bool): It can only be enabled when checkpoint is disabled. True by default. @@ -1078,6 +1127,7 @@ def build_model_with_cfg( ffn_other_init_std (float): std used to init ffn_other weight. 0.02 by default, out_head_init_std (float): std used to init output lmhead weight. 0.02 by default, init_type (str): Initialization type. Use uniform or normal. "normal" by default, + extra_pred_tokens (int): The number of extra output head for multi-token-prediction. 0 by default. rope_base (int): The value of `base` for rotary position embeddings. 10000 by default. """ if deepnorm: @@ -1114,6 +1164,7 @@ def build_model_with_cfg( ffn_other_init_std=ffn_other_init_std, out_head_init_std=out_head_init_std, init_type=init_type, + extra_pred_tokens=extra_pred_tokens, rope_base=rope_base, ) From f11422e2d07c463f690212a85370914c7f72e436 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Tue, 30 Jan 2024 14:37:52 +0800 Subject: [PATCH 148/153] feat(model/utils.py): simplify code --- internlm/model/linear.py | 18 ++-- internlm/model/utils.py | 224 +++++++++------------------------------ 2 files changed, 60 insertions(+), 182 deletions(-) diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 9506f608..9d77bb34 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -12,9 +12,9 @@ from internlm.core.context import global_context as gpc from internlm.model.utils import ( Silu, - fused_dense_func_torch, + fused_dense_func, isp_fused_dense_func, - megatron_fused_dense_func_torch, + megatron_fused_dense_func, ) @@ -67,7 +67,7 @@ def forward(self, input, gather_dim=0): # pylint: disable=W0622 weight = self.weight * self.weight_scale + (1 - self.weight_scale) * self.weight.detach() else: weight = self.weight - return fused_dense_func_torch( + return fused_dense_func( input, weight, self.bias, @@ -90,7 +90,7 @@ def forward(self, input, gather_dim=0): # pylint: disable=W0622 weight = self.weight * self.weight_scale + (1 - self.weight_scale) * self.weight.detach() else: weight = self.weight - return megatron_fused_dense_func_torch( + return megatron_fused_dense_func( input, weight, self.bias, @@ -140,7 +140,7 @@ def forward(self, input): # pylint: disable=W0622 weight = self.weight * self.weight_scale + (1 - self.weight_scale) * self.weight.detach() else: weight = self.weight - return fused_dense_func_torch( + return fused_dense_func( input, weight, self.bias, @@ -154,7 +154,7 @@ def forward(self, x, gather_dim=0): # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism: # we do an all_gather of x before doing the matmul. # If not, then the input is already gathered. - return fused_dense_func_torch( + return fused_dense_func( x, self.weight, self.bias, @@ -169,7 +169,7 @@ def forward(self, x, gather_dim=0): # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism: # we do an all_gather of x before doing the matmul. # If not, then the input is already gathered. - return megatron_fused_dense_func_torch( + return megatron_fused_dense_func( x, self.weight, self.bias, @@ -185,7 +185,7 @@ def forward(self, x): We're doing Tensor Parallel with sequence parallelism: we do the matmul and then a reduce_scatter of the result. 
""" - out = fused_dense_func_torch(x, self.weight, self.bias) + out = fused_dense_func(x, self.weight, self.bias) reduce_fn = reduce_scatter if self.sequence_parallel else all_reduce return reduce_fn(out, self.process_group) @@ -196,7 +196,7 @@ def forward(self, x): We're doing Tensor Parallel with sequence parallelism: we do the matmul and then a reduce_scatter of the result. """ - out = megatron_fused_dense_func_torch(x, self.weight, self.bias) + out = megatron_fused_dense_func(x, self.weight, self.bias) reduce_fn = reduce_scatter if self.sequence_parallel else all_reduce return reduce_fn(out, self.process_group) diff --git a/internlm/model/utils.py b/internlm/model/utils.py index c6ae7002..c79a04fc 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -184,6 +184,7 @@ def forward( process_group=None, sequence_parallel=True, gather_dim=0, + is_using_cuda: bool = True, ): """ If process_group is not None and sequence_parallel=True, we're doing Tensor Parallel @@ -194,6 +195,7 @@ def forward( ctx.process_group = process_group ctx.sequence_parallel = sequence_parallel ctx.gather_dim = gather_dim + ctx.is_using_cuda = is_using_cuda if torch.is_autocast_enabled(): x = x.to(dtype=torch.get_autocast_gpu_dtype()) @@ -233,6 +235,8 @@ def backward(ctx, grad_output, *args): sequence_parallel = ctx.sequence_parallel gather_dim = ctx.gather_dim + backward_func = fused_dense_cuda.linear_bias_wgrad if ctx.is_using_cuda else linear_bias_wgrad_torch + if ctx.compute_weight_gradient: x, weight = ctx.saved_tensors if process_group is not None and sequence_parallel: @@ -264,7 +268,7 @@ def backward(ctx, grad_output, *args): assert ctx.compute_weight_gradient if process_group is not None and sequence_parallel: handle_x.wait() - grad_weight, grad_bias = fused_dense_cuda.linear_bias_wgrad( + grad_weight, grad_bias = backward_func( total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2], @@ -274,7 +278,7 @@ def backward(ctx, grad_output, *args): grad_bias = grad_output if ctx.needs_input_grad[2] else None if process_group is not None and ctx.needs_input_grad[0]: handle_grad_input.wait() - return grad_input, grad_weight, grad_bias, None, None, None, None + return grad_input, grad_weight, grad_bias, None, None, None, None, None class MegatronFusedDenseFunc(torch.autograd.Function): @@ -295,6 +299,7 @@ def forward( process_group=None, sequence_parallel=True, gather_dim=0, + is_using_cuda: bool = True, ): """ If process_group is not None and sequence_parallel=True, we're doing Tensor Parallel @@ -304,6 +309,7 @@ def forward( ctx.return_residual = return_residual ctx.process_group = process_group ctx.sequence_parallel = sequence_parallel + ctx.is_using_cuda = is_using_cuda if torch.is_autocast_enabled(): x = x.to(dtype=torch.get_autocast_gpu_dtype()) @@ -342,115 +348,8 @@ def backward(ctx, grad_output, *args): process_group = ctx.process_group sequence_parallel = ctx.sequence_parallel - if ctx.compute_weight_gradient: - total_x, weight = ctx.saved_tensors - else: - (weight,) = ctx.saved_tensors - total_x = None - batch_shape = grad_output.shape[:-1] - batch_dim = batch_shape.numel() - grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1]) - if ctx.needs_input_grad[0]: - if not ctx.return_residual: - grad_input = F.linear(grad_output, weight.t()) - else: - grad_input = torch.addmm( - grad_input.reshape(batch_dim, grad_input.shape[-1]), - grad_output, - weight, - ) - grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) - if process_group is not 
None: - reduce_fn = reduce_scatter_raw if sequence_parallel else all_reduce_raw - grad_input, handle_grad_input = reduce_fn(grad_input, process_group, async_op=True) - else: - grad_input = None - if ctx.needs_input_grad[1]: - assert ctx.compute_weight_gradient - grad_weight, grad_bias = fused_dense_cuda.linear_bias_wgrad( - total_x.reshape(batch_dim, total_x.shape[-1]), - grad_output, - ctx.needs_input_grad[2], - ) - else: - grad_weight = None - grad_bias = grad_output if ctx.needs_input_grad[2] else None - if process_group is not None and ctx.needs_input_grad[0]: - handle_grad_input.wait() - return grad_input, grad_weight, grad_bias, None, None, None, None - - -# adpated from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/ops/fused_dense.py -class FusedDenseFuncTorch(FusedDenseFunc): - """FusedDenseFunc in flash implementation for supporting torch.float32""" - - @staticmethod - @custom_bwd - def backward(ctx, grad_output, *args): - grad_output = grad_output.contiguous() - if ctx.return_residual: - (grad_input,) = args - grad_input = grad_input.contiguous() - process_group = ctx.process_group - sequence_parallel = ctx.sequence_parallel - gather_dim = ctx.gather_dim - if ctx.compute_weight_gradient: - x, weight = ctx.saved_tensors - if process_group is not None and sequence_parallel: - total_x, handle_x = all_gather_raw(x, process_group, async_op=True, gather_dim=gather_dim) - else: - total_x = x - else: - (weight,) = ctx.saved_tensors - total_x = None - batch_shape = grad_output.shape[:-1] - batch_dim = batch_shape.numel() - grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1]) - if ctx.needs_input_grad[0]: - if not ctx.return_residual: - grad_input = F.linear(grad_output, weight.t()) - else: - grad_input = torch.addmm( - grad_input.reshape(batch_dim, grad_input.shape[-1]), - grad_output, - weight, - ) - grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) - if process_group is not None: - reduce_fn = reduce_scatter_raw if sequence_parallel else all_reduce_raw - grad_input, handle_grad_input = reduce_fn(grad_input, process_group, async_op=True) - else: - grad_input = None - if ctx.needs_input_grad[1]: - assert ctx.compute_weight_gradient - if process_group is not None and sequence_parallel: - handle_x.wait() - # we remove the cuda independence, which is different from flash_attn. 
- grad_weight, grad_bias = linear_bias_wgrad_torch( - total_x.reshape(batch_dim, total_x.shape[-1]), - grad_output, - ctx.needs_input_grad[2], - ) - else: - grad_weight = None - grad_bias = grad_output if ctx.needs_input_grad[2] else None - if process_group is not None and ctx.needs_input_grad[0]: - handle_grad_input.wait() - return grad_input, grad_weight, grad_bias, None, None, None, None - - -class MegatronFusedDenseFuncTorch(FusedDenseFunc): - """FusedDenseFunc in megatron implementation for supporting torch.float32""" + backward_func = fused_dense_cuda.linear_bias_wgrad if ctx.is_using_cuda else linear_bias_wgrad_torch - @staticmethod - @custom_bwd - def backward(ctx, grad_output, *args): - grad_output = grad_output.contiguous() - if ctx.return_residual: - (grad_input,) = args - grad_input = grad_input.contiguous() - process_group = ctx.process_group - sequence_parallel = ctx.sequence_parallel if ctx.compute_weight_gradient: total_x, weight = ctx.saved_tensors else: @@ -476,8 +375,7 @@ def backward(ctx, grad_output, *args): grad_input = None if ctx.needs_input_grad[1]: assert ctx.compute_weight_gradient - # we remove the cuda independence, which is different from flash_attn. - grad_weight, grad_bias = linear_bias_wgrad_torch( + grad_weight, grad_bias = backward_func( total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2], @@ -487,7 +385,7 @@ def backward(ctx, grad_output, *args): grad_bias = grad_output if ctx.needs_input_grad[2] else None if process_group is not None and ctx.needs_input_grad[0]: handle_grad_input.wait() - return grad_input, grad_weight, grad_bias, None, None, None, None + return grad_input, grad_weight, grad_bias, None, None, None, None, None class ISPFusedDenseFunc(torch.autograd.Function): @@ -503,13 +401,13 @@ def forward( module, communicator, return_residual=False, - use_flash_attn: bool = True, + is_using_cuda: bool = True, ): ctx.compute_weight_gradient = weight.requires_grad ctx.return_residual = return_residual ctx.module = module ctx.communicator = communicator - ctx.use_flash_attn = use_flash_attn + ctx.is_using_cuda = is_using_cuda if torch.is_autocast_enabled(): x = x.to(dtype=torch.get_autocast_gpu_dtype()) @@ -547,7 +445,7 @@ def backward(ctx, grad_output, *args): module = ctx.module communicator = ctx.communicator - backward_func = fused_dense_cuda.linear_bias_wgrad if ctx.use_flash_attn else linear_bias_wgrad_torch + backward_func = fused_dense_cuda.linear_bias_wgrad if ctx.is_using_cuda else linear_bias_wgrad_torch grad_output = grad_output.contiguous() if ctx.return_residual: @@ -606,10 +504,10 @@ def backward(ctx, grad_output, *args): if grad_bias is not None and grad_bias_sync is not None: grad_bias_sync.wait() - return grad_input, grad_weight, grad_bias, None, None, None, None, None, None + return grad_input, grad_weight, grad_bias, None, None, None, None -def fused_dense_func_torch( +def fused_dense_func( x: Tensor, weight: Tensor, bias: Optional[Tensor] = None, @@ -621,29 +519,20 @@ def fused_dense_func_torch( dtype_eligible = x.dtype in [torch.float16, torch.bfloat16] or ( x.dtype == torch.float32 and torch.is_autocast_enabled() ) - if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible: - return FusedDenseFunc.apply( - x, - weight, - bias, - return_residual, - process_group, - sequence_parallel, - gather_dim, - ) - else: - return FusedDenseFuncTorch.apply( - x, - weight, - bias, - return_residual, - process_group, - sequence_parallel, - gather_dim, - ) - - -def 
megatron_fused_dense_func_torch( + is_using_cuda = x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible + return FusedDenseFunc.apply( + x, + weight, + bias, + return_residual, + process_group, + sequence_parallel, + gather_dim, + is_using_cuda, + ) + + +def megatron_fused_dense_func( x: Tensor, weight: Tensor, bias: Optional[Tensor] = None, @@ -655,26 +544,17 @@ def megatron_fused_dense_func_torch( dtype_eligible = x.dtype in [torch.float16, torch.bfloat16] or ( x.dtype == torch.float32 and torch.is_autocast_enabled() ) - if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible: - return MegatronFusedDenseFunc.apply( - x, - weight, - bias, - return_residual, - process_group, - sequence_parallel, - gather_dim, - ) - else: - return MegatronFusedDenseFuncTorch.apply( - x, - weight, - bias, - return_residual, - process_group, - sequence_parallel, - gather_dim, - ) + is_using_cuda = x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible + return MegatronFusedDenseFunc.apply( + x, + weight, + bias, + return_residual, + process_group, + sequence_parallel, + gather_dim, + is_using_cuda, + ) def isp_fused_dense_func( @@ -688,18 +568,16 @@ def isp_fused_dense_func( dtype_eligible = x.dtype in [torch.float16, torch.bfloat16] or ( x.dtype == torch.float32 and torch.is_autocast_enabled() ) - if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible: - return ISPFusedDenseFunc.apply(x, weight, bias, module, communicator, return_residual) - else: - return ISPFusedDenseFunc.apply( - x, - weight, - bias, - module, - communicator, - return_residual, - use_flash_attn=False, - ) + is_using_cuda = x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible + return ISPFusedDenseFunc.apply( + x, + weight, + bias, + module, + communicator, + return_residual, + is_using_cuda, + ) def try_import_RMSNorm(): From 8e1ee6fc28d74684dd93eb3e3302b9c5ea7cff2d Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Tue, 30 Jan 2024 16:45:07 +0800 Subject: [PATCH 149/153] feat(model/linear.py): update FeedForward class to internlm2 --- internlm/model/linear.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/internlm/model/linear.py b/internlm/model/linear.py index 9d77bb34..9ce91632 100644 --- a/internlm/model/linear.py +++ b/internlm/model/linear.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- -from typing import Optional +from typing import Callable, Optional import torch from flash_attn.ops.fused_dense import ColumnParallelLinear, RowParallelLinear @@ -215,6 +215,8 @@ class BaseFeedForward(nn.Module): device (Optional[Union[str, torch.device]]): The device will be used. dtype (Optional[torch.dtype]): The type of data. multiple_of (int): For efficient training. Reset the size of hidden feature. 256 by default. + column_cls (Optional[Callable]): The column parallel class for w1 and w3. None by default. + row_cls (Optional[Callable]): The row parallel class for w2. None by default. 
""" def __init__( @@ -227,13 +229,13 @@ def __init__( device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, multiple_of: int = 256, - colum_cls=None, - row_cls=None, + column_cls: Optional[Callable] = None, + row_cls: Optional[Callable] = None, ): super().__init__() hidden_features = multiple_of * ((hidden_features + multiple_of - 1) // multiple_of) - self.w1 = colum_cls( + self.w1 = column_cls( in_features, hidden_features, process_group, @@ -242,20 +244,20 @@ def __init__( device=device, dtype=dtype, ) - self.w2 = colum_cls( - in_features, + self.w2 = row_cls( hidden_features, + out_features, process_group, - bias, + bias=bias, sequence_parallel=gpc.config.parallel.sequence_parallel, device=device, dtype=dtype, ) - self.w3 = row_cls( + self.w3 = column_cls( + in_features, hidden_features, - out_features, process_group, - bias=bias, + bias, sequence_parallel=gpc.config.parallel.sequence_parallel, device=device, dtype=dtype, @@ -263,8 +265,8 @@ def __init__( def forward(self, x): w1_o = self.w1(x) - w2_o = self.w2(x) - out = self.w3(Silu(w1_o, w2_o)) + w3_o = self.w3(x) + out = self.w2(Silu(w1_o, w3_o)) return out From d7928a690a47e2b27c05a0a33fc83adbd91970f5 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Tue, 30 Jan 2024 17:24:13 +0800 Subject: [PATCH 150/153] fix(parallel_context.py): fix private repo ci tests error --- internlm/core/context/parallel_context.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py index d597575c..4141b011 100644 --- a/internlm/core/context/parallel_context.py +++ b/internlm/core/context/parallel_context.py @@ -498,8 +498,10 @@ def init_parallel_groups(self): # the user should not set the data parallel size manually # instead, it should be calculated based on other parallel config self.sequence_parallel_size = self.tensor_parallel_size - self.data_parallel_size = self.world_size // self.pipeline_parallel_size // self.sequence_parallel_size - self.weight_data_parallel_size = self.world_size // self.pipeline_parallel_size // self.weight_parallel_size + self.data_parallel_size = max(1, self.world_size // self.pipeline_parallel_size // self.sequence_parallel_size) + self.weight_data_parallel_size = max( + 1, self.world_size // self.pipeline_parallel_size // self.weight_parallel_size + ) if isinstance(parallel_config["tensor"], dict) and parallel_config["tensor"]["mode"] == "isp": if self.zero1_parallel_size == -1: self.zero1_parallel_size = self.weight_data_parallel_size From 1960dc0d151127abebcdb29e197b108f0d6bc500 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Tue, 30 Jan 2024 18:19:28 +0800 Subject: [PATCH 151/153] feat(parallel_context.py): set zero1 parallel size >= 1 --- internlm/core/context/parallel_context.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py index 4141b011..b1c7034d 100644 --- a/internlm/core/context/parallel_context.py +++ b/internlm/core/context/parallel_context.py @@ -505,6 +505,7 @@ def init_parallel_groups(self): if isinstance(parallel_config["tensor"], dict) and parallel_config["tensor"]["mode"] == "isp": if self.zero1_parallel_size == -1: self.zero1_parallel_size = self.weight_data_parallel_size + self.zero1_parallel_size = max(1, self.zero1_parallel_size) assert ( self.zero1_parallel_size <= self.weight_data_parallel_size ), f"zero1_size:{self.zero1_parallel_size} 
should be less than wdp_size:{self.weight_data_parallel_size}" @@ -515,6 +516,7 @@ def init_parallel_groups(self): else: if self.zero1_parallel_size == -1: self.zero1_parallel_size = self.data_parallel_size + self.zero1_parallel_size = max(1, self.zero1_parallel_size) assert ( self.zero1_parallel_size <= self.data_parallel_size ), f"zero1_size:{self.zero1_parallel_size} should be less than dp_size:{self.data_parallel_size}" From 62a665d6797cf026faf80383b454a14b62508d3f Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Wed, 31 Jan 2024 15:39:10 +0800 Subject: [PATCH 152/153] feat(tests): add e2e test case for isp and enable pytorch expandable_segments --- .github/workflows/e2e_test.yaml | 17 ++- configs/7B_isp_sft.py | 200 +++++++++++++++++++++++++++++++ configs/7B_sft.py | 2 +- internlm/initialize/launch.py | 3 + internlm/utils/common.py | 12 ++ tests/test_training/test_loss.py | 24 +++- train.py | 4 +- 7 files changed, 257 insertions(+), 5 deletions(-) create mode 100644 configs/7B_isp_sft.py diff --git a/.github/workflows/e2e_test.yaml b/.github/workflows/e2e_test.yaml index 5c7b4430..965905c7 100644 --- a/.github/workflows/e2e_test.yaml +++ b/.github/workflows/e2e_test.yaml @@ -23,4 +23,19 @@ jobs: - name: training_8GPU run: | source $evo_env - srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU" ./tests/test_training + srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU" ./tests/test_training/test_loss.py + + training_8GPU_ISP: + runs-on: [t_cluster] + timeout-minutes: 10 + steps: + - name: mask env + run: | + echo "::add-mask::${{env.WORKSPACE_PREFIX}}" + echo "::add-mask::$path_prefix" + - uses: actions/checkout@v3 + + - name: training_8GPU_ISP + run: | + source $evo_env + srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP" ./tests/test_training/test_loss.py diff --git a/configs/7B_isp_sft.py b/configs/7B_isp_sft.py new file mode 100644 index 00000000..9aac5557 --- /dev/null +++ b/configs/7B_isp_sft.py @@ -0,0 +1,200 @@ +JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 2048 +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. 
the ’ckpt_type‘ means the type of checkpoint to be loaded, support: "internlm", "llama", "hf_llama". + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=True, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = None # "/path/to/dataset" +VALID_FOLDER = None # "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=50000, + skip_batches="", + # rampup_batch_size (str): A string with three space-separated integers representing the + # starting batch size, the increment, and the number of steps between + # each increment. For example, "192 24 8" means that the batch size (micro_num) + # starts at 192 and increases by 24 every 8 steps. Defaults to None. + # (IMPORTANT): The interval step size is 'micro_bsz'. 
+ rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + train_folder=TRAIN_FOLDER, + valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=200, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimizer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +use_fp32_norm = False +model = dict( + checkpoint=False, # The proportion of layers for activation checkpointing, the optional values are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'], + defaults to 'mtp', means the pure megatron tensor parallel without sequence parallel. + msp: megatron tensor parallel with sequence parallel, sequence parallel size = tensor parallel size. + fsp: tensor parallel by flash-attn with sequence parallel, sequence parallel size = tensor parallel size. + isp: customized intern sequence parallel without tensor parallel, can be used with weight parallel. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False.
+weight parallel (dict): + 1. size: int, the size of weight parallel. + 2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False. + 3. memory_pool: bool, enable/disable memory pool, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1), + tensor=dict(size=2, mode="isp"), + pipeline=dict(size=1, interleaved_overlap=True), + weight=dict(size=4, overlap=True, memory_pool=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + alert_file_path=f"llm_alter/{JOB_NAME}_alert.log", + ), + tensorboard=dict( + queue_max_length=10, + ), +) + +# metric_dtype can be "fp32" or other string +# only when set to "fp32" will use fp32 to calc in metrics +# metric_dtype = "fp32" diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 615cd6c3..577fc93c 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -173,7 +173,7 @@ 3. memory_pool: bool, enable/disable memory pool, defaults to False. """ parallel = dict( - zero1=dict(size=8, fsdp=False), + zero1=dict(size=8), tensor=dict(size=1, mode="mtp"), pipeline=dict(size=1, interleaved_overlap=True), weight=dict(size=1, overlap=True, memory_pool=True), diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py index 61e2eeb4..443a53d6 100644 --- a/internlm/initialize/launch.py +++ b/internlm/initialize/launch.py @@ -325,6 +325,9 @@ def args_sanity_check(): gpc.config.parallel["tensor"]["mode"] = "mtp" if gpc.config.parallel["tensor"]["mode"] == "isp": assert not gpc.config.parallel.zero1.fsdp, "FSDP does not support isp" + assert ( + torch.__version__ >= "2.1.0" + ), f"requires torch>=2.1.0 when using isp but current version is {torch.__version__}" assert gpc.config.parallel["tensor"].get("mode", None) in [ "mtp", "msp", diff --git a/internlm/utils/common.py b/internlm/utils/common.py index 39e2d902..e759e013 100644 --- a/internlm/utils/common.py +++ b/internlm/utils/common.py @@ -14,8 +14,10 @@ import torch import internlm +from internlm.utils.logger import get_logger CURRENT_TIME = None +logger = get_logger(__file__) def parse_args(): @@ -232,6 +234,16 @@ def get_megatron_flops( return tflops +def enable_pytorch_expandable_segments(): + if torch.__version__ >= "2.1.0": + _alloc_setting = "expandable_segments:True" + if os.getenv("PYTORCH_CUDA_ALLOC_CONF", None) is not None: + _alloc_setting = os.getenv("PYTORCH_CUDA_ALLOC_CONF") + "," + _alloc_setting + torch.cuda.memory._set_allocator_settings(_alloc_setting) + else: + logger.warning("To support the 'expandable_segments' configuration, please upgrade torch to version 2.1.0.") + + class DummyProfile: """ Dummy Profile. 
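A usage note on the enable_pytorch_expandable_segments helper added above: on torch >= 2.1.0 it appends to any allocator options already present rather than overwriting them. A small illustration, with an assumed pre-existing setting:

import os
from internlm.utils.common import enable_pytorch_expandable_segments

# Suppose the job script already tuned the allocator (assumed value):
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
enable_pytorch_expandable_segments()
# torch.cuda.memory._set_allocator_settings() is now called with
# "max_split_size_mb:128,expandable_segments:True"; with no pre-existing
# setting it would receive just "expandable_segments:True".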
diff --git a/tests/test_training/test_loss.py b/tests/test_training/test_loss.py index a3b3b442..95079812 100644 --- a/tests/test_training/test_loss.py +++ b/tests/test_training/test_loss.py @@ -17,7 +17,9 @@ get_train_data_loader, initialize_model, initialize_optimizer, + initialize_isp_communicator, load_new_batch, + get_scheduler_hooks, ) from internlm.utils.common import BatchSkipper, launch_time from internlm.utils.gputest import empty_cache_and_diag @@ -46,6 +48,7 @@ def train( dp_size: int = 1, tp_size: int = 1, + wp_size: int = 1, pp_size: int = 1, num_chunks: int = 2, interleaved: bool = False, @@ -62,6 +65,9 @@ def train( assert ( gpc.get_world_size(ParallelMode.TENSOR) == tp_size ), f"tensor parallel size: {gpc.get_world_size(ParallelMode.TENSOR)} is not as expected {tp_size}" + assert ( + gpc.get_world_size(ParallelMode.WEIGHT) == wp_size + ), f"weight parallel size: {gpc.get_world_size(ParallelMode.WEIGHT)} is not as expected {wp_size}" assert ( gpc.get_world_size(ParallelMode.PIPELINE) == pp_size ), f"pipeline parallel size: {gpc.get_world_size(ParallelMode.PIPELINE)} is not as expected {pp_size}" @@ -95,6 +101,9 @@ def train( # initialize model model = initialize_model() + # initialize isp communicator + isp_communicator = initialize_isp_communicator(model) + # initialize loss function criterion = FlashGPTLMLoss(parallel_output=True, label_smoothing=label_smoothing) @@ -104,7 +113,7 @@ def train( # initialize and resume train state train_state = TrainState(gpc.config, train_dl.batch_sampler) - optimizer, beta2_scheduler, lr_scheduler = initialize_optimizer(model=model) + optimizer, beta2_scheduler, lr_scheduler = initialize_optimizer(model, isp_communicator) with open(CONFIG_FILE_PATH, "r") as f: config_lines = f.readlines() @@ -143,6 +152,7 @@ def train( ), ] + # initialize trainer trainer, train_dl, _, _ = internlm.initialize_trainer( model=model, optimizer=optimizer, @@ -150,7 +160,7 @@ def train( train_dataloader=train_dl, lr_scheduler=lr_scheduler, beta2_scheduler=beta2_scheduler, - scheduler_hooks=scheduler_hooks, + scheduler_hooks=get_scheduler_hooks(metric, optimizer, isp_communicator), ) # initialize the batch skipper @@ -291,3 +301,13 @@ def test_training_loss_with_dp8_pp2_interleaved_overlap(): check_loss_spike() check_loss_accuracy() + + +@pytest.mark.training_8GPU_ISP +def test_training_with_isp(): + # update config file + global CONFIG_FILE_PATH + CONFIG_FILE_PATH = "./configs/7B_isp_sft.py" + + # model training + train(dp_size=4, tp_size=2, wp_size=4, enable_sp=True) diff --git a/train.py b/train.py index 150f5463..490894a9 100644 --- a/train.py +++ b/train.py @@ -31,6 +31,7 @@ ) from internlm.utils.common import ( BatchSkipper, + enable_pytorch_expandable_segments, get_megatron_flops, launch_time, parse_args, @@ -69,6 +70,8 @@ def initialize_llm_logger(start_time: str): def main(args): + enable_pytorch_expandable_segments() + # init setting skip_batches = gpc.config.data.skip_batches total_steps = gpc.config.data.total_steps @@ -155,7 +158,6 @@ def main(args): ) # initialize trainer - trainer, train_dl, _, _ = internlm.initialize_trainer( model=model, optimizer=optimizer, From e91acb47a4c5456092d20f4ff6d62ea9b22ab586 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Wed, 31 Jan 2024 16:59:43 +0800 Subject: [PATCH 153/153] feat(doc): update doc torch and flashattn version --- .github/workflows/e2e_test.yaml | 2 ++ doc/en/install.md | 10 +++++----- doc/install.md | 10 +++++----- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git 
a/.github/workflows/e2e_test.yaml b/.github/workflows/e2e_test.yaml index 965905c7..a9f6cdc1 100644 --- a/.github/workflows/e2e_test.yaml +++ b/.github/workflows/e2e_test.yaml @@ -38,4 +38,6 @@ jobs: - name: training_8GPU_ISP run: | source $evo_env + conda activate /mnt/petrelfs/share_data/huangting.p/envs/llm-torch2.1-flash2 + conda activate /mnt/petrelfs/share_data/huangting.p/envs/llm-torch2.1-flash2 srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP" ./tests/test_training/test_loss.py diff --git a/doc/en/install.md b/doc/en/install.md index f1f3abd7..57df69cf 100644 --- a/doc/en/install.md +++ b/doc/en/install.md @@ -5,17 +5,17 @@ The required packages and corresponding version are shown as follows: - Python == 3.10 - GCC == 10.2.0 - MPFR == 4.1.0 -- CUDA >= 11.7 -- Pytorch >= 1.13.1 +- CUDA >= 11.8 +- Pytorch >= 2.1.0 - Transformers >= 4.28.0 -- Flash-Attention >= v1.0.5 +- Flash-Attention >= v2.2.1 - Apex == 23.05 - GPU with Ampere or Hopper architecture (such as H100, A100) - Linux OS After installing the above dependencies, some system environment variables need to be updated: ```bash -export CUDA_PATH={path_of_cuda_11.7} +export CUDA_PATH={path_of_cuda_11.8} export GCC_HOME={path_of_gcc_10.2.0} export MPFR_HOME={path_of_mpfr_4.1.0} export LD_LIBRARY_PATH=${GCC_HOME}/lib64:${MPFR_HOME}/lib:${CUDA_PATH}/lib64:$LD_LIBRARY_PATH @@ -47,7 +47,7 @@ pip install -r requirements/torch.txt pip install -r requirements/runtime.txt ``` -Install flash-attention (version v1.0.5): +Install flash-attention (version v2.2.1): ```bash cd ./third_party/flash-attention python setup.py install diff --git a/doc/install.md b/doc/install.md index 3016457d..f6a8588d 100644 --- a/doc/install.md +++ b/doc/install.md @@ -5,17 +5,17 @@ - Python == 3.10 - GCC == 10.2.0 - MPFR == 4.1.0 -- CUDA >= 11.7 -- Pytorch >= 1.13.1 +- CUDA >= 11.8 +- Pytorch >= 2.1.0 - Transformers >= 4.28.0 -- Flash-Attention >= v1.0.5 +- Flash-Attention >= v2.2.1 - Apex == 23.05 - Ampere或者Hopper架构的GPU (例如H100, A100) - Linux OS 以上依赖包安装完成后,需要更新配置系统环境变量: ```bash -export CUDA_PATH={path_of_cuda_11.7} +export CUDA_PATH={path_of_cuda_11.8} export GCC_HOME={path_of_gcc_10.2.0} export MPFR_HOME={path_of_mpfr_4.1.0} export LD_LIBRARY_PATH=${GCC_HOME}/lib64:${MPFR_HOME}/lib:${CUDA_PATH}/lib64:$LD_LIBRARY_PATH @@ -46,7 +46,7 @@ pip install -r requirements/torch.txt pip install -r requirements/runtime.txt ``` -安装 flash-attention (version v1.0.5): +安装 flash-attention (version v2.2.1): ```bash cd ./third_party/flash-attention python setup.py install
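To close, a quick environment sanity check matching the updated requirements, in the spirit of the version assert this series adds to internlm/initialize/launch.py. The same caveat applies: these are lexicographic string comparisons, which happens to work for the versions in question, and it assumes flash_attn exposes __version__ as recent releases do.

import torch
import flash_attn

assert torch.__version__ >= "2.1.0", f"torch {torch.__version__} < 2.1.0"
assert flash_attn.__version__ >= "2.2.1", f"flash-attn {flash_attn.__version__} < 2.2.1"
print(f"CUDA runtime: {torch.version.cuda}")  # expected to be >= 11.8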